Merge branch 'main' into litellm_sagemaker_fix_stream

This commit is contained in:
Ishaan Jaff 2025-03-31 14:22:20 -07:00
commit 83ba96b8c6
452 changed files with 13927 additions and 3613 deletions

View file

@ -3,6 +3,18 @@ orbs:
codecov: codecov/codecov@4.0.1
node: circleci/node@5.1.0 # Add this line to declare the node orb
commands:
setup_google_dns:
steps:
- run:
name: "Configure Google DNS"
command: |
# Backup original resolv.conf
sudo cp /etc/resolv.conf /etc/resolv.conf.backup
# Add both local and Google DNS servers
echo "nameserver 127.0.0.11" | sudo tee /etc/resolv.conf
echo "nameserver 8.8.8.8" | sudo tee -a /etc/resolv.conf
echo "nameserver 8.8.4.4" | sudo tee -a /etc/resolv.conf
jobs:
local_testing:
@ -15,7 +27,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Show git commit hash
command: |
@ -134,7 +146,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Show git commit hash
command: |
@ -234,7 +246,13 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: DNS lookup for Redis host
command: |
sudo apt-get update
sudo apt-get install -y dnsutils
dig redis-19899.c239.us-east-1-2.ec2.redns.redis-cloud.com +short
- run:
name: Show git commit hash
command: |
@ -334,6 +352,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -388,6 +407,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -429,6 +449,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Show git commit hash
command: |
@ -479,7 +500,13 @@ jobs:
working_directory: ~/project
steps:
- checkout
- run:
name: Install PostgreSQL
command: |
sudo apt-get update
sudo apt-get install postgresql postgresql-contrib
echo 'export PATH=/usr/lib/postgresql/*/bin:$PATH' >> $BASH_ENV
- setup_google_dns
- run:
name: Show git commit hash
command: |
@ -534,6 +561,7 @@ jobs:
pip install "diskcache==5.6.1"
pip install "Pillow==10.3.0"
pip install "jsonschema==4.22.0"
pip install "pytest-postgresql==7.0.1"
- save_cache:
paths:
- ./venv
@ -569,7 +597,7 @@ jobs:
- litellm_proxy_unit_tests_coverage
litellm_assistants_api_testing: # Runs all tests with the "assistants" keyword
docker:
- image: cimg/python:3.11
- image: cimg/python:3.13.1
auth:
username: ${DOCKERHUB_USERNAME}
password: ${DOCKERHUB_PASSWORD}
@ -577,6 +605,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -618,6 +647,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -654,6 +684,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -696,6 +727,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -740,6 +772,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -782,6 +815,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -828,6 +862,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -872,6 +907,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -918,6 +954,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -960,6 +997,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -1002,6 +1040,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -1048,6 +1087,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -1080,6 +1120,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -1104,6 +1145,7 @@ jobs:
steps:
- checkout
- setup_google_dns
# Install Helm
- run:
name: Install Helm
@ -1173,6 +1215,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -1209,6 +1252,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Python 3.9
command: |
@ -1283,6 +1327,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1418,6 +1463,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1542,6 +1588,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1704,6 +1751,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1815,6 +1863,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1897,6 +1946,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
# Remove Docker CLI installation since it's already available in machine executor
- run:
name: Install Python 3.13
@ -1994,6 +2044,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -2039,6 +2090,8 @@ jobs:
pip install "google-cloud-aiplatform==1.59.0"
pip install "anthropic==0.49.0"
pip install "langchain_mcp_adapters==0.0.5"
pip install "langchain_openai==0.2.1"
pip install "langgraph==0.3.18"
# Run pytest and generate JUnit XML report
- run:
name: Build Docker image
@ -2251,6 +2304,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Build UI
command: |
@ -2365,6 +2419,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Build Docker image
command: |
@ -2387,6 +2442,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Build Docker image
command: |

206
.github/workflows/publish-migrations.yml vendored Normal file
View file

@ -0,0 +1,206 @@
name: Publish Prisma Migrations
permissions:
contents: write
pull-requests: write
on:
push:
paths:
- 'schema.prisma' # Check root schema.prisma
branches:
- main
jobs:
publish-migrations:
runs-on: ubuntu-latest
services:
postgres:
image: postgres:14
env:
POSTGRES_DB: temp_db
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
ports:
- 5432:5432
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
# Add shadow database service
postgres_shadow:
image: postgres:14
env:
POSTGRES_DB: shadow_db
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
ports:
- 5433:5432
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.x'
- name: Install Dependencies
run: |
pip install prisma
pip install python-dotenv
- name: Generate Initial Migration if None Exists
env:
DATABASE_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
DIRECT_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
SHADOW_DATABASE_URL: "postgresql://postgres:postgres@localhost:5433/shadow_db"
run: |
mkdir -p deploy/migrations
echo 'provider = "postgresql"' > deploy/migrations/migration_lock.toml
if [ -z "$(ls -A deploy/migrations/2* 2>/dev/null)" ]; then
echo "No existing migrations found, creating baseline..."
VERSION=$(date +%Y%m%d%H%M%S)
mkdir -p deploy/migrations/${VERSION}_initial
echo "Generating initial migration..."
# Save raw output for debugging
prisma migrate diff \
--from-empty \
--to-schema-datamodel schema.prisma \
--shadow-database-url "${SHADOW_DATABASE_URL}" \
--script > deploy/migrations/${VERSION}_initial/raw_migration.sql
echo "Raw migration file content:"
cat deploy/migrations/${VERSION}_initial/raw_migration.sql
echo "Cleaning migration file..."
# Clean the file
sed '/^Installing/d' deploy/migrations/${VERSION}_initial/raw_migration.sql > deploy/migrations/${VERSION}_initial/migration.sql
# Verify the migration file
if [ ! -s deploy/migrations/${VERSION}_initial/migration.sql ]; then
echo "ERROR: Migration file is empty after cleaning"
echo "Original content was:"
cat deploy/migrations/${VERSION}_initial/raw_migration.sql
exit 1
fi
echo "Final migration file content:"
cat deploy/migrations/${VERSION}_initial/migration.sql
# Verify it starts with SQL
if ! head -n 1 deploy/migrations/${VERSION}_initial/migration.sql | grep -q "^--\|^CREATE\|^ALTER"; then
echo "ERROR: Migration file does not start with SQL command or comment"
echo "First line is:"
head -n 1 deploy/migrations/${VERSION}_initial/migration.sql
echo "Full content is:"
cat deploy/migrations/${VERSION}_initial/migration.sql
exit 1
fi
echo "Initial migration generated at $(date -u)" > deploy/migrations/${VERSION}_initial/README.md
fi
- name: Compare and Generate Migration
if: success()
env:
DATABASE_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
DIRECT_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
SHADOW_DATABASE_URL: "postgresql://postgres:postgres@localhost:5433/shadow_db"
run: |
# Create temporary migration workspace
mkdir -p temp_migrations
# Copy existing migrations (will not fail if directory is empty)
cp -r deploy/migrations/* temp_migrations/ 2>/dev/null || true
VERSION=$(date +%Y%m%d%H%M%S)
# Generate diff against existing migrations or empty state
prisma migrate diff \
--from-migrations temp_migrations \
--to-schema-datamodel schema.prisma \
--shadow-database-url "${SHADOW_DATABASE_URL}" \
--script > temp_migrations/migration_${VERSION}.sql
# Check if there are actual changes
if [ -s temp_migrations/migration_${VERSION}.sql ]; then
echo "Changes detected, creating new migration"
mkdir -p deploy/migrations/${VERSION}_schema_update
mv temp_migrations/migration_${VERSION}.sql deploy/migrations/${VERSION}_schema_update/migration.sql
echo "Migration generated at $(date -u)" > deploy/migrations/${VERSION}_schema_update/README.md
else
echo "No schema changes detected"
exit 0
fi
- name: Verify Migration
if: success()
env:
DATABASE_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
DIRECT_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
SHADOW_DATABASE_URL: "postgresql://postgres:postgres@localhost:5433/shadow_db"
run: |
# Create test database
psql "${SHADOW_DATABASE_URL}" -c 'CREATE DATABASE migration_test;'
# Apply all migrations in order to verify
for migration in deploy/migrations/*/migration.sql; do
echo "Applying migration: $migration"
psql "${SHADOW_DATABASE_URL}" -f $migration
done
# Add this step before create-pull-request to debug permissions
- name: Check Token Permissions
run: |
echo "Checking token permissions..."
curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
-H "Accept: application/vnd.github.v3+json" \
https://api.github.com/repos/BerriAI/litellm/collaborators
echo "\nChecking if token can create PRs..."
curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
-H "Accept: application/vnd.github.v3+json" \
https://api.github.com/repos/BerriAI/litellm
# Add this debug step before git push
- name: Debug Changed Files
run: |
echo "Files staged for commit:"
git diff --name-status --staged
echo "\nAll changed files:"
git status
- name: Create Pull Request
if: success()
uses: peter-evans/create-pull-request@v5
with:
token: ${{ secrets.GITHUB_TOKEN }}
commit-message: "chore: update prisma migrations"
title: "Update Prisma Migrations"
body: |
Auto-generated migration based on schema.prisma changes.
Generated files:
- deploy/migrations/${VERSION}_schema_update/migration.sql
- deploy/migrations/${VERSION}_schema_update/README.md
branch: feat/prisma-migration-${{ env.VERSION }}
base: main
delete-branch: true
- name: Generate and Save Migrations
run: |
# Only add migration files
git add deploy/migrations/
git status # Debug what's being committed
git commit -m "chore: update prisma migrations"

53
.github/workflows/test-linting.yml vendored Normal file
View file

@ -0,0 +1,53 @@
name: LiteLLM Linting
on:
pull_request:
branches: [ main ]
jobs:
lint:
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
- name: Install Poetry
uses: snok/install-poetry@v1
- name: Install dependencies
run: |
poetry install --with dev
- name: Run Black formatting check
run: |
cd litellm
poetry run black . --check
cd ..
- name: Run Ruff linting
run: |
cd litellm
poetry run ruff check .
cd ..
- name: Run MyPy type checking
run: |
cd litellm
poetry run mypy . --ignore-missing-imports
cd ..
- name: Check for circular imports
run: |
cd litellm
poetry run python ../tests/documentation_tests/test_circular_imports.py
cd ..
- name: Check import safety
run: |
poetry run python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)

35
.github/workflows/test-litellm.yml vendored Normal file
View file

@ -0,0 +1,35 @@
name: LiteLLM Mock Tests (folder - tests/litellm)
on:
pull_request:
branches: [ main ]
jobs:
test:
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: actions/checkout@v4
- name: Thank You Message
run: |
echo "### 🙏 Thank you for contributing to LiteLLM!" >> $GITHUB_STEP_SUMMARY
echo "Your PR is being tested now. We appreciate your help in making LiteLLM better!" >> $GITHUB_STEP_SUMMARY
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
- name: Install Poetry
uses: snok/install-poetry@v1
- name: Install dependencies
run: |
poetry install --with dev,proxy-dev --extras proxy
poetry run pip install pytest-xdist
- name: Run tests
run: |
poetry run pytest tests/litellm -x -vv -n 4

1
.gitignore vendored
View file

@ -83,4 +83,5 @@ tests/llm_translation/test_vertex_key.json
litellm/proxy/migrations/0_init/migration.sql
litellm/proxy/db/migrations/0_init/migration.sql
litellm/proxy/db/migrations/*
litellm/proxy/migrations/*config.yaml
litellm/proxy/migrations/*

View file

@ -6,44 +6,35 @@ repos:
entry: pyright
language: system
types: [python]
files: ^litellm/
files: ^(litellm/|litellm_proxy_extras/)
- id: isort
name: isort
entry: isort
language: system
types: [python]
files: litellm/.*\.py
files: (litellm/|litellm_proxy_extras/).*\.py
exclude: ^litellm/__init__.py$
- repo: https://github.com/psf/black
rev: 24.2.0
hooks:
- id: black
name: black
entry: poetry run black
language: system
types: [python]
files: (litellm/|litellm_proxy_extras/).*\.py
- repo: https://github.com/pycqa/flake8
rev: 7.0.0 # The version of flake8 to use
hooks:
- id: flake8
exclude: ^litellm/tests/|^litellm/proxy/tests/|^litellm/tests/litellm/|^tests/litellm/
additional_dependencies: [flake8-print]
files: litellm/.*\.py
# - id: flake8
# name: flake8 (router.py function length)
# files: ^litellm/router\.py$
# args: [--max-function-length=40]
# # additional_dependencies: [flake8-functions]
files: (litellm/|litellm_proxy_extras/).*\.py
- repo: https://github.com/python-poetry/poetry
rev: 1.8.0
hooks:
- id: poetry-check
files: ^(pyproject.toml|litellm-proxy-extras/pyproject.toml)$
- repo: local
hooks:
- id: check-files-match
name: Check if files match
entry: python3 ci_cd/check_files_match.py
language: system
# - id: check-file-length
# name: Check file length
# entry: python check_file_length.py
# args: ["10000"] # set your desired maximum number of lines
# language: python
# files: litellm/.*\.py
# exclude: ^litellm/tests/

View file

@ -14,6 +14,9 @@ help:
install-dev:
poetry install --with dev
install-proxy-dev:
poetry install --with dev,proxy-dev
lint: install-dev
poetry run pip install types-requests types-setuptools types-redis types-PyYAML
cd litellm && poetry run mypy . --ignore-missing-imports

View file

@ -16,9 +16,6 @@
<a href="https://pypi.org/project/litellm/" target="_blank">
<img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">
</a>
<a href="https://dl.circleci.com/status-badge/redirect/gh/BerriAI/litellm/tree/main" target="_blank">
<img src="https://dl.circleci.com/status-badge/img/gh/BerriAI/litellm/tree/main.svg?style=svg" alt="CircleCI">
</a>
<a href="https://www.ycombinator.com/companies/berriai">
<img src="https://img.shields.io/badge/Y%20Combinator-W23-orange?style=flat-square" alt="Y Combinator W23">
</a>

60
ci_cd/baseline_db.py Normal file
View file

@ -0,0 +1,60 @@
import subprocess
from pathlib import Path
from datetime import datetime
def create_baseline():
"""Create baseline migration in deploy/migrations"""
try:
# Get paths
root_dir = Path(__file__).parent.parent
deploy_dir = root_dir / "deploy"
migrations_dir = deploy_dir / "migrations"
schema_path = root_dir / "schema.prisma"
# Create migrations directory
migrations_dir.mkdir(parents=True, exist_ok=True)
# Create migration_lock.toml if it doesn't exist
lock_file = migrations_dir / "migration_lock.toml"
if not lock_file.exists():
lock_file.write_text('provider = "postgresql"\n')
# Create timestamp-based migration directory
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
migration_dir = migrations_dir / f"{timestamp}_baseline"
migration_dir.mkdir(parents=True, exist_ok=True)
# Generate migration SQL
result = subprocess.run(
[
"prisma",
"migrate",
"diff",
"--from-empty",
"--to-schema-datamodel",
str(schema_path),
"--script",
],
capture_output=True,
text=True,
check=True,
)
# Write the SQL to migration.sql
migration_file = migration_dir / "migration.sql"
migration_file.write_text(result.stdout)
print(f"Created baseline migration in {migration_dir}")
return True
except subprocess.CalledProcessError as e:
print(f"Error running prisma command: {e.stderr}")
return False
except Exception as e:
print(f"Error creating baseline migration: {str(e)}")
return False
if __name__ == "__main__":
create_baseline()
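To exercise this script locally (a sketch; it assumes the `prisma` CLI is available and `schema.prisma` sits at the repo root, as the script expects):

```bash
# Run from the repo root; writes deploy/migrations/<timestamp>_baseline/migration.sql
pip install prisma            # provides the `prisma` CLI the script shells out to
python ci_cd/baseline_db.py
```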

View file

@ -0,0 +1,19 @@
#!/bin/bash
# Exit on error
set -e
echo "🚀 Building and publishing litellm-proxy-extras"
# Navigate to litellm-proxy-extras directory
cd "$(dirname "$0")/../litellm-proxy-extras"
# Build the package
echo "📦 Building package..."
poetry build
# Publish to PyPI
echo "🌎 Publishing to PyPI..."
poetry publish
echo "✅ Done! Package published successfully"

95
ci_cd/run_migration.py Normal file
View file

@ -0,0 +1,95 @@
import os
import subprocess
from pathlib import Path
from datetime import datetime
import testing.postgresql
import shutil
def create_migration(migration_name: str = None):
"""
Create a new migration SQL file in the migrations directory by comparing
current database state with schema
Args:
migration_name (str): Name for the migration
"""
try:
# Get paths
root_dir = Path(__file__).parent.parent
migrations_dir = root_dir / "litellm-proxy-extras" / "litellm_proxy_extras" / "migrations"
schema_path = root_dir / "schema.prisma"
# Create temporary PostgreSQL database
with testing.postgresql.Postgresql() as postgresql:
db_url = postgresql.url()
# Create temporary migrations directory next to schema.prisma
temp_migrations_dir = schema_path.parent / "migrations"
try:
# Copy existing migrations to temp directory
if temp_migrations_dir.exists():
shutil.rmtree(temp_migrations_dir)
shutil.copytree(migrations_dir, temp_migrations_dir)
# Apply existing migrations to temp database
os.environ["DATABASE_URL"] = db_url
subprocess.run(
["prisma", "migrate", "deploy", "--schema", str(schema_path)],
check=True,
)
# Generate diff between current database and schema
result = subprocess.run(
[
"prisma",
"migrate",
"diff",
"--from-url",
db_url,
"--to-schema-datamodel",
str(schema_path),
"--script",
],
capture_output=True,
text=True,
check=True,
)
if result.stdout.strip():
# Generate timestamp and create migration directory
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
migration_name = migration_name or "unnamed_migration"
migration_dir = migrations_dir / f"{timestamp}_{migration_name}"
migration_dir.mkdir(parents=True, exist_ok=True)
# Write the SQL to migration.sql
migration_file = migration_dir / "migration.sql"
migration_file.write_text(result.stdout)
print(f"Created migration in {migration_dir}")
return True
else:
print("No schema changes detected. Migration not needed.")
return False
finally:
# Clean up: remove temporary migrations directory
if temp_migrations_dir.exists():
shutil.rmtree(temp_migrations_dir)
except subprocess.CalledProcessError as e:
print(f"Error generating migration: {e.stderr}")
return False
except Exception as e:
print(f"Error creating migration: {str(e)}")
return False
if __name__ == "__main__":
# If running directly, can optionally pass migration name as argument
import sys
migration_name = sys.argv[1] if len(sys.argv) > 1 else None
create_migration(migration_name)
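A minimal local invocation might look like this (a sketch; it assumes `prisma` and the `testing.postgresql` package are installed and a local PostgreSQL binary is available for the throwaway database — the migration name `add_user_table` is only an illustrative placeholder):

```bash
pip install prisma "testing.postgresql"        # dependencies the script imports / shells out to
python ci_cd/run_migration.py add_user_table   # migration name argument is optional
```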

View file

@ -1,5 +1,35 @@
version: "3.11"
services:
litellm:
build:
context: .
args:
target: runtime
image: ghcr.io/berriai/litellm:main-stable
#########################################
## Uncomment these lines to start proxy with a config.yaml file ##
# volumes:
# - ./config.yaml:/app/config.yaml <<- this is missing in the docker-compose file currently
# command:
# - "--config=/app/config.yaml"
##############################################
ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary
environment:
DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
env_file:
- .env # Load local .env file
depends_on:
- db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
healthcheck: # Defines the health check configuration for the container
test: [ "CMD", "curl", "-f", "http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
interval: 30s # Perform health check every 30 seconds
timeout: 10s # Health check command times out after 10 seconds
retries: 3 # Retry up to 3 times if health check fails
start_period: 40s # Wait 40 seconds after container start before beginning health checks
db:
image: postgres:16
restart: always
@ -16,3 +46,23 @@ services:
interval: 1s
timeout: 5s
retries: 10
prometheus:
image: prom/prometheus
volumes:
- prometheus_data:/prometheus
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
restart: always
volumes:
prometheus_data:
driver: local
postgres_data:
name: litellm_postgres_data # Named volume for Postgres data persistence
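The `prometheus` service above mounts `./prometheus.yml`, which is not part of this diff. A minimal sketch of what that file might contain, assuming the proxy exposes Prometheus metrics at `/metrics` on port 4000 (the `litellm` target name comes from the compose service above):

```bash
# Write a minimal prometheus.yml next to docker-compose.yml
cat > prometheus.yml <<'EOF'
global:
  scrape_interval: 15s
scrape_configs:
  - job_name: litellm-proxy
    metrics_path: /metrics
    static_configs:
      - targets: ["litellm:4000"]   # compose service name and container port
EOF
```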

View file

@ -4,21 +4,177 @@ import Image from '@theme/IdealImage';
# /mcp [BETA] - Model Context Protocol
Use Model Context Protocol with LiteLLM
## Expose MCP tools on LiteLLM Proxy Server
This allows you to define tools that can be called by any MCP compatible client. Define your `mcp_servers` with LiteLLM and all your clients can list and call available tools.
<Image
img={require('../img/litellm_mcp.png')}
img={require('../img/mcp_2.png')}
style={{width: '100%', display: 'block', margin: '2rem auto'}}
/>
<p style={{textAlign: 'left', color: '#666'}}>
LiteLLM MCP Architecture: Use MCP tools with all LiteLLM supported models
</p>
#### How it works
## Overview
LiteLLM exposes the following MCP endpoints:
LiteLLM acts as a MCP bridge to utilize MCP tools with all LiteLLM supported models. LiteLLM offers the following features for using MCP
- `/mcp/tools/list` - List all available tools
- `/mcp/tools/call` - Call a specific tool with the provided arguments
When MCP clients connect to LiteLLM they can follow this workflow:
1. Connect to the LiteLLM MCP server
2. List all available tools on LiteLLM
3. Client makes LLM API request with tool call(s)
4. LLM API returns which tools to call and with what arguments
5. MCP client makes MCP tool calls to LiteLLM
6. LiteLLM makes the tool calls to the appropriate MCP server
7. LiteLLM returns the tool call results to the MCP client
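For example, step 2 of this workflow can be sketched as a plain HTTP call against the endpoint listed above (assumptions: the proxy runs on `localhost:4000`, `sk-1234` is a valid virtual key, and the endpoint accepts a simple GET):

```bash
# Sketch: list the MCP tools LiteLLM exposes (step 2 above)
curl -s http://localhost:4000/mcp/tools/list \
  -H "Authorization: Bearer sk-1234"
```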
#### Usage
#### 1. Define your tools under `mcp_servers` in your config.yaml file.
LiteLLM allows you to define your tools in the `mcp_servers` section of your config.yaml file. All tools listed here will be available to MCP clients (when they connect to LiteLLM and call `list_tools`).
```yaml title="config.yaml" showLineNumbers
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/gpt-4o
api_key: sk-xxxxxxx
mcp_servers:
{
"zapier_mcp": {
"url": "https://actions.zapier.com/mcp/sk-akxxxxx/sse"
},
"fetch": {
"url": "http://localhost:8000/sse"
}
}
```
#### 2. Start LiteLLM Gateway
<Tabs>
<TabItem value="docker" label="Docker Run">
```shell title="Docker Run" showLineNumbers
docker run -d \
-p 4000:4000 \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
--name my-app \
-v $(pwd)/my_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--detailed_debug \
```
</TabItem>
<TabItem value="py" label="litellm pip">
```shell title="litellm pip" showLineNumbers
litellm --config config.yaml --detailed_debug
```
</TabItem>
</Tabs>
#### 3. Make an LLM API request
In this example we will do the following:
1. Use MCP client to list MCP tools on LiteLLM Proxy
2. Use `transform_mcp_tool_to_openai_tool` to convert MCP tools to OpenAI tools
3. Provide the MCP tools to `gpt-4o`
4. Handle tool call from `gpt-4o`
5. Convert OpenAI tool call to MCP tool call
6. Execute tool call on MCP server
```python title="MCP Client List Tools" showLineNumbers
import asyncio
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionUserMessageParam
from mcp import ClientSession
from mcp.client.sse import sse_client
from litellm.experimental_mcp_client.tools import (
transform_mcp_tool_to_openai_tool,
transform_openai_tool_call_request_to_mcp_tool_call_request,
)
async def main():
# Initialize clients
# point OpenAI client to LiteLLM Proxy
client = AsyncOpenAI(api_key="sk-1234", base_url="http://localhost:4000")
# Point MCP client to LiteLLM Proxy
async with sse_client("http://localhost:4000/mcp/") as (read, write):
async with ClientSession(read, write) as session:
await session.initialize()
# 1. List MCP tools on LiteLLM Proxy
mcp_tools = await session.list_tools()
print("List of MCP tools for MCP server:", mcp_tools.tools)
# Create message
messages = [
ChatCompletionUserMessageParam(
content="Send an email about LiteLLM supporting MCP", role="user"
)
]
# 2. Use `transform_mcp_tool_to_openai_tool` to convert MCP tools to OpenAI tools
# Since OpenAI only supports tools in the OpenAI format, we need to convert the MCP tools to the OpenAI format.
openai_tools = [
transform_mcp_tool_to_openai_tool(tool) for tool in mcp_tools.tools
]
# 3. Provide the MCP tools to `gpt-4o`
response = await client.chat.completions.create(
model="gpt-4o",
messages=messages,
tools=openai_tools,
tool_choice="auto",
)
# 4. Handle tool call from `gpt-4o`
if response.choices[0].message.tool_calls:
tool_call = response.choices[0].message.tool_calls[0]
if tool_call:
# 5. Convert OpenAI tool call to MCP tool call
# Since MCP servers expect tools in the MCP format, we need to convert the OpenAI tool call to the MCP format.
# This is done using litellm.experimental_mcp_client.tools.transform_openai_tool_call_request_to_mcp_tool_call_request
mcp_call = (
transform_openai_tool_call_request_to_mcp_tool_call_request(
openai_tool=tool_call.model_dump()
)
)
# 6. Execute tool call on MCP server
result = await session.call_tool(
name=mcp_call.name, arguments=mcp_call.arguments
)
print("Result:", result)
# Run it
asyncio.run(main())
```
## LiteLLM Python SDK MCP Bridge
The LiteLLM Python SDK acts as an MCP bridge so you can use MCP tools with all LiteLLM supported models. LiteLLM offers the following features for using MCP:
- **List** Available MCP Tools: OpenAI clients can view all available MCP tools
- `litellm.experimental_mcp_client.load_mcp_tools` to list all available MCP tools
@ -26,8 +182,6 @@ LiteLLM acts as a MCP bridge to utilize MCP tools with all LiteLLM supported mod
- `litellm.experimental_mcp_client.call_openai_tool` to call an OpenAI tool on an MCP server
## Usage
### 1. List Available MCP Tools
In this example we'll use `litellm.experimental_mcp_client.load_mcp_tools` to list all available MCP tools on any MCP server. This method can be used in two ways:
@ -271,215 +425,3 @@ async with stdio_client(server_params) as (read, write):
</TabItem>
</Tabs>
## Upcoming Features
:::info
**This feature is not live as yet** this is a beta interface. Expect this to be live on litellm `v1.63.15` and above.
:::
### Expose MCP tools on LiteLLM Proxy Server
This allows you to define tools that can be called by any MCP compatible client. Define your mcp_tools with LiteLLM and all your clients can list and call available tools.
#### How it works
LiteLLM exposes the following MCP endpoints:
- `/mcp/list_tools` - List all available tools
- `/mcp/call_tool` - Call a specific tool with the provided arguments
When MCP clients connect to LiteLLM they can follow this workflow:
1. Connect to the LiteLLM MCP server
2. List all available tools on LiteLLM
3. Client makes LLM API request with tool call(s)
4. LLM API returns which tools to call and with what arguments
5. MCP client makes tool calls to LiteLLM
6. LiteLLM makes the tool calls to the appropriate handlers
7. LiteLLM returns the tool call results to the MCP client
#### Usage
#### 1. Define your tools on mcp_tools
LiteLLM allows you to define your tools on the `mcp_tools` section in your config.yaml file. All tools listed here will be available to MCP clients (when they connect to LiteLLM and call `list_tools`).
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/gpt-4o
api_key: sk-xxxxxxx
mcp_tools:
- name: "get_current_time"
description: "Get the current time"
input_schema: {
"type": "object",
"properties": {
"format": {
"type": "string",
"description": "The format of the time to return",
"enum": ["short"]
}
}
}
handler: "mcp_tools.get_current_time"
```
#### 2. Define a handler for your tool
Create a new file called `mcp_tools.py` and add this code. The key method here is `get_current_time` which gets executed when the `get_current_time` tool is called.
```python
# mcp_tools.py
from datetime import datetime
def get_current_time(format: str = "short"):
"""
Simple handler for the 'get_current_time' tool.
Args:
format (str): The format of the time to return ('short').
Returns:
str: The current time formatted as 'HH:MM'.
"""
# Get the current time
current_time = datetime.now()
# Format the time as 'HH:MM'
return current_time.strftime('%H:%M')
```
#### 3. Start LiteLLM Gateway
<Tabs>
<TabItem value="docker" label="Docker Run">
Mount your `mcp_tools.py` on the LiteLLM Docker container.
```shell
docker run -d \
-p 4000:4000 \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
--name my-app \
-v $(pwd)/my_config.yaml:/app/config.yaml \
-v $(pwd)/mcp_tools.py:/app/mcp_tools.py \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--detailed_debug \
```
</TabItem>
<TabItem value="py" label="litellm pip">
```shell
litellm --config config.yaml --detailed_debug
```
</TabItem>
</Tabs>
#### 4. Make an LLM API request
```python
import asyncio
from langchain_mcp_adapters.tools import load_mcp_tools
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from mcp import ClientSession
from mcp.client.sse import sse_client
async def main():
# Initialize the model with your API key
model = ChatOpenAI(model="gpt-4o")
# Connect to the MCP server
async with sse_client(url="http://localhost:4000/mcp/") as (read, write):
async with ClientSession(read, write) as session:
# Initialize the session
print("Initializing session...")
await session.initialize()
print("Session initialized")
# Load available tools from MCP
print("Loading tools...")
tools = await load_mcp_tools(session)
print(f"Loaded {len(tools)} tools")
# Create a ReAct agent with the model and tools
agent = create_react_agent(model, tools)
# Run the agent with a user query
user_query = "What's the weather in Tokyo?"
print(f"Asking: {user_query}")
agent_response = await agent.ainvoke({"messages": user_query})
print("Agent response:")
print(agent_response)
if __name__ == "__main__":
asyncio.run(main())
```
### Specification for `mcp_tools`
The `mcp_tools` section in your LiteLLM config defines tools that can be called by MCP-compatible clients.
#### Tool Definition Format
```yaml
mcp_tools:
- name: string # Required: Name of the tool
description: string # Required: Description of what the tool does
input_schema: object # Required: JSON Schema defining the tool's input parameters
handler: string # Required: Path to the function that implements the tool
```
#### Field Details
- `name`: A unique identifier for the tool
- `description`: A clear description of what the tool does, used by LLMs to determine when to call it
- `input_schema`: JSON Schema object defining the expected input parameters
- `handler`: String path to the Python function that implements the tool (e.g., "module.submodule.function_name")
#### Example Tool Definition
```yaml
mcp_tools:
- name: "get_current_time"
description: "Get the current time in a specified format"
input_schema: {
"type": "object",
"properties": {
"format": {
"type": "string",
"description": "The format of the time to return",
"enum": ["short", "long", "iso"]
},
"timezone": {
"type": "string",
"description": "The timezone to use (e.g., 'UTC', 'America/New_York')",
"default": "UTC"
}
},
"required": ["format"]
}
handler: "mcp_tools.get_current_time"
```

View file

@ -664,6 +664,58 @@ curl http://0.0.0.0:4000/v1/chat/completions \
</TabItem>
</Tabs>
## Usage - Latency Optimized Inference
Valid from v1.65.1+
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
response = completion(
model="bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0",
messages=[{"role": "user", "content": "What is the capital of France?"}],
performanceConfig={"latency": "optimized"},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: bedrock-claude-3-7
litellm_params:
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
performanceConfig: {"latency": "optimized"} # 👈 EITHER HERE OR ON REQUEST
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "bedrock-claude-3-7",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"performanceConfig": {"latency": "optimized"} # 👈 EITHER HERE OR ON CONFIG.YAML
}'
```
</TabItem>
</Tabs>
## Usage - Bedrock Guardrails
Example of using [Bedrock Guardrails with LiteLLM](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-use-converse-api.html)
@ -1776,6 +1828,7 @@ response = completion(
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
@ -1820,11 +1873,13 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
```
</TabItem>
</Tabs>
### SSO Login (AWS Profile)
- Set `AWS_PROFILE` environment variable
- Make bedrock completion call
```python
import os
from litellm import completion
@ -1917,12 +1972,46 @@ model_list:
</Tabs>
Text to Image :
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/images/generations' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
-d '{
"model": "amazon.nova-canvas-v1:0",
"prompt": "A cute baby sea otter"
}'
```
Color Guided Generation:
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/images/generations' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
-d '{
"model": "amazon.nova-canvas-v1:0",
"prompt": "A cute baby sea otter",
"taskType": "COLOR_GUIDED_GENERATION",
"colorGuidedGenerationParams":{"colors":["#FFFFFF"]}
}'
```
| Model Name | Function Call |
|-------------------------|---------------------------------------------|
| Stable Diffusion 3 - v0 | `image_generation(model="bedrock/stability.stability.sd3-large-v1:0", prompt=prompt)` |
| Stable Diffusion - v0 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v0", prompt=prompt)` |
| Stable Diffusion - v1 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v1", prompt=prompt)` |
| Amazon Nova Canvas - v0 | `image_generation(model="bedrock/amazon.nova-canvas-v1:0", prompt=prompt)` |
### Passing an external BedrockRuntime.Client as a parameter - Completion()
This is a deprecated flow. Boto3 is not async. And boto3.client does not let us make the http call through httpx. Pass in your aws params through the method above 👆. [See Auth Code](https://github.com/BerriAI/litellm/blob/55a20c7cce99a93d36a82bf3ae90ba3baf9a7f89/litellm/llms/bedrock_httpx.py#L284) [Add new auth flow](https://github.com/BerriAI/litellm/issues)
:::warning
This is a deprecated flow. Boto3 is not async. And boto3.client does not let us make the http call through httpx. Pass in your aws params through the method above 👆. [See Auth Code](https://github.com/BerriAI/litellm/blob/55a20c7cce99a93d36a82bf3ae90ba3baf9a7f89/litellm/llms/bedrock_httpx.py#L284) [Add new auth flow](https://github.com/BerriAI/litellm/issues)
Experimental - 2024-Jun-23:

View file

@ -589,8 +589,10 @@ response = litellm.completion(
"content": [
{"type": "text", "text": "Please summarize the audio."},
{
"type": "image_url",
"image_url": "data:audio/mp3;base64,{}".format(encoded_data), # 👈 SET MIME_TYPE + DATA
"type": "file",
"file": {
"file_data": "data:audio/mp3;base64,{}".format(encoded_data), # 👈 SET MIME_TYPE + DATA
}
},
],
}
@ -640,8 +642,11 @@ response = litellm.completion(
"content": [
{"type": "text", "text": "Please summarize the file."},
{
"type": "image_url",
"image_url": "https://storage..." # 👈 SET THE IMG URL
"type": "file",
"file": {
"file_id": "https://storage...", # 👈 SET THE IMG URL
"format": "application/pdf" # OPTIONAL
}
},
],
}
@ -668,8 +673,11 @@ response = litellm.completion(
"content": [
{"type": "text", "text": "Please summarize the file."},
{
"type": "image_url",
"image_url": "gs://..." # 👈 SET THE cloud storage bucket url
"type": "file",
"file": {
"file_id": "gs://storage...", # 👈 SET THE IMG URL
"format": "application/pdf" # OPTIONAL
}
},
],
}

View file

@ -325,6 +325,74 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
| fine tuned `gpt-3.5-turbo-0613` | `response = completion(model="ft:gpt-3.5-turbo-0613", messages=messages)` |
## OpenAI Audio Transcription
LiteLLM supports the OpenAI Audio Transcription endpoint.
Supported models:
| Model Name | Function Call |
|---------------------------|-----------------------------------------------------------------|
| `whisper-1` | `response = completion(model="whisper-1", file=audio_file)` |
| `gpt-4o-transcribe` | `response = completion(model="gpt-4o-transcribe", file=audio_file)` |
| `gpt-4o-mini-transcribe` | `response = completion(model="gpt-4o-mini-transcribe", file=audio_file)` |
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import transcription
import os
# set api keys
os.environ["OPENAI_API_KEY"] = ""
audio_file = open("/path/to/audio.mp3", "rb")
response = transcription(model="gpt-4o-transcribe", file=audio_file)
print(f"response: {response}")
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o-transcribe
litellm_params:
model: gpt-4o-transcribe
api_key: os.environ/OPENAI_API_KEY
model_info:
mode: audio_transcription
general_settings:
master_key: sk-1234
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:8000/v1/audio/transcriptions' \
--header 'Authorization: Bearer sk-1234' \
--form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \
--form 'model="gpt-4o-transcribe"'
```
</TabItem>
</Tabs>
## Advanced
### Getting OpenAI API Response Headers

View file

@ -1369,6 +1369,103 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</Tabs>
## Gemini Pro
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
## Fine-tuned Models
You can call fine-tuned Vertex AI Gemini models through LiteLLM
| Property | Details |
|----------|---------|
| Provider Route | `vertex_ai/gemini/{MODEL_ID}` |
| Vertex Documentation | [Vertex AI - Fine-tuned Gemini Models](https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-use-supervised-tuning#test_the_tuned_model_with_a_prompt)|
| Supported Operations | `/chat/completions`, `/completions`, `/embeddings`, `/images` |
To use a model that follows the `/gemini` request/response format, simply set the model parameter as
```python title="Model parameter for calling fine-tuned gemini models"
model="vertex_ai/gemini/<your-finetuned-model>"
```
<Tabs>
<TabItem value="sdk" label="LiteLLM Python SDK">
```python showLineNumbers title="Example"
import litellm
import os
## set ENV variables
os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
response = litellm.completion(
model="vertex_ai/gemini/<your-finetuned-model>", # e.g. vertex_ai/gemini/4965075652664360960
messages=[{ "content": "Hello, how are you?","role": "user"}],
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
1. Add Vertex Credentials to your env
```bash title="Authenticate to Vertex AI"
!gcloud auth application-default login
```
2. Setup config.yaml
```yaml showLineNumbers title="Add to litellm config"
- model_name: finetuned-gemini
litellm_params:
model: vertex_ai/gemini/<ENDPOINT_ID>
vertex_project: <PROJECT_ID>
vertex_location: <LOCATION>
```
3. Test it!
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python showLineNumbers title="Example request"
from openai import OpenAI
client = OpenAI(
api_key="your-litellm-key",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="finetuned-gemini",
messages=[
{"role": "user", "content": "hi"}
]
)
print(response)
```
</TabItem>
<TabItem value="curl" label="curl">
```bash showLineNumbers title="Example request"
curl --location 'https://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: <LITELLM_KEY>' \
--data '{"model": "finetuned-gemini" ,"messages":[{"role": "user", "content":[{"type": "text", "text": "hi"}]}]}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Model Garden
:::tip
@ -1479,67 +1576,6 @@ response = completion(
</Tabs>
## Gemini Pro
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
## Fine-tuned Models
Fine tuned models on vertex have a numerical model/endpoint id.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
## set ENV variables
os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
response = completion(
model="vertex_ai/<your-finetuned-model>", # e.g. vertex_ai/4965075652664360960
messages=[{ "content": "Hello, how are you?","role": "user"}],
base_model="vertex_ai/gemini-1.5-pro" # the base model - used for routing
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add Vertex Credentials to your env
```bash
!gcloud auth application-default login
```
2. Setup config.yaml
```yaml
- model_name: finetuned-gemini
litellm_params:
model: vertex_ai/<ENDPOINT_ID>
vertex_project: <PROJECT_ID>
vertex_location: <LOCATION>
model_info:
base_model: vertex_ai/gemini-1.5-pro # IMPORTANT
```
3. Test it!
```bash
curl --location 'https://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: <LITELLM_KEY>' \
--data '{"model": "finetuned-gemini" ,"messages":[{"role": "user", "content":[{"type": "text", "text": "hi"}]}]}'
```
</TabItem>
</Tabs>
## Gemini Pro Vision
| Model Name | Function Call |
@ -1684,23 +1720,25 @@ assert isinstance(
```
## Usage - PDF / Videos / etc. Files
## Usage - PDF / Videos / Audio etc. Files
Pass any file supported by Vertex AI, through LiteLLM.
LiteLLM Supports the following image types passed in url
LiteLLM supports the following file types passed as a URL.
Using `file` message type for VertexAI is live from v1.65.1+
```
Images with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
Files with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
Files with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4
Base64 Encoded Local Images
Base64 Encoded Local Files
```
<Tabs>
<TabItem value="sdk" label="SDK">
### **Using `gs://`**
### **Using `gs://` or any URL**
```python
from litellm import completion
@ -1712,8 +1750,11 @@ response = completion(
"content": [
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
{
"type": "image_url",
"image_url": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf", # 👈 PDF
"type": "file",
"file": {
"file_id": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf",
"format": "application/pdf" # OPTIONAL - specify mime-type
}
},
],
}
@ -1747,8 +1788,16 @@ response = completion(
"content": [
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
{
"type": "image_url",
"image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
"type": "file",
"file": {
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
}
},
{
"type": "audio_input",
"audio_input {
"audio_input": f"data:audio/mp3;base64,{encoded_file}", # 👈 AUDIO File ('file' message works as too)
}
},
],
}
@ -1794,8 +1843,11 @@ curl http://0.0.0.0:4000/v1/chat/completions \
"text": "You are a very professional document summarization specialist. Please summarize the given document"
},
{
"type": "image_url",
"image_url": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf" # 👈 PDF
"type": "file",
"file": {
"file_id": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf",
"format": "application/pdf" # OPTIONAL
}
}
}
]
@ -1822,10 +1874,17 @@ curl http://0.0.0.0:4000/v1/chat/completions \
"text": "You are a very professional document summarization specialist. Please summarize the given document"
},
{
"type": "image_url",
"image_url": "data:application/pdf;base64,{encoded_file}" # 👈 PDF
}
"type": "file",
"file": {
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
},
},
{
"type": "audio_input",
"audio_input {
"audio_input": f"data:audio/mp3;base64,{encoded_file}", # 👈 AUDIO File ('file' message works as too)
}
},
]
}
],
@ -1836,6 +1895,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
</TabItem>
</Tabs>
## Chat Models
| Model Name | Function Call |
|------------------|--------------------------------------|
@ -2044,7 +2104,12 @@ print(response)
## **Multi-Modal Embeddings**
Usage
Known Limitations:
- Only supports 1 image / video per request
- Only supports GCS or base64 encoded images / videos
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
@ -2260,6 +2325,115 @@ print(f"Text Embedding: {embeddings.text_embedding}")
</Tabs>
### Text + Image + Video Embeddings
<Tabs>
<TabItem value="sdk" label="SDK">
Text + Image
```python
response = await litellm.aembedding(
model="vertex_ai/multimodalembedding@001",
input=["hey", "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"] # will be sent as a gcs image
)
```
Text + Video
```python
response = await litellm.aembedding(
model="vertex_ai/multimodalembedding@001",
input=["hey", "gs://my-bucket/embeddings/supermarket-video.mp4"] # will be sent as a gcs image
)
```
Image + Video
```python
response = await litellm.aembedding(
model="vertex_ai/multimodalembedding@001",
input=["gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png", "gs://my-bucket/embeddings/supermarket-video.mp4"] # will be sent as a gcs image
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM PROXY (Unified Endpoint)">
1. Add model to config.yaml
```yaml
model_list:
- model_name: multimodalembedding@001
litellm_params:
model: vertex_ai/multimodalembedding@001
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: adroit-crow-413218-a956eef1a2a8.json
litellm_settings:
drop_params: True
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request use OpenAI Python SDK, Langchain Python SDK
Text + Image
```python
import openai
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
# # request sent to model set on litellm proxy, `litellm --model`
response = client.embeddings.create(
model="multimodalembedding@001",
input = ["hey", "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"],
)
print(response)
```
Text + Video
```python
import openai
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
# # request sent to model set on litellm proxy, `litellm --model`
response = client.embeddings.create(
model="multimodalembedding@001",
input = ["hey", "gs://my-bucket/embeddings/supermarket-video.mp4"],
)
print(response)
```
Image + Video
```python
import openai
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
# # request sent to model set on litellm proxy, `litellm --model`
response = client.embeddings.create(
model="multimodalembedding@001",
input = ["gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png", "gs://my-bucket/embeddings/supermarket-video.mp4"],
)
print(response)
```
</TabItem>
</Tabs>
## **Image Generation Models**
Usage

View file

@ -147,6 +147,11 @@ Some SSO providers require a specific redirect url for login and logout. You can
- Login: `<your-proxy-base-url>/sso/key/generate`
- Logout: `<your-proxy-base-url>`
Here's the env var to set the logout url on the proxy
```bash
PROXY_LOGOUT_URL="https://www.google.com"
```
#### Step 3. Set `PROXY_BASE_URL` in your .env
Set this in your .env (so the proxy can set the correct redirect url)

View file

@ -160,7 +160,7 @@ general_settings:
| database_url | string | The URL for the database connection [Set up Virtual Keys](virtual_keys) |
| database_connection_pool_limit | integer | The limit for database connection pool [Setting DB Connection Pool limit](#configure-db-pool-limits--connection-timeouts) |
| database_connection_timeout | integer | The timeout for database connections in seconds [Setting DB Connection Pool limit, timeout](#configure-db-pool-limits--connection-timeouts) |
| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key |
| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key [Doc on graceful db unavailability](prod#5-if-running-litellm-on-vpc-gracefully-handle-db-unavailability) |
| custom_auth | string | Write your own custom authentication logic [Doc Custom Auth](virtual_keys#custom-auth) |
| max_parallel_requests | integer | The max parallel requests allowed per deployment |
| global_max_parallel_requests | integer | The max parallel requests allowed on the proxy overall |
@ -479,7 +479,7 @@ router_settings:
| PROXY_ADMIN_ID | Admin identifier for proxy server
| PROXY_BASE_URL | Base URL for proxy service
| PROXY_LOGOUT_URL | URL for logging out of the proxy service
| PROXY_MASTER_KEY | Master key for proxy authentication
| LITELLM_MASTER_KEY | Master key for proxy authentication
| QDRANT_API_BASE | Base URL for Qdrant API
| QDRANT_API_KEY | API key for Qdrant service
| QDRANT_URL | Connection URL for Qdrant database
@ -515,4 +515,5 @@ router_settings:
| UPSTREAM_LANGFUSE_RELEASE | Release version identifier for upstream Langfuse
| UPSTREAM_LANGFUSE_SECRET_KEY | Secret key for upstream Langfuse authentication
| USE_AWS_KMS | Flag to enable AWS Key Management Service for encryption
| USE_PRISMA_MIGRATE | Flag to use prisma migrate instead of prisma db push. Recommended for production environments.
| WEBHOOK_URL | URL for receiving webhooks from external services
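As a quick sketch, two of the settings above are typically passed to the proxy container as environment variables (placeholder values; `USE_PRISMA_MIGRATE` accepting `"True"` is an assumption based on the flag description):

```bash
# Sketch: supply the master key and the prisma-migrate flag to the proxy container
docker run -d -p 4000:4000 \
  -e LITELLM_MASTER_KEY="sk-1234" \
  -e USE_PRISMA_MIGRATE="True" \
  ghcr.io/berriai/litellm:main-stable
```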

View file

@ -94,15 +94,31 @@ This disables the load_dotenv() functionality, which will automatically load you
## 5. If running LiteLLM on VPC, gracefully handle DB unavailability
This will allow LiteLLM to continue to process requests even if the DB is unavailable. This is better handling for DB unavailability.
When running LiteLLM on a VPC (and inaccessible from the public internet), you can enable graceful degradation so that request processing continues even if the database is temporarily unavailable.
**WARNING: Only do this if you're running LiteLLM on VPC, that cannot be accessed from the public internet.**
```yaml
#### Configuration
```yaml showLineNumbers title="litellm config.yaml"
general_settings:
allow_requests_on_db_unavailable: True
```
#### Expected Behavior
When `allow_requests_on_db_unavailable` is set to `true`, LiteLLM will handle errors as follows:
| Type of Error | Expected Behavior | Details |
|---------------|-------------------|----------------|
| Prisma Errors | ✅ Request will be allowed | Covers issues like DB connection resets or rejections from the DB via Prisma, the ORM used by LiteLLM. |
| Httpx Errors | ✅ Request will be allowed | Occurs when the database is unreachable, allowing the request to proceed despite the DB outage. |
| Pod Startup Behavior | ✅ Pods start regardless | LiteLLM Pods will start even if the database is down or unreachable, ensuring higher uptime guarantees for deployments. |
| Health/Readiness Check | ✅ Always returns 200 OK | The /health/readiness endpoint returns a 200 OK status to ensure that pods remain operational even when the database is unavailable. |
| LiteLLM Budget Errors or Model Errors | ❌ Request will be blocked | Triggered when the DB is reachable but the authentication token is invalid, lacks access, or exceeds budget limits. |
## 6. Disable spend_logs & error_logs if not using the LiteLLM UI
By default, LiteLLM writes several types of logs to the database:
@ -183,93 +199,3 @@ You should only see the following level of details in logs on the proxy server
# INFO: 192.168.2.205:34717 - "POST /chat/completions HTTP/1.1" 200 OK
# INFO: 192.168.2.205:29734 - "POST /chat/completions HTTP/1.1" 200 OK
```
### Machine Specifications to Deploy LiteLLM
| Service | Spec | CPUs | Memory | Architecture | Version|
| --- | --- | --- | --- | --- | --- |
| Server | `t2.small`. | `1vCPUs` | `8GB` | `x86` |
| Redis Cache | - | - | - | - | 7.0+ Redis Engine|
### Reference Kubernetes Deployment YAML
Reference Kubernetes `deployment.yaml` that was load tested by us
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: litellm-deployment
spec:
replicas: 3
selector:
matchLabels:
app: litellm
template:
metadata:
labels:
app: litellm
spec:
containers:
- name: litellm-container
image: ghcr.io/berriai/litellm:main-latest
imagePullPolicy: Always
env:
- name: AZURE_API_KEY
value: "d6******"
- name: AZURE_API_BASE
value: "https://ope******"
- name: LITELLM_MASTER_KEY
value: "sk-1234"
- name: DATABASE_URL
value: "po**********"
args:
- "--config"
- "/app/proxy_config.yaml" # Update the path to mount the config file
volumeMounts: # Define volume mount for proxy_config.yaml
- name: config-volume
mountPath: /app
readOnly: true
livenessProbe:
httpGet:
path: /health/liveliness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
readinessProbe:
httpGet:
path: /health/readiness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
volumes: # Define volume to mount proxy_config.yaml
- name: config-volume
configMap:
name: litellm-config
```
Reference Kubernetes `service.yaml` that was load tested by us
```yaml
apiVersion: v1
kind: Service
metadata:
name: litellm-service
spec:
selector:
app: litellm
ports:
- protocol: TCP
port: 4000
targetPort: 4000
type: LoadBalancer
```

View file

@ -188,7 +188,13 @@ Currently implemented for:
- OpenAI (if OPENAI_API_KEY is set)
- Fireworks AI (if FIREWORKS_AI_API_KEY is set)
- LiteLLM Proxy (if LITELLM_PROXY_API_KEY is set)
- Gemini (if GEMINI_API_KEY is set)
- XAI (if XAI_API_KEY is set)
- Anthropic (if ANTHROPIC_API_KEY is set)
You can also specify a custom provider to check:
**All providers**:
```python
from litellm import get_valid_models
@ -196,6 +202,14 @@ valid_models = get_valid_models(check_provider_endpoint=True)
print(valid_models)
```
**Specific provider**:
```python
from litellm import get_valid_models
valid_models = get_valid_models(check_provider_endpoint=True, custom_llm_provider="openai")
print(valid_models)
```
### `validate_environment(model: str)`
This helper tells you if you have all the required environment variables for a model, and if not - what's missing.
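A quick sketch of how it's typically called:
```python
from litellm import validate_environment

env_check = validate_environment(model="gpt-3.5-turbo")
print(env_check)
# e.g. {'keys_in_environment': False, 'missing_keys': ['OPENAI_API_KEY']}
```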

View file

@ -98,6 +98,5 @@ On the models dropdown select `thinking-anthropic-claude-3-7-sonnet`
<Image img={require('../../img/litellm_thinking_openweb.gif')} />
## Additional Resources
- [Running LiteLLM and OpenWebUI on Windows Localhost: A Comprehensive Guide](https://www.tanyongsheng.com/note/running-litellm-and-openwebui-on-windows-localhost-a-comprehensive-guide/)

(Five binary image files added in this commit - contents not shown. Sizes: 133 KiB, 93 KiB, 237 KiB, 70 KiB, 66 KiB.)

View file

@ -24,6 +24,7 @@ This release brings:
- LLM Translation Improvements (MCP Support and Bedrock Application Profiles)
- Perf improvements for Usage-based Routing
- Streaming guardrail support via websockets
- Azure OpenAI client perf fix (from previous release)
## Docker Run LiteLLM Proxy
@ -31,7 +32,7 @@ This release brings:
docker run
-e STORE_MODEL_IN_DB=True
-p 4000:4000
ghcr.io/berriai/litellm:main-v1.63.14-stable
ghcr.io/berriai/litellm:main-v1.63.14-stable.patch1
```
## Demo Instance

View file

@ -0,0 +1,160 @@
---
title: v1.65.0-stable - Model Context Protocol
slug: v1.65.0-stable
date: 2025-03-30T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: [mcp, custom_prompt_management]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
v1.65.0-stable is live now. Here are the key highlights of this release:
- **MCP Support**: Support for adding and using MCP servers on the LiteLLM proxy.
- **UI view total usage after 1M+ logs**: You can now view usage analytics after crossing 1M+ logs in DB.
## Model Context Protocol (MCP)
This release introduces support for centrally adding MCP servers on LiteLLM. This lets you add MCP server endpoints, so your developers can `list` and `call` MCP tools through LiteLLM.
Read more about MCP [here](https://docs.litellm.ai/docs/mcp).
<Image
img={require('../../img/release_notes/mcp_ui.png')}
style={{width: '100%', display: 'block', margin: '2rem auto'}}
/>
<p style={{textAlign: 'left', color: '#666'}}>
Expose and use MCP servers through LiteLLM
</p>
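For a rough idea of the developer-side flow, here's a minimal sketch. The proxy URL, the `/mcp/` SSE path, and the `experimental_mcp_client` helpers are assumptions based on this release's diff and the MCP docs linked above - treat the docs as authoritative:
```python
# Sketch only - the /mcp/ SSE path and helper names are assumptions, see https://docs.litellm.ai/docs/mcp
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client

from litellm import experimental_mcp_client


async def main():
    # Connect to the MCP endpoint assumed to be exposed by the LiteLLM proxy
    async with sse_client(url="http://localhost:4000/mcp/") as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # List the centrally-added MCP tools in OpenAI tool format
            tools = await experimental_mcp_client.load_mcp_tools(session=session, format="openai")
            print(tools)


asyncio.run(main())
```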
## UI view total usage after 1M+ logs
This release brings the ability to view total usage analytics even after exceeding 1M+ logs in your database. We've implemented a scalable architecture that stores only aggregate usage data, resulting in significantly more efficient queries and reduced database CPU utilization.
<Image
img={require('../../img/release_notes/ui_usage.png')}
style={{width: '100%', display: 'block', margin: '2rem auto'}}
/>
<p style={{textAlign: 'left', color: '#666'}}>
View total usage after 1M+ logs
</p>
- How this works:
  - We now aggregate usage data into a dedicated DailyUserSpend table, significantly reducing query load and CPU usage even beyond 1M+ logs.
- Daily Spend Breakdown API:
  - Retrieve granular daily usage data (by model, provider, and API key) with a single endpoint.
Example Request:
```shell title="Daily Spend Breakdown API" showLineNumbers
curl -L -X GET 'http://localhost:4000/user/daily/activity?start_date=2025-03-20&end_date=2025-03-27' \
-H 'Authorization: Bearer sk-...'
```
```json title="Daily Spend Breakdown API Response" showLineNumbers
{
  "results": [
    {
      "date": "2025-03-27",
      "metrics": {
        "spend": 0.0177072,
        "prompt_tokens": 111,
        "completion_tokens": 1711,
        "total_tokens": 1822,
        "api_requests": 11
      },
      "breakdown": {
        "models": {
          "gpt-4o-mini": {
            "spend": 1.095e-05,
            "prompt_tokens": 37,
            "completion_tokens": 9,
            "total_tokens": 46,
            "api_requests": 1
          }
        },
        "providers": { "openai": { ... }, "azure_ai": { ... } },
        "api_keys": { "3126b6eaf1...": { ... } }
      }
    }
  ],
  "metadata": {
    "total_spend": 0.7274667,
    "total_prompt_tokens": 280990,
    "total_completion_tokens": 376674,
    "total_api_requests": 14
  }
}
```
## New Models / Updated Models
- Support for Vertex AI gemini-2.0-flash-lite & Google AI Studio gemini-2.0-flash-lite [PR](https://github.com/BerriAI/litellm/pull/9523)
- Support for Vertex AI Fine-Tuned LLMs [PR](https://github.com/BerriAI/litellm/pull/9542)
- Nova Canvas image generation support [PR](https://github.com/BerriAI/litellm/pull/9525)
- OpenAI gpt-4o-transcribe support [PR](https://github.com/BerriAI/litellm/pull/9517)
- Added new Vertex AI text embedding model [PR](https://github.com/BerriAI/litellm/pull/9476)
## LLM Translation
- OpenAI Web Search Tool Call Support [PR](https://github.com/BerriAI/litellm/pull/9465)
- Vertex AI topLogprobs support [PR](https://github.com/BerriAI/litellm/pull/9518)
- Support for sending images and video to Vertex AI multimodal embedding [Doc](https://docs.litellm.ai/docs/providers/vertex#multi-modal-embeddings)
- Support litellm.api_base for Vertex AI + Gemini across completion, embedding, image_generation [PR](https://github.com/BerriAI/litellm/pull/9516)
- Bug fix for returning `response_cost` when using litellm python SDK with LiteLLM Proxy [PR](https://github.com/BerriAI/litellm/commit/6fd18651d129d606182ff4b980e95768fc43ca3d)
- Support for `max_completion_tokens` on Mistral API [PR](https://github.com/BerriAI/litellm/pull/9606)
- Refactored Vertex AI passthrough routes - fixes unpredictable behaviour with auto-setting default_vertex_region on router model add [PR](https://github.com/BerriAI/litellm/pull/9467)
## Spend Tracking Improvements
- Log 'api_base' on spend logs [PR](https://github.com/BerriAI/litellm/pull/9509)
- Support for Gemini audio token cost tracking [PR](https://github.com/BerriAI/litellm/pull/9535)
- Fixed OpenAI audio input token cost tracking [PR](https://github.com/BerriAI/litellm/pull/9535)
## UI
### Model Management
- Allowed team admins to add/update/delete models on UI [PR](https://github.com/BerriAI/litellm/pull/9572)
- Render `supports_web_search` on the model hub [PR](https://github.com/BerriAI/litellm/pull/9469)
### Request Logs
- Show API base and model ID on request logs [PR](https://github.com/BerriAI/litellm/pull/9572)
- Allow viewing key info on request logs [PR](https://github.com/BerriAI/litellm/pull/9568)
### Usage Tab
- Added Daily User Spend Aggregate view - allows the UI Usage tab to work beyond 1M rows [PR](https://github.com/BerriAI/litellm/pull/9538)
- Connected UI to "LiteLLM_DailyUserSpend" spend table [PR](https://github.com/BerriAI/litellm/pull/9603)
## Logging Integrations
- Fixed StandardLoggingPayload for GCS Pub Sub Logging Integration [PR](https://github.com/BerriAI/litellm/pull/9508)
- Track `litellm_model_name` on `StandardLoggingPayload` [Docs](https://docs.litellm.ai/docs/proxy/logging_spec#standardlogginghiddenparams)
## Performance / Reliability Improvements
- LiteLLM Redis semantic caching implementation [PR](https://github.com/BerriAI/litellm/pull/9356)
- Gracefully handle exceptions when the DB is having an outage [PR](https://github.com/BerriAI/litellm/pull/9533)
- Allow pods to start up and pass `/health/readiness` when `allow_requests_on_db_unavailable: True` and the DB is down [PR](https://github.com/BerriAI/litellm/pull/9569)
## General Improvements
- Support for exposing MCP tools on litellm proxy [PR](https://github.com/BerriAI/litellm/pull/9426)
- Support discovering Gemini, Anthropic, xAI models by calling their `/v1/models` endpoint [PR](https://github.com/BerriAI/litellm/pull/9530)
- Fixed route check for non-proxy admins on JWT auth [PR](https://github.com/BerriAI/litellm/pull/9454)
- Added baseline Prisma database migrations [PR](https://github.com/BerriAI/litellm/pull/9565)
- View all wildcard models on /model/info [PR](https://github.com/BerriAI/litellm/pull/9572)
## Security
- Bumped next from 14.2.21 to 14.2.25 in UI dashboard [PR](https://github.com/BerriAI/litellm/pull/9458)
## Complete Git Diff
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.63.14-stable.patch1...v1.65.0-stable)

View file

@ -0,0 +1,34 @@
---
title: v1.65.0 - Team Model Add - update
slug: v1.65.0
date: 2025-03-28T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: [management endpoints, team models, ui]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
v1.65.0 updates the `/model/new` endpoint to prevent non-team admins from creating team models.
This means that only proxy admins or team admins can create team models.
## Additional Changes
- Allows team admins to call `/model/update` to update team models.
- Allows team admins to call `/model/delete` to delete team models.
- Introduces a new `user_models_only` param on `/v2/model/info` - returns only models added by this user (see the example below).
These changes enable team admins to add and manage models for their team on the LiteLLM UI + API.
<Image img={require('../../img/release_notes/team_model_add.png')} />
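A rough sketch of the new param in use (the `Authorization` header follows the proxy's usual key format; the exact response shape may differ):
```python
import requests  # sketch only - any HTTP client works

resp = requests.get(
    "http://localhost:4000/v2/model/info",
    params={"user_models_only": "true"},
    headers={"Authorization": "Bearer sk-1234"},  # your proxy / team-admin key
)
print(resp.json())
```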

View file

@ -304,7 +304,6 @@ const sidebars = {
"image_variations",
]
},
"mcp",
{
type: "category",
label: "/audio",

View file

@ -444,9 +444,7 @@ class _ENTERPRISE_SecretDetection(CustomGuardrail):
detected_secrets = []
for file in secrets.files:
for found_secret in secrets[file]:
if found_secret.secret_value is None:
continue
detected_secrets.append(
@ -471,14 +469,12 @@ class _ENTERPRISE_SecretDetection(CustomGuardrail):
data: dict,
call_type: str, # "completion", "embeddings", "image_generation", "moderation"
):
if await self.should_run_check(user_api_key_dict) is False:
return
if "messages" in data and isinstance(data["messages"], list):
for message in data["messages"]:
if "content" in message and isinstance(message["content"], str):
detected_secrets = self.scan_message_for_secrets(message["content"])
for secret in detected_secrets:

View file

@ -0,0 +1,26 @@
Portions of this software are licensed as follows:
* All content that resides under the "enterprise/" directory of this repository, if that directory exists, is licensed under the license defined in "enterprise/LICENSE".
* Content outside of the above mentioned directories or restrictions above is available under the MIT license as defined below.
---
MIT License
Copyright (c) 2023 Berri AI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,21 @@
Additional files for the proxy. Reduces the size of the main litellm package.
Currently, only stores the migration.sql files for litellm-proxy.
To install, run:
```bash
pip install litellm-proxy-extras
```
OR
```bash
pip install litellm[proxy] # installs litellm-proxy-extras and other proxy dependencies.
```
To use the migrations, run:
```bash
litellm --use_prisma_migrate
```

(Two binary files added in this commit - contents not shown.)

View file

@ -0,0 +1,12 @@
import logging
# Set up package logger
logger = logging.getLogger("litellm_proxy_extras")
if not logger.handlers: # Only add handler if none exists
handler = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

View file

@ -0,0 +1,360 @@
-- CreateTable
CREATE TABLE "LiteLLM_BudgetTable" (
"budget_id" TEXT NOT NULL,
"max_budget" DOUBLE PRECISION,
"soft_budget" DOUBLE PRECISION,
"max_parallel_requests" INTEGER,
"tpm_limit" BIGINT,
"rpm_limit" BIGINT,
"model_max_budget" JSONB,
"budget_duration" TEXT,
"budget_reset_at" TIMESTAMP(3),
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"created_by" TEXT NOT NULL,
"updated_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"updated_by" TEXT NOT NULL,
CONSTRAINT "LiteLLM_BudgetTable_pkey" PRIMARY KEY ("budget_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_CredentialsTable" (
"credential_id" TEXT NOT NULL,
"credential_name" TEXT NOT NULL,
"credential_values" JSONB NOT NULL,
"credential_info" JSONB,
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"created_by" TEXT NOT NULL,
"updated_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"updated_by" TEXT NOT NULL,
CONSTRAINT "LiteLLM_CredentialsTable_pkey" PRIMARY KEY ("credential_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_ProxyModelTable" (
"model_id" TEXT NOT NULL,
"model_name" TEXT NOT NULL,
"litellm_params" JSONB NOT NULL,
"model_info" JSONB,
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"created_by" TEXT NOT NULL,
"updated_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"updated_by" TEXT NOT NULL,
CONSTRAINT "LiteLLM_ProxyModelTable_pkey" PRIMARY KEY ("model_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_OrganizationTable" (
"organization_id" TEXT NOT NULL,
"organization_alias" TEXT NOT NULL,
"budget_id" TEXT NOT NULL,
"metadata" JSONB NOT NULL DEFAULT '{}',
"models" TEXT[],
"spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
"model_spend" JSONB NOT NULL DEFAULT '{}',
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"created_by" TEXT NOT NULL,
"updated_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"updated_by" TEXT NOT NULL,
CONSTRAINT "LiteLLM_OrganizationTable_pkey" PRIMARY KEY ("organization_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_ModelTable" (
"id" SERIAL NOT NULL,
"aliases" JSONB,
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"created_by" TEXT NOT NULL,
"updated_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"updated_by" TEXT NOT NULL,
CONSTRAINT "LiteLLM_ModelTable_pkey" PRIMARY KEY ("id")
);
-- CreateTable
CREATE TABLE "LiteLLM_TeamTable" (
"team_id" TEXT NOT NULL,
"team_alias" TEXT,
"organization_id" TEXT,
"admins" TEXT[],
"members" TEXT[],
"members_with_roles" JSONB NOT NULL DEFAULT '{}',
"metadata" JSONB NOT NULL DEFAULT '{}',
"max_budget" DOUBLE PRECISION,
"spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
"models" TEXT[],
"max_parallel_requests" INTEGER,
"tpm_limit" BIGINT,
"rpm_limit" BIGINT,
"budget_duration" TEXT,
"budget_reset_at" TIMESTAMP(3),
"blocked" BOOLEAN NOT NULL DEFAULT false,
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"updated_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"model_spend" JSONB NOT NULL DEFAULT '{}',
"model_max_budget" JSONB NOT NULL DEFAULT '{}',
"model_id" INTEGER,
CONSTRAINT "LiteLLM_TeamTable_pkey" PRIMARY KEY ("team_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_UserTable" (
"user_id" TEXT NOT NULL,
"user_alias" TEXT,
"team_id" TEXT,
"sso_user_id" TEXT,
"organization_id" TEXT,
"password" TEXT,
"teams" TEXT[] DEFAULT ARRAY[]::TEXT[],
"user_role" TEXT,
"max_budget" DOUBLE PRECISION,
"spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
"user_email" TEXT,
"models" TEXT[],
"metadata" JSONB NOT NULL DEFAULT '{}',
"max_parallel_requests" INTEGER,
"tpm_limit" BIGINT,
"rpm_limit" BIGINT,
"budget_duration" TEXT,
"budget_reset_at" TIMESTAMP(3),
"allowed_cache_controls" TEXT[] DEFAULT ARRAY[]::TEXT[],
"model_spend" JSONB NOT NULL DEFAULT '{}',
"model_max_budget" JSONB NOT NULL DEFAULT '{}',
"created_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
"updated_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT "LiteLLM_UserTable_pkey" PRIMARY KEY ("user_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_VerificationToken" (
"token" TEXT NOT NULL,
"key_name" TEXT,
"key_alias" TEXT,
"soft_budget_cooldown" BOOLEAN NOT NULL DEFAULT false,
"spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
"expires" TIMESTAMP(3),
"models" TEXT[],
"aliases" JSONB NOT NULL DEFAULT '{}',
"config" JSONB NOT NULL DEFAULT '{}',
"user_id" TEXT,
"team_id" TEXT,
"permissions" JSONB NOT NULL DEFAULT '{}',
"max_parallel_requests" INTEGER,
"metadata" JSONB NOT NULL DEFAULT '{}',
"blocked" BOOLEAN,
"tpm_limit" BIGINT,
"rpm_limit" BIGINT,
"max_budget" DOUBLE PRECISION,
"budget_duration" TEXT,
"budget_reset_at" TIMESTAMP(3),
"allowed_cache_controls" TEXT[] DEFAULT ARRAY[]::TEXT[],
"model_spend" JSONB NOT NULL DEFAULT '{}',
"model_max_budget" JSONB NOT NULL DEFAULT '{}',
"budget_id" TEXT,
"organization_id" TEXT,
"created_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
"created_by" TEXT,
"updated_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
"updated_by" TEXT,
CONSTRAINT "LiteLLM_VerificationToken_pkey" PRIMARY KEY ("token")
);
-- CreateTable
CREATE TABLE "LiteLLM_EndUserTable" (
"user_id" TEXT NOT NULL,
"alias" TEXT,
"spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
"allowed_model_region" TEXT,
"default_model" TEXT,
"budget_id" TEXT,
"blocked" BOOLEAN NOT NULL DEFAULT false,
CONSTRAINT "LiteLLM_EndUserTable_pkey" PRIMARY KEY ("user_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_Config" (
"param_name" TEXT NOT NULL,
"param_value" JSONB,
CONSTRAINT "LiteLLM_Config_pkey" PRIMARY KEY ("param_name")
);
-- CreateTable
CREATE TABLE "LiteLLM_SpendLogs" (
"request_id" TEXT NOT NULL,
"call_type" TEXT NOT NULL,
"api_key" TEXT NOT NULL DEFAULT '',
"spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
"total_tokens" INTEGER NOT NULL DEFAULT 0,
"prompt_tokens" INTEGER NOT NULL DEFAULT 0,
"completion_tokens" INTEGER NOT NULL DEFAULT 0,
"startTime" TIMESTAMP(3) NOT NULL,
"endTime" TIMESTAMP(3) NOT NULL,
"completionStartTime" TIMESTAMP(3),
"model" TEXT NOT NULL DEFAULT '',
"model_id" TEXT DEFAULT '',
"model_group" TEXT DEFAULT '',
"custom_llm_provider" TEXT DEFAULT '',
"api_base" TEXT DEFAULT '',
"user" TEXT DEFAULT '',
"metadata" JSONB DEFAULT '{}',
"cache_hit" TEXT DEFAULT '',
"cache_key" TEXT DEFAULT '',
"request_tags" JSONB DEFAULT '[]',
"team_id" TEXT,
"end_user" TEXT,
"requester_ip_address" TEXT,
"messages" JSONB DEFAULT '{}',
"response" JSONB DEFAULT '{}',
CONSTRAINT "LiteLLM_SpendLogs_pkey" PRIMARY KEY ("request_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_ErrorLogs" (
"request_id" TEXT NOT NULL,
"startTime" TIMESTAMP(3) NOT NULL,
"endTime" TIMESTAMP(3) NOT NULL,
"api_base" TEXT NOT NULL DEFAULT '',
"model_group" TEXT NOT NULL DEFAULT '',
"litellm_model_name" TEXT NOT NULL DEFAULT '',
"model_id" TEXT NOT NULL DEFAULT '',
"request_kwargs" JSONB NOT NULL DEFAULT '{}',
"exception_type" TEXT NOT NULL DEFAULT '',
"exception_string" TEXT NOT NULL DEFAULT '',
"status_code" TEXT NOT NULL DEFAULT '',
CONSTRAINT "LiteLLM_ErrorLogs_pkey" PRIMARY KEY ("request_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_UserNotifications" (
"request_id" TEXT NOT NULL,
"user_id" TEXT NOT NULL,
"models" TEXT[],
"justification" TEXT NOT NULL,
"status" TEXT NOT NULL,
CONSTRAINT "LiteLLM_UserNotifications_pkey" PRIMARY KEY ("request_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_TeamMembership" (
"user_id" TEXT NOT NULL,
"team_id" TEXT NOT NULL,
"spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
"budget_id" TEXT,
CONSTRAINT "LiteLLM_TeamMembership_pkey" PRIMARY KEY ("user_id","team_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_OrganizationMembership" (
"user_id" TEXT NOT NULL,
"organization_id" TEXT NOT NULL,
"user_role" TEXT,
"spend" DOUBLE PRECISION DEFAULT 0.0,
"budget_id" TEXT,
"created_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
"updated_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT "LiteLLM_OrganizationMembership_pkey" PRIMARY KEY ("user_id","organization_id")
);
-- CreateTable
CREATE TABLE "LiteLLM_InvitationLink" (
"id" TEXT NOT NULL,
"user_id" TEXT NOT NULL,
"is_accepted" BOOLEAN NOT NULL DEFAULT false,
"accepted_at" TIMESTAMP(3),
"expires_at" TIMESTAMP(3) NOT NULL,
"created_at" TIMESTAMP(3) NOT NULL,
"created_by" TEXT NOT NULL,
"updated_at" TIMESTAMP(3) NOT NULL,
"updated_by" TEXT NOT NULL,
CONSTRAINT "LiteLLM_InvitationLink_pkey" PRIMARY KEY ("id")
);
-- CreateTable
CREATE TABLE "LiteLLM_AuditLog" (
"id" TEXT NOT NULL,
"updated_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"changed_by" TEXT NOT NULL DEFAULT '',
"changed_by_api_key" TEXT NOT NULL DEFAULT '',
"action" TEXT NOT NULL,
"table_name" TEXT NOT NULL,
"object_id" TEXT NOT NULL,
"before_value" JSONB,
"updated_values" JSONB,
CONSTRAINT "LiteLLM_AuditLog_pkey" PRIMARY KEY ("id")
);
-- CreateIndex
CREATE UNIQUE INDEX "LiteLLM_CredentialsTable_credential_name_key" ON "LiteLLM_CredentialsTable"("credential_name");
-- CreateIndex
CREATE UNIQUE INDEX "LiteLLM_TeamTable_model_id_key" ON "LiteLLM_TeamTable"("model_id");
-- CreateIndex
CREATE UNIQUE INDEX "LiteLLM_UserTable_sso_user_id_key" ON "LiteLLM_UserTable"("sso_user_id");
-- CreateIndex
CREATE INDEX "LiteLLM_SpendLogs_startTime_idx" ON "LiteLLM_SpendLogs"("startTime");
-- CreateIndex
CREATE INDEX "LiteLLM_SpendLogs_end_user_idx" ON "LiteLLM_SpendLogs"("end_user");
-- CreateIndex
CREATE UNIQUE INDEX "LiteLLM_OrganizationMembership_user_id_organization_id_key" ON "LiteLLM_OrganizationMembership"("user_id", "organization_id");
-- AddForeignKey
ALTER TABLE "LiteLLM_OrganizationTable" ADD CONSTRAINT "LiteLLM_OrganizationTable_budget_id_fkey" FOREIGN KEY ("budget_id") REFERENCES "LiteLLM_BudgetTable"("budget_id") ON DELETE RESTRICT ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_TeamTable" ADD CONSTRAINT "LiteLLM_TeamTable_organization_id_fkey" FOREIGN KEY ("organization_id") REFERENCES "LiteLLM_OrganizationTable"("organization_id") ON DELETE SET NULL ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_TeamTable" ADD CONSTRAINT "LiteLLM_TeamTable_model_id_fkey" FOREIGN KEY ("model_id") REFERENCES "LiteLLM_ModelTable"("id") ON DELETE SET NULL ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_UserTable" ADD CONSTRAINT "LiteLLM_UserTable_organization_id_fkey" FOREIGN KEY ("organization_id") REFERENCES "LiteLLM_OrganizationTable"("organization_id") ON DELETE SET NULL ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_VerificationToken" ADD CONSTRAINT "LiteLLM_VerificationToken_budget_id_fkey" FOREIGN KEY ("budget_id") REFERENCES "LiteLLM_BudgetTable"("budget_id") ON DELETE SET NULL ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_VerificationToken" ADD CONSTRAINT "LiteLLM_VerificationToken_organization_id_fkey" FOREIGN KEY ("organization_id") REFERENCES "LiteLLM_OrganizationTable"("organization_id") ON DELETE SET NULL ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_EndUserTable" ADD CONSTRAINT "LiteLLM_EndUserTable_budget_id_fkey" FOREIGN KEY ("budget_id") REFERENCES "LiteLLM_BudgetTable"("budget_id") ON DELETE SET NULL ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_TeamMembership" ADD CONSTRAINT "LiteLLM_TeamMembership_budget_id_fkey" FOREIGN KEY ("budget_id") REFERENCES "LiteLLM_BudgetTable"("budget_id") ON DELETE SET NULL ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_OrganizationMembership" ADD CONSTRAINT "LiteLLM_OrganizationMembership_user_id_fkey" FOREIGN KEY ("user_id") REFERENCES "LiteLLM_UserTable"("user_id") ON DELETE RESTRICT ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_OrganizationMembership" ADD CONSTRAINT "LiteLLM_OrganizationMembership_organization_id_fkey" FOREIGN KEY ("organization_id") REFERENCES "LiteLLM_OrganizationTable"("organization_id") ON DELETE RESTRICT ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_OrganizationMembership" ADD CONSTRAINT "LiteLLM_OrganizationMembership_budget_id_fkey" FOREIGN KEY ("budget_id") REFERENCES "LiteLLM_BudgetTable"("budget_id") ON DELETE SET NULL ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_InvitationLink" ADD CONSTRAINT "LiteLLM_InvitationLink_user_id_fkey" FOREIGN KEY ("user_id") REFERENCES "LiteLLM_UserTable"("user_id") ON DELETE RESTRICT ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_InvitationLink" ADD CONSTRAINT "LiteLLM_InvitationLink_created_by_fkey" FOREIGN KEY ("created_by") REFERENCES "LiteLLM_UserTable"("user_id") ON DELETE RESTRICT ON UPDATE CASCADE;
-- AddForeignKey
ALTER TABLE "LiteLLM_InvitationLink" ADD CONSTRAINT "LiteLLM_InvitationLink_updated_by_fkey" FOREIGN KEY ("updated_by") REFERENCES "LiteLLM_UserTable"("user_id") ON DELETE RESTRICT ON UPDATE CASCADE;

View file

@ -0,0 +1,33 @@
-- CreateTable
CREATE TABLE "LiteLLM_DailyUserSpend" (
"id" TEXT NOT NULL,
"user_id" TEXT NOT NULL,
"date" TEXT NOT NULL,
"api_key" TEXT NOT NULL,
"model" TEXT NOT NULL,
"model_group" TEXT,
"custom_llm_provider" TEXT,
"prompt_tokens" INTEGER NOT NULL DEFAULT 0,
"completion_tokens" INTEGER NOT NULL DEFAULT 0,
"spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"updated_at" TIMESTAMP(3) NOT NULL,
CONSTRAINT "LiteLLM_DailyUserSpend_pkey" PRIMARY KEY ("id")
);
-- CreateIndex
CREATE INDEX "LiteLLM_DailyUserSpend_date_idx" ON "LiteLLM_DailyUserSpend"("date");
-- CreateIndex
CREATE INDEX "LiteLLM_DailyUserSpend_user_id_idx" ON "LiteLLM_DailyUserSpend"("user_id");
-- CreateIndex
CREATE INDEX "LiteLLM_DailyUserSpend_api_key_idx" ON "LiteLLM_DailyUserSpend"("api_key");
-- CreateIndex
CREATE INDEX "LiteLLM_DailyUserSpend_model_idx" ON "LiteLLM_DailyUserSpend"("model");
-- CreateIndex
CREATE UNIQUE INDEX "LiteLLM_DailyUserSpend_user_id_date_api_key_model_custom_ll_key" ON "LiteLLM_DailyUserSpend"("user_id", "date", "api_key", "model", "custom_llm_provider");

View file

@ -0,0 +1,3 @@
-- AlterTable
ALTER TABLE "LiteLLM_DailyUserSpend" ADD COLUMN "api_requests" INTEGER NOT NULL DEFAULT 0;

View file

@ -0,0 +1,14 @@
-- CreateEnum
CREATE TYPE "JobStatus" AS ENUM ('ACTIVE', 'INACTIVE');
-- CreateTable
CREATE TABLE "LiteLLM_CronJob" (
"cronjob_id" TEXT NOT NULL,
"pod_id" TEXT NOT NULL,
"status" "JobStatus" NOT NULL DEFAULT 'INACTIVE',
"last_updated" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"ttl" TIMESTAMP(3) NOT NULL,
CONSTRAINT "LiteLLM_CronJob_pkey" PRIMARY KEY ("cronjob_id")
);

View file

@ -0,0 +1 @@
provider = "postgresql"

View file

@ -0,0 +1,80 @@
import os
import random
import subprocess
import time
from typing import Optional
from litellm_proxy_extras._logging import logger
def str_to_bool(value: Optional[str]) -> bool:
if value is None:
return False
return value.lower() in ("true", "1", "t", "y", "yes")
class ProxyExtrasDBManager:
@staticmethod
def setup_database(schema_path: str, use_migrate: bool = False) -> bool:
"""
Set up the database using either prisma migrate or prisma db push
Uses migrations from litellm-proxy-extras package
Args:
schema_path (str): Path to the Prisma schema file
use_migrate (bool): Whether to use prisma migrate instead of db push
Returns:
bool: True if setup was successful, False otherwise
"""
use_migrate = str_to_bool(os.getenv("USE_PRISMA_MIGRATE")) or use_migrate
for attempt in range(4):
original_dir = os.getcwd()
schema_dir = os.path.dirname(schema_path)
os.chdir(schema_dir)
try:
if use_migrate:
logger.info("Running prisma migrate deploy")
try:
# Set migrations directory for Prisma
subprocess.run(
["prisma", "migrate", "deploy"],
timeout=60,
check=True,
capture_output=True,
text=True,
)
logger.info("prisma migrate deploy completed")
return True
except subprocess.CalledProcessError as e:
logger.info(f"prisma db error: {e.stderr}, e: {e.stdout}")
if (
"P3005" in e.stderr
and "database schema is not empty" in e.stderr
):
logger.info("Error: Database schema is not empty")
return False
else:
# Use prisma db push with increased timeout
subprocess.run(
["prisma", "db", "push", "--accept-data-loss"],
timeout=60,
check=True,
)
return True
except subprocess.TimeoutExpired:
logger.info(f"Attempt {attempt + 1} timed out")
time.sleep(random.randrange(5, 15))
except subprocess.CalledProcessError as e:
attempts_left = 3 - attempt
retry_msg = (
f" Retrying... ({attempts_left} attempts left)"
if attempts_left > 0
else ""
)
logger.info(f"The process failed to execute. Details: {e}.{retry_msg}")
time.sleep(random.randrange(5, 15))
finally:
os.chdir(original_dir)
return False
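A minimal usage sketch of the manager above. The module path and schema location are assumptions made for illustration - adjust them to your install:
```python
# Sketch only - module path and schema path are assumptions, not confirmed by this diff
from litellm_proxy_extras.utils import ProxyExtrasDBManager

# use_migrate=True runs `prisma migrate deploy`; otherwise it falls back to `prisma db push`
ok = ProxyExtrasDBManager.setup_database(
    schema_path="litellm/proxy/schema.prisma",
    use_migrate=True,
)
print("database ready:", ok)
```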

View file

@ -0,0 +1,30 @@
[tool.poetry]
name = "litellm-proxy-extras"
version = "0.1.1"
description = "Additional files for the LiteLLM Proxy. Reduces the size of the main litellm package."
authors = ["BerriAI"]
readme = "README.md"
[tool.poetry.urls]
homepage = "https://litellm.ai"
Homepage = "https://litellm.ai"
repository = "https://github.com/BerriAI/litellm"
Repository = "https://github.com/BerriAI/litellm"
documentation = "https://docs.litellm.ai"
Documentation = "https://docs.litellm.ai"
[tool.poetry.dependencies]
python = ">=3.8.1,<4.0, !=3.9.7"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "0.1.1"
version_files = [
"pyproject.toml:version",
"../requirements.txt:litellm-proxy-extras==",
"../pyproject.toml:litellm-proxy-extras = {version = \""
]

View file

View file

@ -2,7 +2,7 @@
import warnings
warnings.filterwarnings("ignore", message=".*conflict with protected namespace.*")
### INIT VARIABLES ##########
### INIT VARIABLES ###########
import threading
import os
from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
@ -122,19 +122,19 @@ langsmith_batch_size: Optional[int] = None
prometheus_initialize_budget_metrics: Optional[bool] = False
argilla_batch_size: Optional[int] = None
datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
gcs_pub_sub_use_v1: Optional[bool] = (
False # if you want to use v1 gcs pubsub logged payload
)
gcs_pub_sub_use_v1: Optional[
bool
] = False # if you want to use v1 gcs pubsub logged payload
argilla_transformation_object: Optional[Dict[str, Any]] = None
_async_input_callback: List[Union[str, Callable, CustomLogger]] = (
[]
) # internal variable - async custom callbacks are routed here.
_async_success_callback: List[Union[str, Callable, CustomLogger]] = (
[]
) # internal variable - async custom callbacks are routed here.
_async_failure_callback: List[Union[str, Callable, CustomLogger]] = (
[]
) # internal variable - async custom callbacks are routed here.
_async_input_callback: List[
Union[str, Callable, CustomLogger]
] = [] # internal variable - async custom callbacks are routed here.
_async_success_callback: List[
Union[str, Callable, CustomLogger]
] = [] # internal variable - async custom callbacks are routed here.
_async_failure_callback: List[
Union[str, Callable, CustomLogger]
] = [] # internal variable - async custom callbacks are routed here.
pre_call_rules: List[Callable] = []
post_call_rules: List[Callable] = []
turn_off_message_logging: Optional[bool] = False
@ -142,18 +142,18 @@ log_raw_request_response: bool = False
redact_messages_in_exceptions: Optional[bool] = False
redact_user_api_key_info: Optional[bool] = False
filter_invalid_headers: Optional[bool] = False
add_user_information_to_llm_headers: Optional[bool] = (
None # adds user_id, team_id, token hash (params from StandardLoggingMetadata) to request headers
)
add_user_information_to_llm_headers: Optional[
bool
] = None # adds user_id, team_id, token hash (params from StandardLoggingMetadata) to request headers
store_audit_logs = False # Enterprise feature, allow users to see audit logs
### end of callbacks #############
email: Optional[str] = (
None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
token: Optional[str] = (
None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
email: Optional[
str
] = None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
token: Optional[
str
] = None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
telemetry = True
max_tokens = 256 # OpenAI Defaults
drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
@ -229,24 +229,20 @@ enable_loadbalancing_on_batch_endpoints: Optional[bool] = None
enable_caching_on_provider_specific_optional_params: bool = (
False # feature-flag for caching on optional params - e.g. 'top_k'
)
caching: bool = (
False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
caching_with_models: bool = (
False # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
cache: Optional[Cache] = (
None # cache object <- use this - https://docs.litellm.ai/docs/caching
)
caching: bool = False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
caching_with_models: bool = False # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
cache: Optional[
Cache
] = None # cache object <- use this - https://docs.litellm.ai/docs/caching
default_in_memory_ttl: Optional[float] = None
default_redis_ttl: Optional[float] = None
default_redis_batch_cache_expiry: Optional[float] = None
model_alias_map: Dict[str, str] = {}
model_group_alias_map: Dict[str, str] = {}
max_budget: float = 0.0 # set the max budget across all providers
budget_duration: Optional[str] = (
None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
)
budget_duration: Optional[
str
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
default_soft_budget: float = (
50.0 # by default all litellm proxy keys have a soft budget of 50.0
)
@ -255,15 +251,11 @@ forward_traceparent_to_llm_provider: bool = False
_current_cost = 0.0 # private variable, used if max budget is set
error_logs: Dict = {}
add_function_to_prompt: bool = (
False # if function calling not supported by api, append function call details to system prompt
)
add_function_to_prompt: bool = False # if function calling not supported by api, append function call details to system prompt
client_session: Optional[httpx.Client] = None
aclient_session: Optional[httpx.AsyncClient] = None
model_fallbacks: Optional[List] = None # Deprecated for 'litellm.fallbacks'
model_cost_map_url: str = (
"https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
)
model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
suppress_debug_info = False
dynamodb_table_name: Optional[str] = None
s3_callback_params: Optional[Dict] = None
@ -285,9 +277,7 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None
force_ipv4: bool = (
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
)
force_ipv4: bool = False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
module_level_aclient = AsyncHTTPHandler(
timeout=request_timeout, client_alias="module level aclient"
)
@ -301,13 +291,13 @@ fallbacks: Optional[List] = None
context_window_fallbacks: Optional[List] = None
content_policy_fallbacks: Optional[List] = None
allowed_fails: int = 3
num_retries_per_request: Optional[int] = (
None # for the request overall (incl. fallbacks + model retries)
)
num_retries_per_request: Optional[
int
] = None # for the request overall (incl. fallbacks + model retries)
####### SECRET MANAGERS #####################
secret_manager_client: Optional[Any] = (
None # list of instantiated key management clients - e.g. azure kv, infisical, etc.
)
secret_manager_client: Optional[
Any
] = None # list of instantiated key management clients - e.g. azure kv, infisical, etc.
_google_kms_resource_name: Optional[str] = None
_key_management_system: Optional[KeyManagementSystem] = None
_key_management_settings: KeyManagementSettings = KeyManagementSettings()
@ -813,6 +803,7 @@ from .llms.oobabooga.chat.transformation import OobaboogaConfig
from .llms.maritalk import MaritalkConfig
from .llms.openrouter.chat.transformation import OpenrouterConfig
from .llms.anthropic.chat.transformation import AnthropicConfig
from .llms.anthropic.common_utils import AnthropicModelInfo
from .llms.groq.stt.transformation import GroqSTTConfig
from .llms.anthropic.completion.transformation import AnthropicTextConfig
from .llms.triton.completion.transformation import TritonConfig
@ -848,6 +839,7 @@ from .llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
VertexGeminiConfig,
VertexGeminiConfig as VertexAIConfig,
)
from .llms.gemini.common_utils import GeminiModelInfo
from .llms.gemini.chat.transformation import (
GoogleAIStudioGeminiConfig,
GoogleAIStudioGeminiConfig as GeminiConfig, # aliased to maintain backwards compatibility
@ -950,6 +942,12 @@ openaiOSeriesConfig = OpenAIOSeriesConfig()
from .llms.openai.chat.gpt_transformation import (
OpenAIGPTConfig,
)
from .llms.openai.transcriptions.whisper_transformation import (
OpenAIWhisperAudioTranscriptionConfig,
)
from .llms.openai.transcriptions.gpt_transformation import (
OpenAIGPTAudioTranscriptionConfig,
)
openAIGPTConfig = OpenAIGPTConfig()
from .llms.openai.chat.gpt_audio_transformation import (
@ -978,6 +976,7 @@ from .llms.fireworks_ai.embed.fireworks_ai_transformation import (
from .llms.friendliai.chat.transformation import FriendliaiChatConfig
from .llms.jina_ai.embedding.transformation import JinaAIEmbeddingConfig
from .llms.xai.chat.transformation import XAIChatConfig
from .llms.xai.common_utils import XAIModelInfo
from .llms.volcengine import VolcEngineConfig
from .llms.codestral.completion.transformation import CodestralTextCompletionConfig
from .llms.azure.azure import (
@ -1047,10 +1046,10 @@ from .types.llms.custom_llm import CustomLLMItem
from .types.utils import GenericStreamingChunk
custom_provider_map: List[CustomLLMItem] = []
_custom_providers: List[str] = (
[]
) # internal helper util, used to track names of custom providers
disable_hf_tokenizer_download: Optional[bool] = (
None # disable huggingface tokenizer download. Defaults to openai clk100
)
_custom_providers: List[
str
] = [] # internal helper util, used to track names of custom providers
disable_hf_tokenizer_download: Optional[
bool
] = None # disable huggingface tokenizer download. Defaults to openai clk100
global_disable_no_log_param: bool = False

View file

@ -1,6 +1,7 @@
import json
import logging
import os
import sys
from datetime import datetime
from logging import Formatter
@ -40,9 +41,56 @@ class JsonFormatter(Formatter):
return json.dumps(json_record)
# Function to set up exception handlers for JSON logging
def _setup_json_exception_handlers(formatter):
# Create a handler with JSON formatting for exceptions
error_handler = logging.StreamHandler()
error_handler.setFormatter(formatter)
# Setup excepthook for uncaught exceptions
def json_excepthook(exc_type, exc_value, exc_traceback):
record = logging.LogRecord(
name="LiteLLM",
level=logging.ERROR,
pathname="",
lineno=0,
msg=str(exc_value),
args=(),
exc_info=(exc_type, exc_value, exc_traceback),
)
error_handler.handle(record)
sys.excepthook = json_excepthook
# Configure asyncio exception handler if possible
try:
import asyncio
def async_json_exception_handler(loop, context):
exception = context.get("exception")
if exception:
record = logging.LogRecord(
name="LiteLLM",
level=logging.ERROR,
pathname="",
lineno=0,
msg=str(exception),
args=(),
exc_info=None,
)
error_handler.handle(record)
else:
loop.default_exception_handler(context)
asyncio.get_event_loop().set_exception_handler(async_json_exception_handler)
except Exception:
pass
# Create a formatter and set it for the handler
if json_logs:
handler.setFormatter(JsonFormatter())
_setup_json_exception_handlers(JsonFormatter())
else:
formatter = logging.Formatter(
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(filename)s:%(lineno)s - %(message)s",
@ -65,18 +113,24 @@ def _turn_on_json():
handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
# Define a list of the loggers to update
loggers = [verbose_router_logger, verbose_proxy_logger, verbose_logger]
# Define all loggers to update, including root logger
loggers = [logging.getLogger()] + [
verbose_router_logger,
verbose_proxy_logger,
verbose_logger,
]
# Iterate through each logger and update its handlers
for logger in loggers:
# Remove all existing handlers
for h in logger.handlers[:]:
logger.removeHandler(h)
# Add the new handler
logger.addHandler(handler)
# Set up exception handlers
_setup_json_exception_handlers(JsonFormatter())
def _turn_on_debug():
verbose_logger.setLevel(level=logging.DEBUG) # set package log to debug

View file

@ -202,6 +202,7 @@ def init_redis_cluster(redis_kwargs) -> redis.RedisCluster:
def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
sentinel_nodes = redis_kwargs.get("sentinel_nodes")
sentinel_password = redis_kwargs.get("sentinel_password")
service_name = redis_kwargs.get("service_name")
if not sentinel_nodes or not service_name:
@ -212,7 +213,11 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
verbose_logger.debug("init_redis_sentinel: sentinel nodes are being initialized.")
# Set up the Sentinel client
sentinel = redis.Sentinel(sentinel_nodes, socket_timeout=0.1)
sentinel = redis.Sentinel(
sentinel_nodes,
socket_timeout=0.1,
password=sentinel_password,
)
# Return the master instance for the given service

View file

@ -15,7 +15,7 @@ from .types.services import ServiceLoggerPayload, ServiceTypes
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
Span = _Span
Span = Union[_Span, Any]
OTELClass = OpenTelemetry
else:
Span = Any

View file

@ -153,7 +153,6 @@ def create_batch(
)
api_base: Optional[str] = None
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
@ -358,7 +357,6 @@ def retrieve_batch(
_is_async = kwargs.pop("aretrieve_batch", False) is True
api_base: Optional[str] = None
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base

View file

@ -9,12 +9,12 @@ Has 4 methods:
"""
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Optional, Union
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
Span = _Span
Span = Union[_Span, Any]
else:
Span = Any

View file

@ -66,9 +66,7 @@ class CachingHandlerResponse(BaseModel):
cached_result: Optional[Any] = None
final_embedding_cached_response: Optional[EmbeddingResponse] = None
embedding_all_elements_cache_hit: bool = (
False # this is set to True when all elements in the list have a cache hit in the embedding cache, if true return the final_embedding_cached_response no need to make an API call
)
embedding_all_elements_cache_hit: bool = False # this is set to True when all elements in the list have a cache hit in the embedding cache, if true return the final_embedding_cached_response no need to make an API call
class LLMCachingHandler:
@ -738,7 +736,6 @@ class LLMCachingHandler:
if self._should_store_result_in_cache(
original_function=self.original_function, kwargs=new_kwargs
):
litellm.cache.add_cache(result, **new_kwargs)
return
@ -865,9 +862,9 @@ class LLMCachingHandler:
}
if litellm.cache is not None:
litellm_params["preset_cache_key"] = (
litellm.cache._get_preset_cache_key_from_kwargs(**kwargs)
)
litellm_params[
"preset_cache_key"
] = litellm.cache._get_preset_cache_key_from_kwargs(**kwargs)
else:
litellm_params["preset_cache_key"] = None

View file

@ -1,12 +1,12 @@
import json
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Optional, Union
from .base_cache import BaseCache
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
Span = _Span
Span = Union[_Span, Any]
else:
Span = Any

View file

@ -12,7 +12,7 @@ import asyncio
import time
import traceback
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING, Any, List, Optional
from typing import TYPE_CHECKING, Any, List, Optional, Union
import litellm
from litellm._logging import print_verbose, verbose_logger
@ -24,7 +24,7 @@ from .redis_cache import RedisCache
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
Span = _Span
Span = Union[_Span, Any]
else:
Span = Any

View file

@ -8,7 +8,6 @@ from .in_memory_cache import InMemoryCache
class LLMClientCache(InMemoryCache):
def update_cache_key_with_event_loop(self, key):
"""
Add the event loop to the cache key, to prevent event loop closed errors.

View file

@ -34,7 +34,7 @@ if TYPE_CHECKING:
cluster_pipeline = ClusterPipeline
async_redis_client = Redis
async_redis_cluster_client = RedisCluster
Span = _Span
Span = Union[_Span, Any]
else:
pipeline = Any
cluster_pipeline = Any
@ -57,7 +57,6 @@ class RedisCache(BaseCache):
socket_timeout: Optional[float] = 5.0, # default 5 second timeout
**kwargs,
):
from litellm._service_logger import ServiceLogging
from .._redis import get_redis_client, get_redis_connection_pool
@ -1045,3 +1044,109 @@ class RedisCache(BaseCache):
except Exception as e:
verbose_logger.debug(f"Redis TTL Error: {e}")
return None
async def async_rpush(
self,
key: str,
values: List[Any],
parent_otel_span: Optional[Span] = None,
**kwargs,
) -> int:
"""
Append one or multiple values to a list stored at key
Args:
key: The Redis key of the list
values: One or more values to append to the list
parent_otel_span: Optional parent OpenTelemetry span
Returns:
int: The length of the list after the push operation
"""
_redis_client: Any = self.init_async_client()
start_time = time.time()
try:
response = await _redis_client.rpush(key, *values)
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="async_rpush",
)
)
return response
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="async_rpush",
)
)
verbose_logger.error(
f"LiteLLM Redis Cache RPUSH: - Got exception from REDIS : {str(e)}"
)
raise e
async def async_lpop(
self,
key: str,
count: Optional[int] = None,
parent_otel_span: Optional[Span] = None,
**kwargs,
) -> Union[Any, List[Any]]:
_redis_client: Any = self.init_async_client()
start_time = time.time()
print_verbose(f"LPOP from Redis list: key: {key}, count: {count}")
try:
result = await _redis_client.lpop(key, count)
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="async_lpop",
)
)
# Handle result parsing if needed
if isinstance(result, bytes):
try:
return result.decode("utf-8")
except Exception:
return result
elif isinstance(result, list) and all(
isinstance(item, bytes) for item in result
):
try:
return [item.decode("utf-8") for item in result]
except Exception:
return result
return result
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="async_lpop",
)
)
verbose_logger.error(
f"LiteLLM Redis Cache LPOP: - Got exception from REDIS : {str(e)}"
)
raise e

View file

@ -5,7 +5,7 @@ Key differences:
- RedisClient NEEDs to be re-used across requests, adds 3000ms latency if it's re-created
"""
from typing import TYPE_CHECKING, Any, List, Optional
from typing import TYPE_CHECKING, Any, List, Optional, Union
from litellm.caching.redis_cache import RedisCache
@ -16,7 +16,7 @@ if TYPE_CHECKING:
pipeline = Pipeline
async_redis_client = Redis
Span = _Span
Span = Union[_Span, Any]
else:
pipeline = Any
async_redis_client = Any

View file

@ -13,11 +13,15 @@ import ast
import asyncio
import json
import os
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple, cast
import litellm
from litellm._logging import print_verbose
from litellm.litellm_core_utils.prompt_templates.common_utils import get_str_from_messages
from litellm.litellm_core_utils.prompt_templates.common_utils import (
get_str_from_messages,
)
from litellm.types.utils import EmbeddingResponse
from .base_cache import BaseCache
@ -87,14 +91,16 @@ class RedisSemanticCache(BaseCache):
if redis_url is None:
try:
# Attempt to use provided parameters or fallback to environment variables
host = host or os.environ['REDIS_HOST']
port = port or os.environ['REDIS_PORT']
password = password or os.environ['REDIS_PASSWORD']
host = host or os.environ["REDIS_HOST"]
port = port or os.environ["REDIS_PORT"]
password = password or os.environ["REDIS_PASSWORD"]
except KeyError as e:
# Raise a more informative exception if any of the required keys are missing
missing_var = e.args[0]
raise ValueError(f"Missing required Redis configuration: {missing_var}. "
f"Provide {missing_var} or redis_url.") from e
raise ValueError(
f"Missing required Redis configuration: {missing_var}. "
f"Provide {missing_var} or redis_url."
) from e
redis_url = f"redis://:{password}@{host}:{port}"
@ -137,10 +143,13 @@ class RedisSemanticCache(BaseCache):
List[float]: The embedding vector
"""
# Create an embedding from prompt
embedding_response = litellm.embedding(
embedding_response = cast(
EmbeddingResponse,
litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
),
)
embedding = embedding_response["data"][0]["embedding"]
return embedding
@ -186,6 +195,7 @@ class RedisSemanticCache(BaseCache):
"""
print_verbose(f"Redis semantic-cache set_cache, kwargs: {kwargs}")
value_str: Optional[str] = None
try:
# Extract the prompt from messages
messages = kwargs.get("messages", [])
@ -203,7 +213,9 @@ class RedisSemanticCache(BaseCache):
else:
self.llmcache.store(prompt, value_str)
except Exception as e:
print_verbose(f"Error setting {value_str} in the Redis semantic cache: {str(e)}")
print_verbose(
f"Error setting {value_str or value} in the Redis semantic cache: {str(e)}"
)
def get_cache(self, key: str, **kwargs) -> Any:
"""
@ -336,13 +348,13 @@ class RedisSemanticCache(BaseCache):
prompt,
value_str,
vector=prompt_embedding, # Pass through custom embedding
ttl=ttl
ttl=ttl,
)
else:
await self.llmcache.astore(
prompt,
value_str,
vector=prompt_embedding # Pass through custom embedding
vector=prompt_embedding, # Pass through custom embedding
)
except Exception as e:
print_verbose(f"Error in async_set_cache: {str(e)}")
@ -374,14 +386,13 @@ class RedisSemanticCache(BaseCache):
prompt_embedding = await self._get_async_embedding(prompt, **kwargs)
# Check the cache for semantically similar prompts
results = await self.llmcache.acheck(
prompt=prompt,
vector=prompt_embedding
)
results = await self.llmcache.acheck(prompt=prompt, vector=prompt_embedding)
# handle results / cache hit
if not results:
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 # TODO why here but not above??
kwargs.setdefault("metadata", {})[
"semantic-similarity"
] = 0.0 # TODO why here but not above??
return None
cache_hit = results[0]
@ -420,7 +431,9 @@ class RedisSemanticCache(BaseCache):
aindex = await self.llmcache._get_async_index()
return await aindex.info()
async def async_set_cache_pipeline(self, cache_list: List[Tuple[str, Any]], **kwargs) -> None:
async def async_set_cache_pipeline(
self, cache_list: List[Tuple[str, Any]], **kwargs
) -> None:
"""
Asynchronously store multiple values in the semantic cache.

View file

@ -123,7 +123,7 @@ class S3Cache(BaseCache):
) # Convert string to dictionary
except Exception:
cached_response = ast.literal_eval(cached_response)
if type(cached_response) is not dict:
if not isinstance(cached_response, dict):
cached_response = dict(cached_response)
verbose_logger.debug(
f"Got S3 Cache: key: {key}, cached_response {cached_response}. Type Response {type(cached_response)}"

View file

@ -4,9 +4,11 @@ ROUTER_MAX_FALLBACKS = 5
DEFAULT_BATCH_SIZE = 512
DEFAULT_FLUSH_INTERVAL_SECONDS = 5
DEFAULT_MAX_RETRIES = 2
DEFAULT_MAX_RECURSE_DEPTH = 10
DEFAULT_FAILURE_THRESHOLD_PERCENT = (
0.5 # default cooldown a deployment if 50% of requests fail in a given minute
)
DEFAULT_MAX_TOKENS = 4096
DEFAULT_REDIS_SYNC_INTERVAL = 1
DEFAULT_COOLDOWN_TIME_SECONDS = 5
DEFAULT_REPLICATE_POLLING_RETRIES = 5
@ -16,6 +18,8 @@ DEFAULT_IMAGE_WIDTH = 300
DEFAULT_IMAGE_HEIGHT = 300
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
#### Networking settings ####
@ -414,6 +418,7 @@ RESPONSE_FORMAT_TOOL_NAME = "json_tool_call" # default tool name used when conv
########################### Logging Callback Constants ###########################
AZURE_STORAGE_MSFT_VERSION = "2019-07-07"
MCP_TOOL_NAME_PREFIX = "mcp_tool"
########################### LiteLLM Proxy Specific Constants ###########################
########################################################################################
@ -441,3 +446,7 @@ HEALTH_CHECK_TIMEOUT_SECONDS = 60 # 60 seconds
UI_SESSION_TOKEN_TEAM_ID = "litellm-dashboard"
LITELLM_PROXY_ADMIN_NAME = "default_user_id"
########################### DB CRON JOB NAMES ###########################
DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60 # 1 minute

View file

@ -2,7 +2,7 @@
## File for 'response_cost' calculation in Logging
import time
from functools import lru_cache
from typing import Any, List, Literal, Optional, Tuple, Union
from typing import Any, List, Literal, Optional, Tuple, Union, cast
from pydantic import BaseModel
@ -275,15 +275,13 @@ def cost_per_token( # noqa: PLR0915
custom_llm_provider=custom_llm_provider,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
usage=usage_block,
)
elif cost_router == "cost_per_token":
return google_cost_per_token(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
usage=usage_block,
)
elif custom_llm_provider == "anthropic":
return anthropic_cost_per_token(model=model, usage=usage_block)
@ -464,13 +462,36 @@ def _model_contains_known_llm_provider(model: str) -> bool:
def _get_usage_object(
completion_response: Any,
) -> Optional[Usage]:
usage_obj: Optional[Usage] = None
if completion_response is not None and isinstance(
completion_response, ModelResponse
):
usage_obj = completion_response.get("usage")
usage_obj = cast(
Union[Usage, ResponseAPIUsage, dict, BaseModel],
(
completion_response.get("usage")
if isinstance(completion_response, dict)
else getattr(completion_response, "get", lambda x: None)("usage")
),
)
if usage_obj is None:
return None
if isinstance(usage_obj, Usage):
return usage_obj
elif (
usage_obj is not None
and (isinstance(usage_obj, dict) or isinstance(usage_obj, ResponseAPIUsage))
and ResponseAPILoggingUtils._is_response_api_usage(usage_obj)
):
return ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
usage_obj
)
elif isinstance(usage_obj, dict):
return Usage(**usage_obj)
elif isinstance(usage_obj, BaseModel):
return Usage(**usage_obj.model_dump())
else:
verbose_logger.debug(
f"Unknown usage object type: {type(usage_obj)}, usage_obj: {usage_obj}"
)
return None
def _is_known_usage_objects(usage_obj):
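A hedged sketch of the dict branch the reworked helper now covers; it assumes litellm.types.utils.Usage accepts these keyword fields, as it does elsewhere in this diff, and the token counts are illustrative:

    from litellm.types.utils import Usage

    raw_usage = {"prompt_tokens": 12, "completion_tokens": 7, "total_tokens": 19}
    usage = Usage(**raw_usage)   # the isinstance(usage_obj, dict) branch above
    assert usage.total_tokens == 19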
@ -559,7 +580,6 @@ def completion_cost( # noqa: PLR0915
- For un-mapped Replicate models, the cost is calculated based on the total time used for the request.
"""
try:
call_type = _infer_call_type(call_type, completion_response) or "completion"
if (
@ -664,6 +684,7 @@ def completion_cost( # noqa: PLR0915
elif len(prompt) > 0:
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
@ -828,11 +849,14 @@ def get_response_cost_from_hidden_params(
_hidden_params_dict = hidden_params
additional_headers = _hidden_params_dict.get("additional_headers", {})
if additional_headers and "x-litellm-response-cost" in additional_headers:
response_cost = additional_headers["x-litellm-response-cost"]
if (
additional_headers
and "llm_provider-x-litellm-response-cost" in additional_headers
):
response_cost = additional_headers["llm_provider-x-litellm-response-cost"]
if response_cost is None:
return None
return float(additional_headers["x-litellm-response-cost"])
return float(additional_headers["llm_provider-x-litellm-response-cost"])
return None
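A minimal sketch of the renamed header lookup; the hidden_params dict below is an assumed shape for illustration, not taken from the diff:

    hidden_params = {
        "additional_headers": {"llm_provider-x-litellm-response-cost": "0.00042"}
    }
    additional_headers = hidden_params.get("additional_headers", {})
    response_cost = None
    if additional_headers and "llm_provider-x-litellm-response-cost" in additional_headers:
        raw_cost = additional_headers["llm_provider-x-litellm-response-cost"]
        response_cost = float(raw_cost) if raw_cost is not None else None
    assert response_cost == 0.00042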

View file

@ -1,5 +1,5 @@
import json
from typing import List, Literal, Union
from typing import Dict, List, Literal, Union
from mcp import ClientSession
from mcp.types import CallToolRequestParams as MCPCallToolRequestParams
@ -76,8 +76,8 @@ def _get_function_arguments(function: FunctionDefinition) -> dict:
return arguments if isinstance(arguments, dict) else {}
def _transform_openai_tool_call_to_mcp_tool_call_request(
openai_tool: ChatCompletionMessageToolCall,
def transform_openai_tool_call_request_to_mcp_tool_call_request(
openai_tool: Union[ChatCompletionMessageToolCall, Dict],
) -> MCPCallToolRequestParams:
"""Convert an OpenAI ChatCompletionMessageToolCall to an MCP CallToolRequestParams."""
function = openai_tool["function"]
@ -100,9 +100,11 @@ async def call_openai_tool(
Returns:
The result of the MCP tool call.
"""
mcp_tool_call_request_params = _transform_openai_tool_call_to_mcp_tool_call_request(
mcp_tool_call_request_params = (
transform_openai_tool_call_request_to_mcp_tool_call_request(
openai_tool=openai_tool,
)
)
return await call_mcp_tool(
session=session,
call_tool_request_params=mcp_tool_call_request_params,
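A hedged usage sketch of the renamed helper, now that it also accepts a plain dict; the tool name and arguments are hypothetical, and the import path is omitted because it is not shown in this hunk:

    openai_tool = {
        "function": {
            "name": "get_current_weather",          # hypothetical tool name
            "arguments": '{"location": "Paris"}',   # JSON string, as OpenAI tool calls return it
        }
    }
    mcp_params = transform_openai_tool_call_request_to_mcp_tool_call_request(
        openai_tool=openai_tool
    )
    # expected: mcp_params.name == "get_current_weather"
    #           mcp_params.arguments == {"location": "Paris"}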

View file

@ -138,7 +138,6 @@ def create_fine_tuning_job(
# OpenAI
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
@ -360,7 +359,6 @@ def cancel_fine_tuning_job(
# OpenAI
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
@ -522,7 +520,6 @@ def list_fine_tuning_jobs(
# OpenAI
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base

View file

@ -19,7 +19,6 @@ else:
def squash_payloads(queue):
squashed = {}
if len(queue) == 0:
return squashed

View file

@ -195,13 +195,16 @@ class SlackAlerting(CustomBatchLogger):
if self.alerting is None or self.alert_types is None:
return
time_difference_float, model, api_base, messages = (
self._response_taking_too_long_callback_helper(
(
time_difference_float,
model,
api_base,
messages,
) = self._response_taking_too_long_callback_helper(
kwargs=kwargs,
start_time=start_time,
end_time=end_time,
)
)
if litellm.turn_off_message_logging or litellm.redact_messages_in_exceptions:
messages = "Message not logged. litellm.redact_messages_in_exceptions=True"
request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
@ -819,9 +822,9 @@ class SlackAlerting(CustomBatchLogger):
### UNIQUE CACHE KEY ###
cache_key = provider + region_name
outage_value: Optional[ProviderRegionOutageModel] = (
await self.internal_usage_cache.async_get_cache(key=cache_key)
)
outage_value: Optional[
ProviderRegionOutageModel
] = await self.internal_usage_cache.async_get_cache(key=cache_key)
if (
getattr(exception, "status_code", None) is None
@ -1402,9 +1405,9 @@ Model Info:
self.alert_to_webhook_url is not None
and alert_type in self.alert_to_webhook_url
):
slack_webhook_url: Optional[Union[str, List[str]]] = (
self.alert_to_webhook_url[alert_type]
)
slack_webhook_url: Optional[
Union[str, List[str]]
] = self.alert_to_webhook_url[alert_type]
elif self.default_webhook_url is not None:
slack_webhook_url = self.default_webhook_url
else:
@ -1768,7 +1771,6 @@ Model Info:
- Team Created, Updated, Deleted
"""
try:
message = f"`{event_name}`\n"
key_event_dict = key_event.model_dump()

View file

@ -98,7 +98,6 @@ class ArgillaLogger(CustomBatchLogger):
argilla_dataset_name: Optional[str],
argilla_base_url: Optional[str],
) -> ArgillaCredentialsObject:
_credentials_api_key = argilla_api_key or os.getenv("ARGILLA_API_KEY")
if _credentials_api_key is None:
raise Exception("Invalid Argilla API Key given. _credentials_api_key=None.")

View file

@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Optional, Union
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
@ -7,7 +7,7 @@ from litellm.types.utils import StandardLoggingPayload
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
Span = _Span
Span = Union[_Span, Any]
else:
Span = Any

View file

@ -19,14 +19,13 @@ if TYPE_CHECKING:
from litellm.types.integrations.arize import Protocol as _Protocol
Protocol = _Protocol
Span = _Span
Span = Union[_Span, Any]
else:
Protocol = Any
Span = Any
class ArizeLogger(OpenTelemetry):
def set_attributes(self, span: Span, kwargs, response_obj: Optional[Any]):
ArizeLogger.set_arize_attributes(span, kwargs, response_obj)
return

View file

@ -1,17 +1,20 @@
import os
from typing import TYPE_CHECKING, Any
from litellm.integrations.arize import _utils
from typing import TYPE_CHECKING, Any, Union
from litellm._logging import verbose_logger
from litellm.integrations.arize import _utils
from litellm.types.integrations.arize_phoenix import ArizePhoenixConfig
if TYPE_CHECKING:
from .opentelemetry import OpenTelemetryConfig as _OpenTelemetryConfig
from litellm.types.integrations.arize import Protocol as _Protocol
from opentelemetry.trace import Span as _Span
from litellm.types.integrations.arize import Protocol as _Protocol
from .opentelemetry import OpenTelemetryConfig as _OpenTelemetryConfig
Protocol = _Protocol
OpenTelemetryConfig = _OpenTelemetryConfig
Span = _Span
Span = Union[_Span, Any]
else:
Protocol = Any
OpenTelemetryConfig = Any
@ -20,6 +23,7 @@ else:
ARIZE_HOSTED_PHOENIX_ENDPOINT = "https://app.phoenix.arize.com/v1/traces"
class ArizePhoenixLogger:
@staticmethod
def set_arize_phoenix_attributes(span: Span, kwargs, response_obj):
@ -59,15 +63,14 @@ class ArizePhoenixLogger:
# a slightly different auth header format than self hosted phoenix
if endpoint == ARIZE_HOSTED_PHOENIX_ENDPOINT:
if api_key is None:
raise ValueError("PHOENIX_API_KEY must be set when the Arize hosted Phoenix endpoint is used.")
raise ValueError(
"PHOENIX_API_KEY must be set when the Arize hosted Phoenix endpoint is used."
)
otlp_auth_headers = f"api_key={api_key}"
elif api_key is not None:
# api_key/auth is optional for self hosted phoenix
otlp_auth_headers = f"Authorization=Bearer {api_key}"
return ArizePhoenixConfig(
otlp_auth_headers=otlp_auth_headers,
protocol=protocol,
endpoint=endpoint
otlp_auth_headers=otlp_auth_headers, protocol=protocol, endpoint=endpoint
)

View file

@ -12,7 +12,10 @@ class AthinaLogger:
"athina-api-key": self.athina_api_key,
"Content-Type": "application/json",
}
self.athina_logging_url = os.getenv("ATHINA_BASE_URL", "https://log.athina.ai") + "/api/v1/log/inference"
self.athina_logging_url = (
os.getenv("ATHINA_BASE_URL", "https://log.athina.ai")
+ "/api/v1/log/inference"
)
self.additional_keys = [
"environment",
"prompt_slug",

View file

@ -50,12 +50,12 @@ class AzureBlobStorageLogger(CustomBatchLogger):
self.azure_storage_file_system: str = _azure_storage_file_system
# Internal variables used for Token based authentication
self.azure_auth_token: Optional[str] = (
None # the Azure AD token to use for Azure Storage API requests
)
self.token_expiry: Optional[datetime] = (
None # the expiry time of the current Azure AD token
)
self.azure_auth_token: Optional[
str
] = None # the Azure AD token to use for Azure Storage API requests
self.token_expiry: Optional[
datetime
] = None # the expiry time of the current Azure AD token
asyncio.create_task(self.periodic_flush())
self.flush_lock = asyncio.Lock()
@ -153,7 +153,6 @@ class AzureBlobStorageLogger(CustomBatchLogger):
3. Flush the data
"""
try:
if self.azure_storage_account_key:
await self.upload_to_azure_data_lake_with_azure_account_key(
payload=payload

View file

@ -4,7 +4,7 @@
import copy
import os
from datetime import datetime
from typing import Optional, Dict
from typing import Dict, Optional
import httpx
from pydantic import BaseModel
@ -19,7 +19,9 @@ from litellm.llms.custom_httpx.http_handler import (
)
from litellm.utils import print_verbose
global_braintrust_http_handler = get_async_httpx_client(llm_provider=httpxSpecialProvider.LoggingCallback)
global_braintrust_http_handler = get_async_httpx_client(
llm_provider=httpxSpecialProvider.LoggingCallback
)
global_braintrust_sync_http_handler = HTTPHandler()
API_BASE = "https://api.braintrustdata.com/v1"
@ -35,7 +37,9 @@ def get_utc_datetime():
class BraintrustLogger(CustomLogger):
def __init__(self, api_key: Optional[str] = None, api_base: Optional[str] = None) -> None:
def __init__(
self, api_key: Optional[str] = None, api_base: Optional[str] = None
) -> None:
super().__init__()
self.validate_environment(api_key=api_key)
self.api_base = api_base or API_BASE
@ -45,7 +49,9 @@ class BraintrustLogger(CustomLogger):
"Authorization": "Bearer " + self.api_key,
"Content-Type": "application/json",
}
self._project_id_cache: Dict[str, str] = {} # Cache mapping project names to IDs
self._project_id_cache: Dict[
str, str
] = {} # Cache mapping project names to IDs
def validate_environment(self, api_key: Optional[str]):
"""
@ -71,7 +77,9 @@ class BraintrustLogger(CustomLogger):
try:
response = global_braintrust_sync_http_handler.post(
f"{self.api_base}/project", headers=self.headers, json={"name": project_name}
f"{self.api_base}/project",
headers=self.headers,
json={"name": project_name},
)
project_dict = response.json()
project_id = project_dict["id"]
@ -89,7 +97,9 @@ class BraintrustLogger(CustomLogger):
try:
response = await global_braintrust_http_handler.post(
f"{self.api_base}/project/register", headers=self.headers, json={"name": project_name}
f"{self.api_base}/project/register",
headers=self.headers,
json={"name": project_name},
)
project_dict = response.json()
project_id = project_dict["id"]
@ -116,15 +126,21 @@ class BraintrustLogger(CustomLogger):
if metadata is None:
metadata = {}
proxy_headers = litellm_params.get("proxy_server_request", {}).get("headers", {}) or {}
proxy_headers = (
litellm_params.get("proxy_server_request", {}).get("headers", {}) or {}
)
for metadata_param_key in proxy_headers:
if metadata_param_key.startswith("braintrust"):
trace_param_key = metadata_param_key.replace("braintrust", "", 1)
if trace_param_key in metadata:
verbose_logger.warning(f"Overwriting Braintrust `{trace_param_key}` from request header")
verbose_logger.warning(
f"Overwriting Braintrust `{trace_param_key}` from request header"
)
else:
verbose_logger.debug(f"Found Braintrust `{trace_param_key}` in request header")
verbose_logger.debug(
f"Found Braintrust `{trace_param_key}` in request header"
)
metadata[trace_param_key] = proxy_headers.get(metadata_param_key)
return metadata
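The header-to-metadata mapping above can be mirrored in a small standalone sketch (the header names are made up for the example):

    proxy_headers = {"braintrust_span_attr": "checkout-flow", "x-api-key": "ignored"}
    metadata: dict = {}
    for header_key, header_value in proxy_headers.items():
        if header_key.startswith("braintrust"):
            trace_param_key = header_key.replace("braintrust", "", 1)
            metadata[trace_param_key] = header_value
    assert metadata == {"_span_attr": "checkout-flow"}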
@ -157,24 +173,35 @@ class BraintrustLogger(CustomLogger):
output = None
choices = []
if response_obj is not None and (
kwargs.get("call_type", None) == "embedding" or isinstance(response_obj, litellm.EmbeddingResponse)
kwargs.get("call_type", None) == "embedding"
or isinstance(response_obj, litellm.EmbeddingResponse)
):
output = None
elif response_obj is not None and isinstance(response_obj, litellm.ModelResponse):
elif response_obj is not None and isinstance(
response_obj, litellm.ModelResponse
):
output = response_obj["choices"][0]["message"].json()
choices = response_obj["choices"]
elif response_obj is not None and isinstance(response_obj, litellm.TextCompletionResponse):
elif response_obj is not None and isinstance(
response_obj, litellm.TextCompletionResponse
):
output = response_obj.choices[0].text
choices = response_obj.choices
elif response_obj is not None and isinstance(response_obj, litellm.ImageResponse):
elif response_obj is not None and isinstance(
response_obj, litellm.ImageResponse
):
output = response_obj["data"]
litellm_params = kwargs.get("litellm_params", {})
metadata = litellm_params.get("metadata", {}) or {} # if litellm_params['metadata'] == None
metadata = (
litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None
metadata = self.add_metadata_from_header(litellm_params, metadata)
clean_metadata = {}
try:
metadata = copy.deepcopy(metadata) # Avoid modifying the original metadata
metadata = copy.deepcopy(
metadata
) # Avoid modifying the original metadata
except Exception:
new_metadata = {}
for key, value in metadata.items():
@ -192,7 +219,9 @@ class BraintrustLogger(CustomLogger):
project_id = metadata.get("project_id")
if project_id is None:
project_name = metadata.get("project_name")
project_id = self.get_project_id_sync(project_name) if project_name else None
project_id = (
self.get_project_id_sync(project_name) if project_name else None
)
if project_id is None:
if self.default_project_id is None:
@ -234,7 +263,8 @@ class BraintrustLogger(CustomLogger):
"completion_tokens": usage_obj.completion_tokens,
"total_tokens": usage_obj.total_tokens,
"total_cost": cost,
"time_to_first_token": end_time.timestamp() - start_time.timestamp(),
"time_to_first_token": end_time.timestamp()
- start_time.timestamp(),
"start": start_time.timestamp(),
"end": end_time.timestamp(),
}
@ -255,7 +285,9 @@ class BraintrustLogger(CustomLogger):
request_data["metrics"] = metrics
try:
print_verbose(f"global_braintrust_sync_http_handler.post: {global_braintrust_sync_http_handler.post}")
print_verbose(
f"global_braintrust_sync_http_handler.post: {global_braintrust_sync_http_handler.post}"
)
global_braintrust_sync_http_handler.post(
url=f"{self.api_base}/project_logs/{project_id}/insert",
json={"events": [request_data]},
@ -276,20 +308,29 @@ class BraintrustLogger(CustomLogger):
output = None
choices = []
if response_obj is not None and (
kwargs.get("call_type", None) == "embedding" or isinstance(response_obj, litellm.EmbeddingResponse)
kwargs.get("call_type", None) == "embedding"
or isinstance(response_obj, litellm.EmbeddingResponse)
):
output = None
elif response_obj is not None and isinstance(response_obj, litellm.ModelResponse):
elif response_obj is not None and isinstance(
response_obj, litellm.ModelResponse
):
output = response_obj["choices"][0]["message"].json()
choices = response_obj["choices"]
elif response_obj is not None and isinstance(response_obj, litellm.TextCompletionResponse):
elif response_obj is not None and isinstance(
response_obj, litellm.TextCompletionResponse
):
output = response_obj.choices[0].text
choices = response_obj.choices
elif response_obj is not None and isinstance(response_obj, litellm.ImageResponse):
elif response_obj is not None and isinstance(
response_obj, litellm.ImageResponse
):
output = response_obj["data"]
litellm_params = kwargs.get("litellm_params", {})
metadata = litellm_params.get("metadata", {}) or {} # if litellm_params['metadata'] == None
metadata = (
litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None
metadata = self.add_metadata_from_header(litellm_params, metadata)
clean_metadata = {}
new_metadata = {}
@ -313,7 +354,11 @@ class BraintrustLogger(CustomLogger):
project_id = metadata.get("project_id")
if project_id is None:
project_name = metadata.get("project_name")
project_id = await self.get_project_id_async(project_name) if project_name else None
project_id = (
await self.get_project_id_async(project_name)
if project_name
else None
)
if project_id is None:
if self.default_project_id is None:
@ -362,8 +407,14 @@ class BraintrustLogger(CustomLogger):
api_call_start_time = kwargs.get("api_call_start_time")
completion_start_time = kwargs.get("completion_start_time")
if api_call_start_time is not None and completion_start_time is not None:
metrics["time_to_first_token"] = completion_start_time.timestamp() - api_call_start_time.timestamp()
if (
api_call_start_time is not None
and completion_start_time is not None
):
metrics["time_to_first_token"] = (
completion_start_time.timestamp()
- api_call_start_time.timestamp()
)
request_data = {
"id": litellm_call_id,

View file

@ -14,7 +14,6 @@ from litellm.integrations.custom_logger import CustomLogger
class CustomBatchLogger(CustomLogger):
def __init__(
self,
flush_lock: Optional[asyncio.Lock] = None,

View file

@ -7,7 +7,6 @@ from litellm.types.utils import StandardLoggingGuardrailInformation
class CustomGuardrail(CustomLogger):
def __init__(
self,
guardrail_name: Optional[str] = None,

View file

@ -31,7 +31,7 @@ from litellm.types.utils import (
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
Span = _Span
Span = Union[_Span, Any]
else:
Span = Any

View file

@ -233,7 +233,6 @@ class DataDogLogger(
pass
async def _log_async_event(self, kwargs, response_obj, start_time, end_time):
dd_payload = self.create_datadog_logging_payload(
kwargs=kwargs,
response_obj=response_obj,

View file

@ -125,9 +125,9 @@ class GCSBucketBase(CustomBatchLogger):
if kwargs is None:
kwargs = {}
standard_callback_dynamic_params: Optional[StandardCallbackDynamicParams] = (
kwargs.get("standard_callback_dynamic_params", None)
)
standard_callback_dynamic_params: Optional[
StandardCallbackDynamicParams
] = kwargs.get("standard_callback_dynamic_params", None)
bucket_name: str
path_service_account: Optional[str]

View file

@ -70,13 +70,14 @@ class GcsPubSubLogger(CustomBatchLogger):
"""Construct authorization headers using Vertex AI auth"""
from litellm import vertex_chat_completion
_auth_header, vertex_project = (
await vertex_chat_completion._ensure_access_token_async(
(
_auth_header,
vertex_project,
) = await vertex_chat_completion._ensure_access_token_async(
credentials=self.path_service_account_json,
project_id=None,
custom_llm_provider="vertex_ai",
)
)
auth_header, _ = vertex_chat_completion._get_token_and_url(
model="pub-sub",

View file

@ -155,11 +155,7 @@ class HumanloopLogger(CustomLogger):
prompt_id: str,
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[
str,
List[AllMessageValues],
dict,
]:
) -> Tuple[str, List[AllMessageValues], dict,]:
humanloop_api_key = dynamic_callback_params.get(
"humanloop_api_key"
) or get_secret_str("HUMANLOOP_API_KEY")

View file

@ -471,9 +471,9 @@ class LangFuseLogger:
# we clean out all extra litellm metadata params before logging
clean_metadata: Dict[str, Any] = {}
if prompt_management_metadata is not None:
clean_metadata["prompt_management_metadata"] = (
prompt_management_metadata
)
clean_metadata[
"prompt_management_metadata"
] = prompt_management_metadata
if isinstance(metadata, dict):
for key, value in metadata.items():
# generate langfuse tags - Default Tags sent to Langfuse from LiteLLM Proxy

View file

@ -19,7 +19,6 @@ else:
class LangFuseHandler:
@staticmethod
def get_langfuse_logger_for_request(
standard_callback_dynamic_params: StandardCallbackDynamicParams,
@ -87,7 +86,9 @@ class LangFuseHandler:
if globalLangfuseLogger is not None:
return globalLangfuseLogger
credentials_dict: Dict[str, Any] = (
credentials_dict: Dict[
str, Any
] = (
{}
) # the global langfuse logger uses Environment Variables, there are no dynamic credentials
globalLangfuseLogger = in_memory_dynamic_logger_cache.get_cache(

View file

@ -172,11 +172,7 @@ class LangfusePromptManagement(LangFuseLogger, PromptManagementBase, CustomLogge
prompt_id: str,
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[
str,
List[AllMessageValues],
dict,
]:
) -> Tuple[str, List[AllMessageValues], dict,]:
return self.get_chat_completion_prompt(
model,
messages,

View file

@ -75,7 +75,6 @@ class LangsmithLogger(CustomBatchLogger):
langsmith_project: Optional[str] = None,
langsmith_base_url: Optional[str] = None,
) -> LangsmithCredentialsObject:
_credentials_api_key = langsmith_api_key or os.getenv("LANGSMITH_API_KEY")
if _credentials_api_key is None:
raise Exception(
@ -443,9 +442,9 @@ class LangsmithLogger(CustomBatchLogger):
Otherwise, use the default credentials.
"""
standard_callback_dynamic_params: Optional[StandardCallbackDynamicParams] = (
kwargs.get("standard_callback_dynamic_params", None)
)
standard_callback_dynamic_params: Optional[
StandardCallbackDynamicParams
] = kwargs.get("standard_callback_dynamic_params", None)
if standard_callback_dynamic_params is not None:
credentials = self.get_credentials_from_env(
langsmith_api_key=standard_callback_dynamic_params.get(
@ -481,7 +480,6 @@ class LangsmithLogger(CustomBatchLogger):
asyncio.run(self.async_send_batch())
def get_run_by_id(self, run_id):
langsmith_api_key = self.default_credentials["LANGSMITH_API_KEY"]
langsmith_api_base = self.default_credentials["LANGSMITH_BASE_URL"]

View file

@ -1,12 +1,12 @@
import json
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, Union
from litellm.proxy._types import SpanAttributes
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
Span = _Span
Span = Union[_Span, Any]
else:
Span = Any

View file

@ -20,7 +20,6 @@ def parse_tool_calls(tool_calls):
return None
def clean_tool_call(tool_call):
serialized = {
"type": tool_call.type,
"id": tool_call.id,
@ -36,7 +35,6 @@ def parse_tool_calls(tool_calls):
def parse_messages(input):
if input is None:
return None

View file

@ -48,14 +48,17 @@ class MlflowLogger(CustomLogger):
def _extract_and_set_chat_attributes(self, span, kwargs, response_obj):
try:
from mlflow.tracing.utils import set_span_chat_messages, set_span_chat_tools
from mlflow.tracing.utils import set_span_chat_messages # type: ignore
from mlflow.tracing.utils import set_span_chat_tools # type: ignore
except ImportError:
return
inputs = self._construct_input(kwargs)
input_messages = inputs.get("messages", [])
output_messages = [c.message.model_dump(exclude_none=True)
for c in getattr(response_obj, "choices", [])]
output_messages = [
c.message.model_dump(exclude_none=True)
for c in getattr(response_obj, "choices", [])
]
if messages := [*input_messages, *output_messages]:
set_span_chat_messages(span, messages)
if tools := inputs.get("tools"):

View file

@ -1,7 +1,7 @@
import os
from dataclasses import dataclass
from datetime import datetime
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast
import litellm
from litellm._logging import verbose_logger
@ -23,10 +23,10 @@ if TYPE_CHECKING:
)
from litellm.proxy.proxy_server import UserAPIKeyAuth as _UserAPIKeyAuth
Span = _Span
SpanExporter = _SpanExporter
UserAPIKeyAuth = _UserAPIKeyAuth
ManagementEndpointLoggingPayload = _ManagementEndpointLoggingPayload
Span = Union[_Span, Any]
SpanExporter = Union[_SpanExporter, Any]
UserAPIKeyAuth = Union[_UserAPIKeyAuth, Any]
ManagementEndpointLoggingPayload = Union[_ManagementEndpointLoggingPayload, Any]
else:
Span = Any
SpanExporter = Any
@ -46,7 +46,6 @@ LITELLM_REQUEST_SPAN_NAME = "litellm_request"
@dataclass
class OpenTelemetryConfig:
exporter: Union[str, SpanExporter] = "console"
endpoint: Optional[str] = None
headers: Optional[str] = None
@ -154,7 +153,6 @@ class OpenTelemetry(CustomLogger):
end_time: Optional[Union[datetime, float]] = None,
event_metadata: Optional[dict] = None,
):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
@ -215,7 +213,6 @@ class OpenTelemetry(CustomLogger):
end_time: Optional[Union[float, datetime]] = None,
event_metadata: Optional[dict] = None,
):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
@ -353,9 +350,9 @@ class OpenTelemetry(CustomLogger):
"""
from opentelemetry import trace
standard_callback_dynamic_params: Optional[StandardCallbackDynamicParams] = (
kwargs.get("standard_callback_dynamic_params")
)
standard_callback_dynamic_params: Optional[
StandardCallbackDynamicParams
] = kwargs.get("standard_callback_dynamic_params")
if not standard_callback_dynamic_params:
return
@ -722,7 +719,6 @@ class OpenTelemetry(CustomLogger):
span.set_attribute(key, primitive_value)
def set_raw_request_attributes(self, span: Span, kwargs, response_obj):
kwargs.get("optional_params", {})
litellm_params = kwargs.get("litellm_params", {}) or {}
custom_llm_provider = litellm_params.get("custom_llm_provider", "Unknown")
@ -843,12 +839,14 @@ class OpenTelemetry(CustomLogger):
headers=dynamic_headers or self.OTEL_HEADERS
)
if isinstance(self.OTEL_EXPORTER, SpanExporter):
if hasattr(
self.OTEL_EXPORTER, "export"
): # Check if it has the export method that SpanExporter requires
verbose_logger.debug(
"OpenTelemetry: intiializing SpanExporter. Value of OTEL_EXPORTER: %s",
self.OTEL_EXPORTER,
)
return SimpleSpanProcessor(self.OTEL_EXPORTER)
return SimpleSpanProcessor(cast(SpanExporter, self.OTEL_EXPORTER))
if self.OTEL_EXPORTER == "console":
verbose_logger.debug(
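The duck-typed hasattr check replaces a strict isinstance() test, so any object exposing an export() method is accepted. A minimal illustration with a hypothetical exporter class (not a real OpenTelemetry type):

    class MyCustomExporter:
        def export(self, spans):          # the only capability the check above looks for
            return None

    exporter = MyCustomExporter()
    assert hasattr(exporter, "export")    # accepted by the new check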
@ -907,7 +905,6 @@ class OpenTelemetry(CustomLogger):
logging_payload: ManagementEndpointLoggingPayload,
parent_otel_span: Optional[Span] = None,
):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
@ -961,7 +958,6 @@ class OpenTelemetry(CustomLogger):
logging_payload: ManagementEndpointLoggingPayload,
parent_otel_span: Optional[Span] = None,
):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode

View file

@ -185,7 +185,6 @@ class OpikLogger(CustomBatchLogger):
def _create_opik_payload( # noqa: PLR0915
self, kwargs, response_obj, start_time, end_time
) -> List[Dict]:
# Get metadata
_litellm_params = kwargs.get("litellm_params", {}) or {}
litellm_params_metadata = _litellm_params.get("metadata", {}) or {}

View file

@ -988,9 +988,9 @@ class PrometheusLogger(CustomLogger):
):
try:
verbose_logger.debug("setting remaining tokens requests metric")
standard_logging_payload: Optional[StandardLoggingPayload] = (
request_kwargs.get("standard_logging_object")
)
standard_logging_payload: Optional[
StandardLoggingPayload
] = request_kwargs.get("standard_logging_object")
if standard_logging_payload is None:
return

View file

@ -14,7 +14,6 @@ class PromptManagementClient(TypedDict):
class PromptManagementBase(ABC):
@property
@abstractmethod
def integration_name(self) -> str:
@ -83,11 +82,7 @@ class PromptManagementBase(ABC):
prompt_id: str,
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[
str,
List[AllMessageValues],
dict,
]:
) -> Tuple[str, List[AllMessageValues], dict,]:
if not self.should_run_prompt_management(
prompt_id=prompt_id, dynamic_callback_params=dynamic_callback_params
):

View file

@ -38,7 +38,7 @@ class S3Logger:
if litellm.s3_callback_params is not None:
# read in .env variables - example os.environ/AWS_BUCKET_NAME
for key, value in litellm.s3_callback_params.items():
if type(value) is str and value.startswith("os.environ/"):
if isinstance(value, str) and value.startswith("os.environ/"):
litellm.s3_callback_params[key] = litellm.get_secret(value)
# now set s3 params from litellm.s3_logger_params
s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name")

View file

@ -21,11 +21,11 @@ try:
# contains a (known) object attribute
object: Literal["chat.completion", "edit", "text_completion"]
def __getitem__(self, key: K) -> V: ... # noqa
def __getitem__(self, key: K) -> V:
... # noqa
def get( # noqa
self, key: K, default: Optional[V] = None
) -> Optional[V]: ... # pragma: no cover
def get(self, key: K, default: Optional[V] = None) -> Optional[V]: # noqa
... # pragma: no cover
class OpenAIRequestResponseResolver:
def __call__(

View file

@ -10,7 +10,7 @@ from litellm.types.llms.openai import AllMessageValues
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
Span = _Span
Span = Union[_Span, Any]
else:
Span = Any

View file

@ -11,7 +11,9 @@ except (ImportError, AttributeError):
# Old way to access resources, which setuptools deprecated some time ago
import pkg_resources # type: ignore
filename = pkg_resources.resource_filename(__name__, "litellm_core_utils/tokenizers")
filename = pkg_resources.resource_filename(
__name__, "litellm_core_utils/tokenizers"
)
os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv(
"CUSTOM_TIKTOKEN_CACHE_DIR", filename

View file

@ -79,6 +79,22 @@ def get_supported_openai_params( # noqa: PLR0915
elif custom_llm_provider == "maritalk":
return litellm.MaritalkConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "openai":
if request_type == "transcription":
transcription_provider_config = (
litellm.ProviderConfigManager.get_provider_audio_transcription_config(
model=model, provider=LlmProviders.OPENAI
)
)
if isinstance(
transcription_provider_config, litellm.OpenAIGPTAudioTranscriptionConfig
):
return transcription_provider_config.get_supported_openai_params(
model=model
)
else:
raise ValueError(
f"Unsupported provider config: {transcription_provider_config} for model: {model}"
)
return litellm.OpenAIConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "azure":
if litellm.AzureOpenAIO1Config().is_o_series_model(model=model):
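A hedged usage sketch of the new transcription routing; the model name is illustrative, and the call assumes litellm.get_supported_openai_params keeps its existing model / custom_llm_provider / request_type signature:

    import litellm

    transcription_params = litellm.get_supported_openai_params(
        model="whisper-1",                 # illustrative model name
        custom_llm_provider="openai",
        request_type="transcription",      # routed through the transcription config branch above
    )
    print(transcription_params)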

View file

@ -67,6 +67,7 @@ from litellm.types.utils import (
StandardCallbackDynamicParams,
StandardLoggingAdditionalHeaders,
StandardLoggingHiddenParams,
StandardLoggingMCPToolCall,
StandardLoggingMetadata,
StandardLoggingModelCostFailureDebugInformation,
StandardLoggingModelInformation,
@ -239,9 +240,9 @@ class Logging(LiteLLMLoggingBaseClass):
self.litellm_trace_id = litellm_trace_id
self.function_id = function_id
self.streaming_chunks: List[Any] = [] # for generating complete stream response
self.sync_streaming_chunks: List[Any] = (
[]
) # for generating complete stream response
self.sync_streaming_chunks: List[
Any
] = [] # for generating complete stream response
self.log_raw_request_response = log_raw_request_response
# Initialize dynamic callbacks
@ -452,11 +453,13 @@ class Logging(LiteLLMLoggingBaseClass):
prompt_id: str,
prompt_variables: Optional[dict],
) -> Tuple[str, List[AllMessageValues], dict]:
custom_logger = self.get_custom_logger_for_prompt_management(model)
if custom_logger:
model, messages, non_default_params = (
custom_logger.get_chat_completion_prompt(
(
model,
messages,
non_default_params,
) = custom_logger.get_chat_completion_prompt(
model=model,
messages=messages,
non_default_params=non_default_params,
@ -464,7 +467,6 @@ class Logging(LiteLLMLoggingBaseClass):
prompt_variables=prompt_variables,
dynamic_callback_params=self.standard_callback_dynamic_params,
)
)
self.messages = messages
return model, messages, non_default_params
@ -541,12 +543,11 @@ class Logging(LiteLLMLoggingBaseClass):
model
): # if model name was changes pre-call, overwrite the initial model call name with the new one
self.model_call_details["model"] = model
self.model_call_details["litellm_params"]["api_base"] = (
self._get_masked_api_base(additional_args.get("api_base", ""))
)
self.model_call_details["litellm_params"][
"api_base"
] = self._get_masked_api_base(additional_args.get("api_base", ""))
def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915
# Log the exact input to the LLM API
litellm.error_logs["PRE_CALL"] = locals()
try:
@ -568,19 +569,16 @@ class Logging(LiteLLMLoggingBaseClass):
self.log_raw_request_response is True
or log_raw_request_response is True
):
_litellm_params = self.model_call_details.get("litellm_params", {})
_metadata = _litellm_params.get("metadata", {}) or {}
try:
# [Non-blocking Extra Debug Information in metadata]
if turn_off_message_logging is True:
_metadata["raw_request"] = (
"redacted by litellm. \
_metadata[
"raw_request"
] = "redacted by litellm. \
'litellm.turn_off_message_logging=True'"
)
else:
curl_command = self._get_request_curl_command(
api_base=additional_args.get("api_base", ""),
headers=additional_args.get("headers", {}),
@ -590,8 +588,9 @@ class Logging(LiteLLMLoggingBaseClass):
_metadata["raw_request"] = str(curl_command)
# split up, so it's easier to parse in the UI
self.model_call_details["raw_request_typed_dict"] = (
RawRequestTypedDict(
self.model_call_details[
"raw_request_typed_dict"
] = RawRequestTypedDict(
raw_request_api_base=str(
additional_args.get("api_base") or ""
),
@ -604,20 +603,19 @@ class Logging(LiteLLMLoggingBaseClass):
),
error=None,
)
)
except Exception as e:
self.model_call_details["raw_request_typed_dict"] = (
RawRequestTypedDict(
self.model_call_details[
"raw_request_typed_dict"
] = RawRequestTypedDict(
error=str(e),
)
)
traceback.print_exc()
_metadata["raw_request"] = (
"Unable to Log \
_metadata[
"raw_request"
] = "Unable to Log \
raw request: {}".format(
str(e)
)
)
if self.logger_fn and callable(self.logger_fn):
try:
self.logger_fn(
@ -941,9 +939,9 @@ class Logging(LiteLLMLoggingBaseClass):
verbose_logger.debug(
f"response_cost_failure_debug_information: {debug_info}"
)
self.model_call_details["response_cost_failure_debug_information"] = (
debug_info
)
self.model_call_details[
"response_cost_failure_debug_information"
] = debug_info
return None
try:
@ -968,9 +966,9 @@ class Logging(LiteLLMLoggingBaseClass):
verbose_logger.debug(
f"response_cost_failure_debug_information: {debug_info}"
)
self.model_call_details["response_cost_failure_debug_information"] = (
debug_info
)
self.model_call_details[
"response_cost_failure_debug_information"
] = debug_info
return None
@ -995,7 +993,6 @@ class Logging(LiteLLMLoggingBaseClass):
def should_run_callback(
self, callback: litellm.CALLBACK_TYPES, litellm_params: dict, event_hook: str
) -> bool:
if litellm.global_disable_no_log_param:
return True
@ -1027,9 +1024,9 @@ class Logging(LiteLLMLoggingBaseClass):
end_time = datetime.datetime.now()
if self.completion_start_time is None:
self.completion_start_time = end_time
self.model_call_details["completion_start_time"] = (
self.completion_start_time
)
self.model_call_details[
"completion_start_time"
] = self.completion_start_time
self.model_call_details["log_event_type"] = "successful_api_call"
self.model_call_details["end_time"] = end_time
self.model_call_details["cache_hit"] = cache_hit
@ -1083,13 +1080,14 @@ class Logging(LiteLLMLoggingBaseClass):
"response_cost"
]
else:
self.model_call_details["response_cost"] = (
self._response_cost_calculator(result=result)
)
self.model_call_details[
"response_cost"
] = self._response_cost_calculator(result=result)
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=result,
start_time=start_time,
@ -1098,11 +1096,11 @@ class Logging(LiteLLMLoggingBaseClass):
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
elif isinstance(result, dict): # pass-through endpoints
elif isinstance(result, dict) or isinstance(result, list):
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=result,
start_time=start_time,
@ -1111,11 +1109,10 @@ class Logging(LiteLLMLoggingBaseClass):
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
elif standard_logging_object is not None:
self.model_call_details["standard_logging_object"] = (
standard_logging_object
)
self.model_call_details[
"standard_logging_object"
] = standard_logging_object
else: # streaming chunks + image gen.
self.model_call_details["response_cost"] = None
@ -1154,7 +1151,6 @@ class Logging(LiteLLMLoggingBaseClass):
standard_logging_object=kwargs.get("standard_logging_object", None),
)
try:
## BUILD COMPLETE STREAMED RESPONSE
complete_streaming_response: Optional[
Union[ModelResponse, TextCompletionResponse, ResponsesAPIResponse]
@ -1172,15 +1168,16 @@ class Logging(LiteLLMLoggingBaseClass):
verbose_logger.debug(
"Logging Details LiteLLM-Success Call streaming complete"
)
self.model_call_details["complete_streaming_response"] = (
complete_streaming_response
)
self.model_call_details["response_cost"] = (
self._response_cost_calculator(result=complete_streaming_response)
)
self.model_call_details[
"complete_streaming_response"
] = complete_streaming_response
self.model_call_details[
"response_cost"
] = self._response_cost_calculator(result=complete_streaming_response)
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=complete_streaming_response,
start_time=start_time,
@ -1189,7 +1186,6 @@ class Logging(LiteLLMLoggingBaseClass):
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
callbacks = self.get_combined_callback_list(
dynamic_success_callbacks=self.dynamic_success_callbacks,
global_callbacks=litellm.success_callback,
@ -1207,7 +1203,6 @@ class Logging(LiteLLMLoggingBaseClass):
## LOGGING HOOK ##
for callback in callbacks:
if isinstance(callback, CustomLogger):
self.model_call_details, result = callback.logging_hook(
kwargs=self.model_call_details,
result=result,
@ -1538,11 +1533,11 @@ class Logging(LiteLLMLoggingBaseClass):
)
else:
if self.stream and complete_streaming_response:
self.model_call_details["complete_response"] = (
self.model_call_details.get(
self.model_call_details[
"complete_response"
] = self.model_call_details.get(
"complete_streaming_response", {}
)
)
result = self.model_call_details["complete_response"]
openMeterLogger.log_success_event(
kwargs=self.model_call_details,
@ -1581,11 +1576,11 @@ class Logging(LiteLLMLoggingBaseClass):
)
else:
if self.stream and complete_streaming_response:
self.model_call_details["complete_response"] = (
self.model_call_details.get(
self.model_call_details[
"complete_response"
] = self.model_call_details.get(
"complete_streaming_response", {}
)
)
result = self.model_call_details["complete_response"]
callback.log_success_event(
@ -1659,7 +1654,6 @@ class Logging(LiteLLMLoggingBaseClass):
if self.call_type == CallTypes.aretrieve_batch.value and isinstance(
result, LiteLLMBatch
):
response_cost, batch_usage, batch_models = await _handle_completed_batch(
batch=result, custom_llm_provider=self.custom_llm_provider
)
@ -1692,9 +1686,9 @@ class Logging(LiteLLMLoggingBaseClass):
if complete_streaming_response is not None:
print_verbose("Async success callbacks: Got a complete streaming response")
self.model_call_details["async_complete_streaming_response"] = (
complete_streaming_response
)
self.model_call_details[
"async_complete_streaming_response"
] = complete_streaming_response
try:
if self.model_call_details.get("cache_hit", False) is True:
self.model_call_details["response_cost"] = 0.0
@ -1704,11 +1698,11 @@ class Logging(LiteLLMLoggingBaseClass):
model_call_details=self.model_call_details
)
# base_model defaults to None if not set on model_info
self.model_call_details["response_cost"] = (
self._response_cost_calculator(
self.model_call_details[
"response_cost"
] = self._response_cost_calculator(
result=complete_streaming_response
)
)
verbose_logger.debug(
f"Model={self.model}; cost={self.model_call_details['response_cost']}"
@ -1720,8 +1714,9 @@ class Logging(LiteLLMLoggingBaseClass):
self.model_call_details["response_cost"] = None
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=complete_streaming_response,
start_time=start_time,
@ -1730,7 +1725,6 @@ class Logging(LiteLLMLoggingBaseClass):
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
callbacks = self.get_combined_callback_list(
dynamic_success_callbacks=self.dynamic_async_success_callbacks,
global_callbacks=litellm._async_success_callback,
@ -1935,8 +1929,9 @@ class Logging(LiteLLMLoggingBaseClass):
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj={},
start_time=start_time,
@ -1947,7 +1942,6 @@ class Logging(LiteLLMLoggingBaseClass):
original_exception=exception,
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
return start_time, end_time
async def special_failure_handlers(self, exception: Exception):
@ -2084,7 +2078,6 @@ class Logging(LiteLLMLoggingBaseClass):
)
is not True
): # custom logger class
callback.log_failure_event(
start_time=start_time,
end_time=end_time,
@ -2713,9 +2706,9 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
endpoint=arize_config.endpoint,
)
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
f"space_key={arize_config.space_key},api_key={arize_config.api_key}"
)
os.environ[
"OTEL_EXPORTER_OTLP_TRACES_HEADERS"
] = f"space_key={arize_config.space_key},api_key={arize_config.api_key}"
for callback in _in_memory_loggers:
if (
isinstance(callback, ArizeLogger)
@ -2739,9 +2732,9 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
# auth can be disabled on local deployments of arize phoenix
if arize_phoenix_config.otlp_auth_headers is not None:
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
arize_phoenix_config.otlp_auth_headers
)
os.environ[
"OTEL_EXPORTER_OTLP_TRACES_HEADERS"
] = arize_phoenix_config.otlp_auth_headers
for callback in _in_memory_loggers:
if (
@ -2832,9 +2825,9 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
exporter="otlp_http",
endpoint="https://langtrace.ai/api/trace",
)
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
f"api_key={os.getenv('LANGTRACE_API_KEY')}"
)
os.environ[
"OTEL_EXPORTER_OTLP_TRACES_HEADERS"
] = f"api_key={os.getenv('LANGTRACE_API_KEY')}"
for callback in _in_memory_loggers:
if (
isinstance(callback, OpenTelemetry)
@ -3114,6 +3107,7 @@ class StandardLoggingPayloadSetup:
litellm_params: Optional[dict] = None,
prompt_integration: Optional[str] = None,
applied_guardrails: Optional[List[str]] = None,
mcp_tool_call_metadata: Optional[StandardLoggingMCPToolCall] = None,
) -> StandardLoggingMetadata:
"""
Clean and filter the metadata dictionary to include only the specified keys in StandardLoggingMetadata.
@ -3160,6 +3154,7 @@ class StandardLoggingPayloadSetup:
user_api_key_end_user_id=None,
prompt_management_metadata=prompt_management_metadata,
applied_guardrails=applied_guardrails,
mcp_tool_call_metadata=mcp_tool_call_metadata,
)
if isinstance(metadata, dict):
# Filter the metadata dictionary to include only the specified keys
@ -3223,7 +3218,6 @@ class StandardLoggingPayloadSetup:
custom_llm_provider: Optional[str],
init_response_obj: Union[Any, BaseModel, dict],
) -> StandardLoggingModelInformation:
model_cost_name = _select_model_name_for_cost_calc(
model=None,
completion_response=init_response_obj, # type: ignore
@ -3286,7 +3280,6 @@ class StandardLoggingPayloadSetup:
def get_additional_headers(
additiona_headers: Optional[dict],
) -> Optional[StandardLoggingAdditionalHeaders]:
if additiona_headers is None:
return None
@ -3322,11 +3315,11 @@ class StandardLoggingPayloadSetup:
for key in StandardLoggingHiddenParams.__annotations__.keys():
if key in hidden_params:
if key == "additional_headers":
clean_hidden_params["additional_headers"] = (
StandardLoggingPayloadSetup.get_additional_headers(
clean_hidden_params[
"additional_headers"
] = StandardLoggingPayloadSetup.get_additional_headers(
hidden_params[key]
)
)
else:
clean_hidden_params[key] = hidden_params[key] # type: ignore
return clean_hidden_params
@ -3463,13 +3456,15 @@ def get_standard_logging_object_payload(
)
# cleanup timestamps
start_time_float, end_time_float, completion_start_time_float = (
StandardLoggingPayloadSetup.cleanup_timestamps(
(
start_time_float,
end_time_float,
completion_start_time_float,
) = StandardLoggingPayloadSetup.cleanup_timestamps(
start_time=start_time,
end_time=end_time,
completion_start_time=completion_start_time,
)
)
response_time = StandardLoggingPayloadSetup.get_response_time(
start_time_float=start_time_float,
end_time_float=end_time_float,
@ -3486,6 +3481,7 @@ def get_standard_logging_object_payload(
litellm_params=litellm_params,
prompt_integration=kwargs.get("prompt_integration", None),
applied_guardrails=kwargs.get("applied_guardrails", None),
mcp_tool_call_metadata=kwargs.get("mcp_tool_call_metadata", None),
)
_request_body = proxy_server_request.get("body", {})
@ -3495,7 +3491,6 @@ def get_standard_logging_object_payload(
saved_cache_cost: float = 0.0
if cache_hit is True:
id = f"{id}_cache_hit{time.time()}" # do not duplicate the request id
saved_cache_cost = (
logging_obj._response_cost_calculator(
@ -3626,6 +3621,7 @@ def get_standard_logging_metadata(
user_api_key_end_user_id=None,
prompt_management_metadata=None,
applied_guardrails=None,
mcp_tool_call_metadata=None,
)
if isinstance(metadata, dict):
# Filter the metadata dictionary to include only the specified keys
@ -3658,9 +3654,9 @@ def scrub_sensitive_keys_in_metadata(litellm_params: Optional[dict]):
):
for k, v in metadata["user_api_key_metadata"].items():
if k == "logging": # prevent logging user logging keys
cleaned_user_api_key_metadata[k] = (
"scrubbed_by_litellm_for_sensitive_keys"
)
cleaned_user_api_key_metadata[
k
] = "scrubbed_by_litellm_for_sensitive_keys"
else:
cleaned_user_api_key_metadata[k] = v

View file

@ -1,7 +1,7 @@
# What is this?
## Helper utilities for cost_per_token()
from typing import Optional, Tuple
from typing import Optional, Tuple, cast
import litellm
from litellm import verbose_logger
@ -121,6 +121,31 @@ def _get_completion_token_base_cost(model_info: ModelInfo, usage: Usage) -> floa
return model_info["output_cost_per_token"]
def calculate_cost_component(
model_info: ModelInfo, cost_key: str, usage_value: Optional[float]
) -> float:
"""
Generic cost calculator for any usage component
Args:
model_info: Dictionary containing cost information
cost_key: The key for the cost multiplier in model_info (e.g., 'input_cost_per_audio_token')
usage_value: The actual usage value (e.g., number of tokens, characters, seconds)
Returns:
float: The calculated cost
"""
cost_per_unit = model_info.get(cost_key)
if (
cost_per_unit is not None
and isinstance(cost_per_unit, float)
and usage_value is not None
and usage_value > 0
):
return float(usage_value) * cost_per_unit
return 0.0
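A quick worked example of the generic helper, using invented cost numbers rather than any real model's pricing:

    example_model_info = {
        "input_cost_per_audio_token": 0.000002,   # $ per audio token (illustrative)
        "cache_read_input_token_cost": None,      # missing/None components contribute 0.0
    }
    audio_cost = calculate_cost_component(
        model_info=example_model_info,
        cost_key="input_cost_per_audio_token",
        usage_value=500,
    )
    assert audio_cost == 500 * 0.000002           # ~0.001
    assert calculate_cost_component(example_model_info, "cache_read_input_token_cost", 500) == 0.0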
def generic_cost_per_token(
model: str, usage: Usage, custom_llm_provider: str
) -> Tuple[float, float]:
@ -136,6 +161,7 @@ def generic_cost_per_token(
Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
"""
## GET MODEL INFO
model_info = get_model_info(model=model, custom_llm_provider=custom_llm_provider)
@ -143,38 +169,124 @@ def generic_cost_per_token(
### Cost of processing (non-cache hit + cache hit) + Cost of cache-writing (cache writing)
prompt_cost = 0.0
### PROCESSING COST
non_cache_hit_tokens = usage.prompt_tokens
text_tokens = usage.prompt_tokens
cache_hit_tokens = 0
if usage.prompt_tokens_details and usage.prompt_tokens_details.cached_tokens:
cache_hit_tokens = usage.prompt_tokens_details.cached_tokens
non_cache_hit_tokens = non_cache_hit_tokens - cache_hit_tokens
audio_tokens = 0
character_count = 0
image_count = 0
video_length_seconds = 0
if usage.prompt_tokens_details:
cache_hit_tokens = (
cast(
Optional[int], getattr(usage.prompt_tokens_details, "cached_tokens", 0)
)
or 0
)
text_tokens = (
cast(
Optional[int], getattr(usage.prompt_tokens_details, "text_tokens", None)
)
or 0 # falls back to 0 here; recomputed from prompt_tokens in the edge case below if unset
)
audio_tokens = (
cast(Optional[int], getattr(usage.prompt_tokens_details, "audio_tokens", 0))
or 0
)
character_count = (
cast(
Optional[int],
getattr(usage.prompt_tokens_details, "character_count", 0),
)
or 0
)
image_count = (
cast(Optional[int], getattr(usage.prompt_tokens_details, "image_count", 0))
or 0
)
video_length_seconds = (
cast(
Optional[int],
getattr(usage.prompt_tokens_details, "video_length_seconds", 0),
)
or 0
)
## EDGE CASE - text tokens not set inside PromptTokensDetails
if text_tokens == 0:
text_tokens = usage.prompt_tokens - cache_hit_tokens - audio_tokens
prompt_base_cost = _get_prompt_token_base_cost(model_info=model_info, usage=usage)
prompt_cost = float(non_cache_hit_tokens) * prompt_base_cost
prompt_cost = float(text_tokens) * prompt_base_cost
_cache_read_input_token_cost = model_info.get("cache_read_input_token_cost")
if (
_cache_read_input_token_cost is not None
and usage.prompt_tokens_details
and usage.prompt_tokens_details.cached_tokens
):
prompt_cost += (
float(usage.prompt_tokens_details.cached_tokens)
* _cache_read_input_token_cost
### CACHE READ COST
prompt_cost += calculate_cost_component(
model_info, "cache_read_input_token_cost", cache_hit_tokens
)
### AUDIO COST
prompt_cost += calculate_cost_component(
model_info, "input_cost_per_audio_token", audio_tokens
)
### CACHE WRITING COST
_cache_creation_input_token_cost = model_info.get("cache_creation_input_token_cost")
if _cache_creation_input_token_cost is not None:
prompt_cost += (
float(usage._cache_creation_input_tokens) * _cache_creation_input_token_cost
prompt_cost += calculate_cost_component(
model_info,
"cache_creation_input_token_cost",
usage._cache_creation_input_tokens,
)
### CHARACTER COST
prompt_cost += calculate_cost_component(
model_info, "input_cost_per_character", character_count
)
### IMAGE COUNT COST
prompt_cost += calculate_cost_component(
model_info, "input_cost_per_image", image_count
)
### VIDEO LENGTH COST
prompt_cost += calculate_cost_component(
model_info, "input_cost_per_video_per_second", video_length_seconds
)
## CALCULATE OUTPUT COST
completion_base_cost = _get_completion_token_base_cost(
model_info=model_info, usage=usage
)
completion_cost = usage["completion_tokens"] * completion_base_cost
text_tokens = usage.completion_tokens
audio_tokens = 0
if usage.completion_tokens_details is not None:
audio_tokens = (
cast(
Optional[int],
getattr(usage.completion_tokens_details, "audio_tokens", 0),
)
or 0
)
text_tokens = (
cast(
Optional[int],
getattr(usage.completion_tokens_details, "text_tokens", None),
)
or usage.completion_tokens # default to completion tokens, if this field is not set
)
## TEXT COST
completion_cost = float(text_tokens) * completion_base_cost
_output_cost_per_audio_token: Optional[float] = model_info.get(
"output_cost_per_audio_token"
)
## AUDIO COST
if (
_output_cost_per_audio_token is not None
and audio_tokens is not None
and audio_tokens > 0
):
completion_cost += float(audio_tokens) * _output_cost_per_audio_token
return prompt_cost, completion_cost
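To make the prompt-side arithmetic concrete, a short sketch with invented prices and token counts (this is not any real model's pricing):

    # Worked example mirroring the prompt-side arithmetic above:
    input_cost_per_token = 0.00001
    cache_read_input_token_cost = 0.0000025
    input_cost_per_audio_token = 0.00002

    prompt_tokens, cache_hit_tokens, audio_tokens = 1000, 200, 100
    text_tokens = prompt_tokens - cache_hit_tokens - audio_tokens   # 700, the edge-case fallback
    prompt_cost = (
        text_tokens * input_cost_per_token
        + cache_hit_tokens * cache_read_input_token_cost
        + audio_tokens * input_cost_per_audio_token
    )
    print(round(prompt_cost, 8))   # 0.0095 with these invented numbers; character/image/video add 0.0 when unset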

Some files were not shown because too many files have changed in this diff.