Merge branch 'stevefarthing/bing-search-pass-thru' of github.com:sfarthin/litellm into stevefarthing/bing-search-pass-thru

# Conflicts: # litellm/proxy/pass_through_endpoints/pass_through_endpoints.py
2025-04-27 11:43:54 +00:00 · 2025-03-11 08:15:23 -04:00 · 2025-03-11 08:15:23 -04:00 · 198f1765bb
commit 198f1765bb
parent 227ae4d8b3 b79b126597
640 changed files with 45106 additions and 12331 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -1,6 +1,8 @@
 version: 2.1
 orbs:
  codecov: codecov/codecov@4.0.1
+  node: circleci/node@5.1.0  # Add this line to declare the node orb
+

 jobs:
  local_testing:
@ -70,6 +72,7 @@ jobs:
            pip install "jsonschema==4.22.0"
            pip install "pytest-xdist==3.6.1"
            pip install "websockets==10.4"
+            pip uninstall posthog -y
      - save_cache:
          paths:
            - ./venv
@ -415,6 +418,56 @@ jobs:
          paths:
            - litellm_router_coverage.xml
            - litellm_router_coverage
+  litellm_proxy_security_tests:
+    docker:
+      - image: cimg/python:3.11
+        auth:
+          username: ${DOCKERHUB_USERNAME}
+          password: ${DOCKERHUB_PASSWORD}
+    working_directory: ~/project
+    steps:
+      - checkout
+      - run:
+          name: Show git commit hash
+          command: |
+            echo "Git commit hash: $CIRCLE_SHA1"
+      - run:
+          name: Install Dependencies
+          command: |
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+            pip install "pytest==7.3.1"
+            pip install "pytest-retry==1.6.3"
+            pip install "pytest-asyncio==0.21.1"
+            pip install "pytest-cov==5.0.0"
+      - run:
+          name: Run prisma ./docker/entrypoint.sh
+          command: |
+            set +e
+            chmod +x docker/entrypoint.sh
+            ./docker/entrypoint.sh
+            set -e
+      # Run pytest and generate JUnit XML report
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest tests/proxy_security_tests --cov=litellm --cov-report=xml -vv -x -v --junitxml=test-results/junit.xml --durations=5
+          no_output_timeout: 120m
+      - run:
+          name: Rename the coverage files
+          command: |
+            mv coverage.xml litellm_proxy_security_tests_coverage.xml
+            mv .coverage litellm_proxy_security_tests_coverage
+      # Store test results
+      - store_test_results:
+          path: test-results
+      - persist_to_workspace:
+          root: .
+          paths:
+            - litellm_proxy_security_tests_coverage.xml
+            - litellm_proxy_security_tests_coverage
  litellm_proxy_unit_testing: # Runs all tests with the "proxy", "key", "jwt" filenames
    docker:
      - image: cimg/python:3.11
@ -625,6 +678,50 @@ jobs:
          paths:
            - llm_translation_coverage.xml
            - llm_translation_coverage
+  litellm_mapped_tests:
+    docker:
+      - image: cimg/python:3.11
+        auth:
+          username: ${DOCKERHUB_USERNAME}
+          password: ${DOCKERHUB_PASSWORD}
+    working_directory: ~/project
+
+    steps:
+      - checkout
+      - run:
+          name: Install Dependencies
+          command: |
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+            pip install "pytest-mock==3.12.0"
+            pip install "pytest==7.3.1"
+            pip install "pytest-retry==1.6.3"
+            pip install "pytest-cov==5.0.0"
+            pip install "pytest-asyncio==0.21.1"
+            pip install "respx==0.21.1"
+            pip install "hypercorn==0.17.3"
+      # Run pytest and generate JUnit XML report
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest -vv tests/litellm --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
+          no_output_timeout: 120m
+      - run:
+          name: Rename the coverage files
+          command: |
+            mv coverage.xml litellm_mapped_tests_coverage.xml
+            mv .coverage litellm_mapped_tests_coverage
+
+      # Store test results
+      - store_test_results:
+          path: test-results
+      - persist_to_workspace:
+          root: .
+          paths:
+            - litellm_mapped_tests_coverage.xml
+            - litellm_mapped_tests_coverage
  batches_testing:
    docker:
      - image: cimg/python:3.11
@ -993,6 +1090,7 @@ jobs:
      - run: python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
      - run: ruff check ./litellm
      # - run: python ./tests/documentation_tests/test_general_setting_keys.py
+      - run: python ./tests/code_coverage_tests/check_licenses.py
      - run: python ./tests/code_coverage_tests/router_code_coverage.py
      - run: python ./tests/code_coverage_tests/callback_manager_test.py
      - run: python ./tests/code_coverage_tests/recursive_detector.py
@ -1005,6 +1103,7 @@ jobs:
      - run: python ./tests/code_coverage_tests/ensure_async_clients_test.py
      - run: python ./tests/code_coverage_tests/enforce_llms_folder_style.py
      - run: python ./tests/documentation_tests/test_circular_imports.py
+      - run: python ./tests/code_coverage_tests/prevent_key_leaks_in_exceptions.py
      - run: helm lint ./deploy/charts/litellm-helm

  db_migration_disable_update_check:
@ -1014,6 +1113,23 @@ jobs:
    working_directory: ~/project
    steps:
      - checkout
+      - run:
+          name: Install Python 3.9
+          command: |
+            curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
+            bash miniconda.sh -b -p $HOME/miniconda
+            export PATH="$HOME/miniconda/bin:$PATH"
+            conda init bash
+            source ~/.bashrc
+            conda create -n myenv python=3.9 -y
+            conda activate myenv
+            python --version
+      - run:
+          name: Install Dependencies
+          command: |
+            pip install "pytest==7.3.1"
+            pip install "pytest-asyncio==0.21.1"
+            pip install aiohttp
      - run:
          name: Build Docker image
          command: |
@ -1021,29 +1137,48 @@ jobs:
      - run:
          name: Run Docker container
          command: |
-            docker run --name my-app \
+            docker run -d \
              -p 4000:4000 \
              -e DATABASE_URL=$PROXY_DATABASE_URL \
              -e DISABLE_SCHEMA_UPDATE="True" \
              -v $(pwd)/litellm/proxy/example_config_yaml/bad_schema.prisma:/app/schema.prisma \
              -v $(pwd)/litellm/proxy/example_config_yaml/bad_schema.prisma:/app/litellm/proxy/schema.prisma \
              -v $(pwd)/litellm/proxy/example_config_yaml/disable_schema_update.yaml:/app/config.yaml \
+              --name my-app \
              myapp:latest \
              --config /app/config.yaml \
-              --port 4000 > docker_output.log 2>&1 || true
+              --port 4000
      - run:
-          name: Display Docker logs
-          command: cat docker_output.log
-      - run:
-          name: Check for expected error
+          name: Install curl and dockerize
          command: |
-            if grep -q "prisma schema out of sync with db. Consider running these sql_commands to sync the two" docker_output.log; then
-              echo "Expected error found. Test passed."
+            sudo apt-get update
+            sudo apt-get install -y curl
+            sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
+            sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
+            sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
+
+      - run:
+          name: Wait for container to be ready
+          command: dockerize -wait http://localhost:4000 -timeout 1m
+      - run:
+          name: Check container logs for expected message
+          command: |
+            echo "=== Printing Full Container Startup Logs ==="
+            docker logs my-app
+            echo "=== End of Full Container Startup Logs ==="
+            
+            if docker logs my-app 2>&1 | grep -q "prisma schema out of sync with db. Consider running these sql_commands to sync the two"; then
+              echo "Expected message found in logs. Test passed."
            else
-              echo "Expected error not found. Test failed."
-              cat docker_output.log
+              echo "Expected message not found in logs. Test failed."
              exit 1
            fi
+      - run:
+          name: Run Basic Proxy Startup Tests (Health Readiness and Chat Completion)
+          command: |
+            python -m pytest -vv tests/basic_proxy_startup_tests -x --junitxml=test-results/junit-2.xml --durations=5
+          no_output_timeout: 120m
+

  build_and_test:
    machine:
@ -1464,6 +1599,199 @@ jobs:
      # Store test results
      - store_test_results:
          path: test-results
+
+  proxy_multi_instance_tests:
+    machine:
+      image: ubuntu-2204:2023.10.1
+    resource_class: xlarge
+    working_directory: ~/project
+    steps:
+      - checkout
+      - run:
+          name: Install Docker CLI (In case it's not already installed)
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y docker-ce docker-ce-cli containerd.io
+      - run:
+          name: Install Python 3.9
+          command: |
+            curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
+            bash miniconda.sh -b -p $HOME/miniconda
+            export PATH="$HOME/miniconda/bin:$PATH"
+            conda init bash
+            source ~/.bashrc
+            conda create -n myenv python=3.9 -y
+            conda activate myenv
+            python --version
+      - run:
+          name: Install Dependencies
+          command: |
+            pip install "pytest==7.3.1"
+            pip install "pytest-asyncio==0.21.1"
+            pip install aiohttp
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+            pip install "pytest==7.3.1"
+            pip install "pytest-retry==1.6.3"
+            pip install "pytest-mock==3.12.0"
+            pip install "pytest-asyncio==0.21.1"
+      - run:
+          name: Build Docker image
+          command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
+      - run:
+          name: Run Docker container 1
+          # Instance 1 (port 4000); database, Redis and Datadog settings come from CI environment variables.
+          # NOTE(review): a note about "intentionally bad redis credentials" for an OTEL test looked copy-pasted here - confirm.
+          command: |
+            docker run -d \
+              -p 4000:4000 \
+              -e DATABASE_URL=$PROXY_DATABASE_URL \
+              -e REDIS_HOST=$REDIS_HOST \
+              -e REDIS_PASSWORD=$REDIS_PASSWORD \
+              -e REDIS_PORT=$REDIS_PORT \
+              -e LITELLM_MASTER_KEY="sk-1234" \
+              -e LITELLM_LICENSE=$LITELLM_LICENSE \
+              -e USE_DDTRACE=True \
+              -e DD_API_KEY=$DD_API_KEY \
+              -e DD_SITE=$DD_SITE \
+              --name my-app \
+              -v $(pwd)/litellm/proxy/example_config_yaml/multi_instance_simple_config.yaml:/app/config.yaml \
+              my-app:latest \
+              --config /app/config.yaml \
+              --port 4000 \
+              --detailed_debug
+      - run:
+          name: Run Docker container 2
+          command: |
+            docker run -d \
+              -p 4001:4001 \
+              -e DATABASE_URL=$PROXY_DATABASE_URL \
+              -e REDIS_HOST=$REDIS_HOST \
+              -e REDIS_PASSWORD=$REDIS_PASSWORD \
+              -e REDIS_PORT=$REDIS_PORT \
+              -e LITELLM_MASTER_KEY="sk-1234" \
+              -e LITELLM_LICENSE=$LITELLM_LICENSE \
+              -e USE_DDTRACE=True \
+              -e DD_API_KEY=$DD_API_KEY \
+              -e DD_SITE=$DD_SITE \
+              --name my-app-2 \
+              -v $(pwd)/litellm/proxy/example_config_yaml/multi_instance_simple_config.yaml:/app/config.yaml \
+              my-app:latest \
+              --config /app/config.yaml \
+              --port 4001 \
+              --detailed_debug
+      - run:
+          name: Install curl and dockerize
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y curl
+            sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
+            sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
+            sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
+      - run:
+          name: Start outputting logs
+          command: docker logs -f my-app
+          background: true
+      - run:
+          name: Wait for instance 1 to be ready
+          command: dockerize -wait http://localhost:4000 -timeout 5m
+      - run:
+          name: Wait for instance 2 to be ready
+          command: dockerize -wait http://localhost:4001 -timeout 5m
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest -vv tests/multi_instance_e2e_tests -x --junitxml=test-results/junit.xml --durations=5
+          no_output_timeout:
+            120m
+            # Clean up first container
+      # Store test results
+      - store_test_results:
+          path: test-results
+
+  proxy_store_model_in_db_tests:
+    machine:
+      image: ubuntu-2204:2023.10.1
+    resource_class: xlarge
+    working_directory: ~/project
+    steps:
+      - checkout
+      - run:
+          name: Install Docker CLI (In case it's not already installed)
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y docker-ce docker-ce-cli containerd.io
+      - run:
+          name: Install Python 3.9
+          command: |
+            curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
+            bash miniconda.sh -b -p $HOME/miniconda
+            export PATH="$HOME/miniconda/bin:$PATH"
+            conda init bash
+            source ~/.bashrc
+            conda create -n myenv python=3.9 -y
+            conda activate myenv
+            python --version
+      - run:
+          name: Install Dependencies
+          command: |
+            pip install "pytest==7.3.1"
+            pip install "pytest-asyncio==0.21.1"
+            pip install aiohttp
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+            pip install "pytest==7.3.1"
+            pip install "pytest-retry==1.6.3"
+            pip install "pytest-mock==3.12.0"
+            pip install "pytest-asyncio==0.21.1"
+            pip install "assemblyai==0.37.0"
+      - run:
+          name: Build Docker image
+          command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
+      - run:
+          name: Run Docker container
+          # Single proxy instance on port 4000 with STORE_MODEL_IN_DB enabled.
+          # No Redis settings are passed to this container.
+          command: |
+            docker run -d \
+              -p 4000:4000 \
+              -e DATABASE_URL=$PROXY_DATABASE_URL \
+              -e STORE_MODEL_IN_DB="True" \
+              -e LITELLM_MASTER_KEY="sk-1234" \
+              -e LITELLM_LICENSE=$LITELLM_LICENSE \
+              --name my-app \
+              -v $(pwd)/litellm/proxy/example_config_yaml/store_model_db_config.yaml:/app/config.yaml \
+              my-app:latest \
+              --config /app/config.yaml \
+              --port 4000 \
+              --detailed_debug
+      - run:
+          name: Install curl and dockerize
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y curl
+            sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
+            sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
+            sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
+      - run:
+          name: Start outputting logs
+          command: docker logs -f my-app
+          background: true
+      - run:
+          name: Wait for app to be ready
+          command: dockerize -wait http://localhost:4000 -timeout 5m
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest -vv tests/store_model_in_db_tests -x --junitxml=test-results/junit.xml --durations=5
+          no_output_timeout: 120m
+      # Upload the JUnit report written by the pytest run above
+      - store_test_results:
+          path: test-results
  proxy_build_from_pip_tests:
    # Change from docker to machine executor
    machine:
@ -1607,12 +1935,12 @@ jobs:
            pip install prisma
            pip install fastapi
            pip install jsonschema
-            pip install "httpx==0.24.1"
+            pip install "httpx==0.27.0"
            pip install "anyio==3.7.1"
            pip install "asyncio==3.4.3"
            pip install "PyGithub==1.59.1"
            pip install "google-cloud-aiplatform==1.59.0"
-            pip install anthropic
+            pip install "anthropic==0.49.0"
      # Run pytest and generate JUnit XML report
      - run:
          name: Build Docker image
@ -1654,11 +1982,44 @@ jobs:
      - run:
          name: Wait for app to be ready
          command: dockerize -wait http://localhost:4000 -timeout 5m
+      # Add Ruby installation and testing before the existing Node.js and Python tests
+      - run:
+          name: Install Ruby and Bundler
+          command: |
+            # Import GPG keys first
+            gpg --keyserver hkp://keyserver.ubuntu.com --recv-keys 409B6B1796C275462A1703113804BB82D39DC0E3 7D2BAF1CF37B13E2069D6956105BD0E739499BDB || {
+              curl -sSL https://rvm.io/mpapis.asc | gpg --import -
+              curl -sSL https://rvm.io/pkuczynski.asc | gpg --import -
+            }
+            
+            # Install Ruby version manager (RVM)
+            curl -sSL https://get.rvm.io | bash -s stable
+            
+            # Source RVM from the correct location
+            source $HOME/.rvm/scripts/rvm
+            
+            # Install Ruby 3.2.2
+            rvm install 3.2.2
+            rvm use 3.2.2 --default
+            
+            # Install latest Bundler
+            gem install bundler
+
+      - run:
+          name: Run Ruby tests
+          command: |
+            source $HOME/.rvm/scripts/rvm
+            cd tests/pass_through_tests/ruby_passthrough_tests
+            bundle install
+            bundle exec rspec
+          no_output_timeout: 30m
      # New steps to run Node.js test
      - run:
          name: Install Node.js
          command: |
+            export DEBIAN_FRONTEND=noninteractive
            curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
+            sudo apt-get update
            sudo apt-get install -y nodejs
            node --version
            npm --version
@ -1707,7 +2068,7 @@ jobs:
            python -m venv venv
            . venv/bin/activate
            pip install coverage
-            coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage
+            coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage litellm_mapped_tests_coverage
            coverage xml
      - codecov/upload:
          file: ./coverage.xml
@ -1771,7 +2132,7 @@ jobs:
                circleci step halt
            fi
      - run:
-          name: Trigger Github Action for new Docker Container + Trigger Stable Release Testing
+          name: Trigger Github Action for new Docker Container + Trigger Load Testing
          command: |
            echo "Install TOML package."
            python3 -m pip install toml
@ -1781,9 +2142,9 @@ jobs:
              -H "Accept: application/vnd.github.v3+json" \
              -H "Authorization: Bearer $GITHUB_TOKEN" \
              "https://api.github.com/repos/BerriAI/litellm/actions/workflows/ghcr_deploy.yml/dispatches" \
-              -d "{\"ref\":\"main\", \"inputs\":{\"tag\":\"v${VERSION}\", \"commit_hash\":\"$CIRCLE_SHA1\"}}"
-            echo "triggering stable release server for version ${VERSION} and commit ${CIRCLE_SHA1}"
-            curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}"
+              -d "{\"ref\":\"main\", \"inputs\":{\"tag\":\"v${VERSION}-nightly\", \"commit_hash\":\"$CIRCLE_SHA1\"}}"
+            echo "triggering load testing server for version ${VERSION} and commit ${CIRCLE_SHA1}"
+            curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}&release_type=nightly"

  e2e_ui_testing:
    machine:
@ -1792,6 +2153,25 @@ jobs:
    working_directory: ~/project
    steps:
      - checkout
+      - run:
+          name: Build UI
+          command: |
+            # Set up nvm
+            export NVM_DIR="/opt/circleci/.nvm"
+            source "$NVM_DIR/nvm.sh"
+            source "$NVM_DIR/bash_completion"
+            
+            # Install and use Node version
+            nvm install v18.17.0
+            nvm use v18.17.0
+            
+            cd ui/litellm-dashboard
+            
+            # Install dependencies first
+            npm install
+            
+            # Now source the build script
+            source ./build_ui.sh
      - run:
          name: Install Docker CLI (In case it's not already installed)
          command: |
@ -1836,6 +2216,7 @@ jobs:
          name: Install Playwright Browsers
          command: |
            npx playwright install
+      
      - run:
          name: Build Docker image
          command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
@ -1964,6 +2345,12 @@ workflows:
              only:
                - main
                - /litellm_.*/
+      - litellm_proxy_security_tests:
+          filters:
+            branches:
+              only:
+                - main
+                - /litellm_.*/
      - litellm_assistants_api_testing:
          filters:
            branches:
@ -2012,6 +2399,18 @@ workflows:
              only:
                - main
                - /litellm_.*/
+      - proxy_multi_instance_tests:
+          filters:
+            branches:
+              only:
+                - main
+                - /litellm_.*/
+      - proxy_store_model_in_db_tests:
+          filters:
+            branches:
+              only:
+                - main
+                - /litellm_.*/
      - proxy_build_from_pip_tests:
          filters:
            branches:
@ -2030,6 +2429,12 @@ workflows:
              only:
                - main
                - /litellm_.*/
+      - litellm_mapped_tests:
+          filters:
+            branches:
+              only:
+                - main
+                - /litellm_.*/
      - batches_testing:
          filters:
            branches:
@ -2063,6 +2468,7 @@ workflows:
      - upload-coverage:
          requires:
            - llm_translation_testing
+            - litellm_mapped_tests
            - batches_testing
            - litellm_utils_testing
            - pass_through_unit_testing
@ -2071,6 +2477,7 @@ workflows:
            - litellm_router_testing
            - caching_unit_tests
            - litellm_proxy_unit_testing
+            - litellm_proxy_security_tests
            - langfuse_logging_unit_tests
            - local_testing
            - litellm_assistants_api_testing
@ -2119,6 +2526,7 @@ workflows:
            - load_testing
            - test_bad_database_url
            - llm_translation_testing
+            - litellm_mapped_tests
            - batches_testing
            - litellm_utils_testing
            - pass_through_unit_testing
@ -2132,9 +2540,12 @@ workflows:
            - db_migration_disable_update_check
            - e2e_ui_testing
            - litellm_proxy_unit_testing
+            - litellm_proxy_security_tests
            - installing_litellm_on_python
            - installing_litellm_on_python_3_13
            - proxy_logging_guardrails_model_info_tests
+            - proxy_multi_instance_tests
+            - proxy_store_model_in_db_tests
            - proxy_build_from_pip_tests
            - proxy_pass_through_endpoint_tests
            - check_code_and_doc_quality
--- a/.env.example
+++ b/.env.example
@ -20,3 +20,8 @@ REPLICATE_API_TOKEN = ""
 ANTHROPIC_API_KEY = ""
 # Infisical
 INFISICAL_TOKEN = ""
+
+# Development Configs
+LITELLM_MASTER_KEY = "sk-1234"
+DATABASE_URL = "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
+STORE_MODEL_IN_DB = "True"
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@ -6,6 +6,16 @@

 <!-- e.g. "Fixes #000" -->

+## Pre-Submission checklist
+
+**Please complete all items before asking a LiteLLM maintainer to review your PR**
+
+- [ ] I have added testing in the `tests/litellm/` directory, **Adding at least 1 test is a hard requirement** - [see details](https://docs.litellm.ai/docs/extras/contributing_code)
+- [ ] I have added a screenshot of my new test passing locally 
+- [ ] My PR passes all unit tests on [`make test-unit`](https://docs.litellm.ai/docs/extras/contributing_code)
+- [ ] My PR's scope is as isolated as possible, it only solves 1 specific problem
+
+
 ## Type

 <!-- Select the type of Pull Request -->
@ -20,10 +30,4 @@

 ## Changes

-<!-- List of changes -->
-
-## [REQUIRED] Testing - Attach a screenshot of any new tests passing locally
-If UI changes, send a screenshot/GIF of working UI fixes
-
-<!-- Test procedure -->

--- a/.github/workflows/interpret_load_test.py
+++ b/.github/workflows/interpret_load_test.py
@ -52,6 +52,41 @@ def interpret_results(csv_file):
    return markdown_table


+def _get_docker_run_command_stable_release(release_version):
+    """Return the markdown "Docker Run LiteLLM Proxy" section for a stable release.
+
+    The embedded `docker run` command references the image tag
+    `litellm_stable_release_branch-{release_version}`.
+    """
+    return f"""
+\n\n
+## Docker Run LiteLLM Proxy
+
+```
+docker run \\
+-e STORE_MODEL_IN_DB=True \\
+-p 4000:4000 \\
+ghcr.io/berriai/litellm:litellm_stable_release_branch-{release_version}
+```
+    """
+
+
+def _get_docker_run_command(release_version):
+    """Return the markdown "Docker Run LiteLLM Proxy" section for a regular release.
+
+    The embedded `docker run` command references the image tag
+    `main-{release_version}`.
+    """
+    return f"""
+\n\n
+## Docker Run LiteLLM Proxy
+
+```
+docker run \\
+-e STORE_MODEL_IN_DB=True \\
+-p 4000:4000 \\
+ghcr.io/berriai/litellm:main-{release_version}
+```
+    """
+
+
+def get_docker_run_command(release_version):
+    """Return the docker-run markdown section matching the release type.
+
+    A `release_version` containing the substring "stable" gets the stable-release
+    image tag; anything else gets the `main-` image tag.
+    """
+    if "stable" in release_version:
+        return _get_docker_run_command_stable_release(release_version)
+    else:
+        return _get_docker_run_command(release_version)
+
+
 if __name__ == "__main__":
    csv_file = "load_test_stats.csv"  # Change this to the path of your CSV file
    markdown_table = interpret_results(csv_file)
@ -79,17 +114,7 @@ if __name__ == "__main__":
        start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
        existing_release_body = latest_release.body[:start_index]

-    docker_run_command = f"""
-\n\n
-## Docker Run LiteLLM Proxy
-
-```
-docker run \\
-e STORE_MODEL_IN_DB=True \\
-p 4000:4000 \\
-ghcr.io/berriai/litellm:main-{release_version}
-```
-    """
+    docker_run_command = get_docker_run_command(release_version)
    print("docker run command: ", docker_run_command)

    new_release_body = (
--- a/.github/workflows/locustfile.py
+++ b/.github/workflows/locustfile.py
@ -8,7 +8,7 @@ class MyUser(HttpUser):
    def chat_completion(self):
        headers = {
            "Content-Type": "application/json",
-            "Authorization": "Bearer sk-ZoHqrLIs2-5PzJrqBaviAA",
+            "Authorization": "Bearer sk-8N1tLOOyH8TIxwOLahhIVg",
            # Include any additional headers you may need for authentication, etc.
        }

--- a/.gitignore
+++ b/.gitignore
@ -75,3 +75,7 @@ litellm/proxy/custom_guardrail.py
 litellm/proxy/_experimental/out/404.html
 litellm/proxy/_experimental/out/404.html
 litellm/proxy/_experimental/out/model_hub.html
+.mypy_cache/*
+litellm/proxy/application.log
+tests/llm_translation/vertex_test_account.json
+tests/llm_translation/test_vertex_key.json
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -22,7 +22,7 @@ repos:
    rev: 7.0.0  # The version of flake8 to use
    hooks:
    -  id: flake8
-       exclude: ^litellm/tests/|^litellm/proxy/tests/
+       exclude: ^litellm/tests/|^litellm/proxy/tests/|^litellm/tests/litellm/|^tests/litellm/
       additional_dependencies: [flake8-print]
       files: litellm/.*\.py
    # -  id: flake8
--- a/21
+++ b/21
@ -0,0 +1,21 @@
+# LiteLLM Makefile
+# Simple Makefile for running tests and basic development tasks
+
+.PHONY: help test test-unit test-integration
+
+# Default target
+help:
+	@echo "Available commands:"
+	@echo "  make test               - Run all tests"
+	@echo "  make test-unit          - Run unit tests"
+	@echo "  make test-integration   - Run integration tests"
+
+# Testing
+test:
+	poetry run pytest tests/
+
+test-unit:
+	poetry run pytest tests/litellm/
+
+test-integration:
+	poetry run pytest tests/ -k "not litellm" 
--- a/README.md
+++ b/README.md
@ -40,7 +40,7 @@ LiteLLM manages:
 [**Jump to LiteLLM Proxy (LLM Gateway) Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
 [**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs)

-🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published. 
+🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12-hour load tests before being published. [More information about the release cycle here](https://docs.litellm.ai/docs/proxy/release_cycle)

 Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).

@ -64,7 +64,7 @@ import os

 ## set ENV variables
 os.environ["OPENAI_API_KEY"] = "your-openai-key"
-os.environ["ANTHROPIC_API_KEY"] = "your-cohere-key"
+os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-key"

 messages = [{ "content": "Hello, how are you?","role": "user"}]

@ -187,13 +187,13 @@ os.environ["LANGFUSE_PUBLIC_KEY"] = ""
 os.environ["LANGFUSE_SECRET_KEY"] = ""
 os.environ["ATHINA_API_KEY"] = "your-athina-api-key"

-os.environ["OPENAI_API_KEY"]
+os.environ["OPENAI_API_KEY"] = "your-openai-key"

 # set callbacks
 litellm.success_callback = ["lunary", "mlflow", "langfuse", "athina", "helicone"] # log input/output to lunary, langfuse, supabase, athina, helicone etc

 #openai call
-response = completion(model="anthropic/claude-3-sonnet-20240229", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
+response = completion(model="openai/gpt-4o", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
 ```

 # LiteLLM Proxy Server (LLM Gateway) - ([Docs](https://docs.litellm.ai/docs/simple_proxy))
@ -303,6 +303,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
 |-------------------------------------------------------------------------------------|---------------------------------------------------------|---------------------------------------------------------------------------------|-------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------|-------------------------------------------------------------------------------|-------------------------------------------------------------------------|
 | [openai](https://docs.litellm.ai/docs/providers/openai)                             | ✅                                                       | ✅                                                                               | ✅                                                                                   | ✅                                                                                 | ✅                                                                             | ✅                                                                       |
 | [azure](https://docs.litellm.ai/docs/providers/azure)                               | ✅                                                       | ✅                                                                               | ✅                                                                                   | ✅                                                                                 | ✅                                                                             | ✅                                                                       |
+| [AI/ML API](https://docs.litellm.ai/docs/providers/aiml)                               | ✅                                                       | ✅                                                                               | ✅                                                                                   | ✅                                                                                 | ✅                                                                             | ✅                                                                       |
 | [aws - sagemaker](https://docs.litellm.ai/docs/providers/aws_sagemaker)             | ✅                                                       | ✅                                                                               | ✅                                                                                   | ✅                                                                                 | ✅                                                                             |                                                                         |
 | [aws - bedrock](https://docs.litellm.ai/docs/providers/bedrock)                     | ✅                                                       | ✅                                                                               | ✅                                                                                   | ✅                                                                                 | ✅                                                                             |                                                                         |
 | [google - vertex_ai](https://docs.litellm.ai/docs/providers/vertex)                 | ✅                                                       | ✅                                                                               | ✅                                                                                   | ✅                                                                                 | ✅                                                                             | ✅                                                                       |
@ -339,64 +340,7 @@ curl 'http://0.0.0.0:4000/key/generate' \

 ## Contributing

-To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change.
-
-Here's how to modify the repo locally:
-Step 1: Clone the repo
-
-```
-git clone https://github.com/BerriAI/litellm.git
-```
-
-Step 2: Navigate into the project, and install dependencies:
-
-```
-cd litellm
-poetry install -E extra_proxy -E proxy
-```
-
-Step 3: Test your change:
-
-```
-cd tests # pwd: Documents/litellm/litellm/tests
-poetry run flake8
-poetry run pytest .
-```
-
-Step 4: Submit a PR with your changes! 🚀
-
- push your fork to your GitHub repo
- submit a PR from there
-
-### Building LiteLLM Docker Image 
-
-Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
-
-Step 1: Clone the repo
-
-```
-git clone https://github.com/BerriAI/litellm.git
-```
-
-Step 2: Build the Docker Image
-
-Build using Dockerfile.non_root
-```
-docker build -f docker/Dockerfile.non_root -t litellm_test_image .
-```
-
-Step 3: Run the Docker Image
-
-Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
-```
-docker run \
-    -v $(pwd)/proxy_config.yaml:/app/config.yaml \
-    -e DATABASE_URL="postgresql://xxxxxxxx" \
-    -e LITELLM_MASTER_KEY="sk-1234" \
-    -p 4000:4000 \
-    litellm_test_image \
-    --config /app/config.yaml --detailed_debug
-```
+Interested in contributing? Contributions to the LiteLLM Python SDK, the Proxy Server, and LLM integrations are both accepted and highly encouraged! [See our Contribution Guide for more details](https://docs.litellm.ai/docs/extras/contributing_code)

 # Enterprise
 For companies that need better security, user management and professional support
@ -450,3 +394,20 @@ If you have suggestions on how to improve the code quality feel free to open an
 <a href="https://github.com/BerriAI/litellm/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=BerriAI/litellm" />
 </a>
+
+
+## Run in Developer mode
+### Services
+1. Setup .env file in root
+2. Run dependent services `docker-compose up db prometheus`
+
+### Backend
+1. (In root) create virtual environment `python -m venv .venv`
+2. Activate virtual environment `source .venv/bin/activate`
+3. Install dependencies `pip install -e ".[all]"`
+4. Start proxy backend `uvicorn litellm.proxy.proxy_server:app --host localhost --port 4000 --reload`
+
+### Frontend
+1. Navigate to `ui/litellm-dashboard`
+2. Install dependencies `npm install`
+3. Run `npm run dev` to start the dashboard
--- a/cookbook/logging_observability/LiteLLM_Arize.ipynb
+++ b/cookbook/logging_observability/LiteLLM_Arize.ipynb
@ -0,0 +1,172 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4FbDOmcj2VkM"
+      },
+      "source": [
+        "## Use LiteLLM with Arize\n",
+        "https://docs.litellm.ai/docs/observability/arize_integration\n",
+        "\n",
+        "This method uses the litellm proxy to send the data to Arize. The callback is set in the litellm config below, instead of using OpenInference tracing."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "21W8Woog26Ns"
+      },
+      "source": [
+        "## Install Dependencies"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "id": "xrjKLBxhxu2L"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Requirement already satisfied: litellm in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (1.54.1)\n",
+            "Requirement already satisfied: aiohttp in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (3.11.10)\n",
+            "Requirement already satisfied: click in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (8.1.7)\n",
+            "Requirement already satisfied: httpx<0.28.0,>=0.23.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.27.2)\n",
+            "Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (8.5.0)\n",
+            "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (3.1.4)\n",
+            "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (4.23.0)\n",
+            "Requirement already satisfied: openai>=1.55.3 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (1.57.1)\n",
+            "Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (2.10.3)\n",
+            "Requirement already satisfied: python-dotenv>=0.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (1.0.1)\n",
+            "Requirement already satisfied: requests<3.0.0,>=2.31.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (2.32.3)\n",
+            "Requirement already satisfied: tiktoken>=0.7.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.7.0)\n",
+            "Requirement already satisfied: tokenizers in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.21.0)\n",
+            "Requirement already satisfied: anyio in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (4.7.0)\n",
+            "Requirement already satisfied: certifi in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (2024.8.30)\n",
+            "Requirement already satisfied: httpcore==1.* in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (1.0.7)\n",
+            "Requirement already satisfied: idna in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (3.10)\n",
+            "Requirement already satisfied: sniffio in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (1.3.1)\n",
+            "Requirement already satisfied: h11<0.15,>=0.13 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpcore==1.*->httpx<0.28.0,>=0.23.0->litellm) (0.14.0)\n",
+            "Requirement already satisfied: zipp>=3.20 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from importlib-metadata>=6.8.0->litellm) (3.21.0)\n",
+            "Requirement already satisfied: MarkupSafe>=2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jinja2<4.0.0,>=3.1.2->litellm) (3.0.2)\n",
+            "Requirement already satisfied: attrs>=22.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (24.2.0)\n",
+            "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (2024.10.1)\n",
+            "Requirement already satisfied: referencing>=0.28.4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (0.35.1)\n",
+            "Requirement already satisfied: rpds-py>=0.7.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (0.22.3)\n",
+            "Requirement already satisfied: distro<2,>=1.7.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (1.9.0)\n",
+            "Requirement already satisfied: jiter<1,>=0.4.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (0.6.1)\n",
+            "Requirement already satisfied: tqdm>4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (4.67.1)\n",
+            "Requirement already satisfied: typing-extensions<5,>=4.11 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (4.12.2)\n",
+            "Requirement already satisfied: annotated-types>=0.6.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm) (0.7.0)\n",
+            "Requirement already satisfied: pydantic-core==2.27.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm) (2.27.1)\n",
+            "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from requests<3.0.0,>=2.31.0->litellm) (3.4.0)\n",
+            "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from requests<3.0.0,>=2.31.0->litellm) (2.0.7)\n",
+            "Requirement already satisfied: regex>=2022.1.18 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm) (2024.11.6)\n",
+            "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (2.4.4)\n",
+            "Requirement already satisfied: aiosignal>=1.1.2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.3.1)\n",
+            "Requirement already satisfied: frozenlist>=1.1.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.5.0)\n",
+            "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (6.1.0)\n",
+            "Requirement already satisfied: propcache>=0.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (0.2.1)\n",
+            "Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.18.3)\n",
+            "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from tokenizers->litellm) (0.26.5)\n",
+            "Requirement already satisfied: filelock in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (3.16.1)\n",
+            "Requirement already satisfied: fsspec>=2023.5.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (2024.10.0)\n",
+            "Requirement already satisfied: packaging>=20.9 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (24.2)\n",
+            "Requirement already satisfied: pyyaml>=5.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (6.0.2)\n"
+          ]
+        }
+      ],
+      "source": [
+        "!pip install litellm"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "jHEu-TjZ29PJ"
+      },
+      "source": [
+        "## Set Env Variables"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "id": "QWd9rTysxsWO"
+      },
+      "outputs": [],
+      "source": [
+        "import litellm\n",
+        "import os\n",
+        "from getpass import getpass\n",
+        "\n",
+        "os.environ[\"ARIZE_SPACE_KEY\"] = getpass(\"Enter your Arize space key: \")\n",
+        "os.environ[\"ARIZE_API_KEY\"] = getpass(\"Enter your Arize API key: \")\n",
+        "os.environ['OPENAI_API_KEY']= getpass(\"Enter your OpenAI API key: \")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Let's run a completion call and see the traces in Arize"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Hello! Nice to meet you, OpenAI. How can I assist you today?\n"
+          ]
+        }
+      ],
+      "source": [
+        "# set arize as a callback, litellm will send the data to arize\n",
+        "litellm.callbacks = [\"arize\"]\n",
+        " \n",
+        "# openai call\n",
+        "response = litellm.completion(\n",
+        "  model=\"gpt-3.5-turbo\",\n",
+        "  messages=[\n",
+        "    {\"role\": \"user\", \"content\": \"Hi 👋 - i'm openai\"}\n",
+        "  ]\n",
+        ")\n",
+        "print(response.choices[0].message.content)"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": ".venv",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.6"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/cookbook/logging_observability/LiteLLM_Proxy_Langfuse.ipynb
+++ b/cookbook/logging_observability/LiteLLM_Proxy_Langfuse.ipynb
@ -0,0 +1,252 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## LLM Ops Stack  - LiteLLM Proxy + Langfuse \n",
+    "\n",
+    "This notebook demonstrates how to use LiteLLM Proxy with Langfuse \n",
+    "- Use LiteLLM Proxy for calling 100+ LLMs in OpenAI format\n",
+    "- Use Langfuse for viewing request / response traces \n",
+    "\n",
+    "\n",
+    "In this notebook we will set up LiteLLM Proxy to make requests to OpenAI, Anthropic, Bedrock and automatically log traces to Langfuse."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Setup LiteLLM Proxy\n",
+    "\n",
+    "### 1.1 Define .env variables \n",
+    "Define .env variables on the container that litellm proxy is running on.\n",
+    "```bash\n",
+    "## LLM API Keys\n",
+    "OPENAI_API_KEY=sk-proj-1234567890\n",
+    "ANTHROPIC_API_KEY=sk-ant-api03-1234567890\n",
+    "AWS_ACCESS_KEY_ID=1234567890\n",
+    "AWS_SECRET_ACCESS_KEY=1234567890\n",
+    "\n",
+    "## Langfuse Logging \n",
+    "LANGFUSE_PUBLIC_KEY=\"pk-lf-xxxx9\"\n",
+    "LANGFUSE_SECRET_KEY=\"sk-lf-xxxx9\"\n",
+    "LANGFUSE_HOST=\"https://us.cloud.langfuse.com\"\n",
+    "```\n",
+    "\n",
+    "\n",
+    "### 1.2 Set up LiteLLM Proxy Config yaml \n",
+    "```yaml\n",
+    "model_list:\n",
+    "  - model_name: gpt-4o\n",
+    "    litellm_params:\n",
+    "      model: openai/gpt-4o\n",
+    "      api_key: os.environ/OPENAI_API_KEY\n",
+    "  - model_name: claude-3-5-sonnet-20241022\n",
+    "    litellm_params:\n",
+    "      model: anthropic/claude-3-5-sonnet-20241022\n",
+    "      api_key: os.environ/ANTHROPIC_API_KEY\n",
+    "  - model_name: us.amazon.nova-micro-v1:0\n",
+    "    litellm_params:\n",
+    "      model: bedrock/us.amazon.nova-micro-v1:0\n",
+    "      aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID\n",
+    "      aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY\n",
+    "\n",
+    "litellm_settings:\n",
+    "  callbacks: [\"langfuse\"]\n",
+    "\n",
+    "\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Make LLM Requests to LiteLLM Proxy\n",
+    "\n",
+    "Now we will make our first LLM request to LiteLLM Proxy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.1 Setup Client Side Variables to point to LiteLLM Proxy\n",
+    "Set `LITELLM_PROXY_BASE_URL` to the base url of the LiteLLM Proxy and `LITELLM_VIRTUAL_KEY` to the virtual key you want to use for Authentication to LiteLLM Proxy. (Note: In this initial setup you can)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "LITELLM_PROXY_BASE_URL=\"http://0.0.0.0:4000\"\n",
+    "LITELLM_VIRTUAL_KEY=\"sk-oXXRa1xxxxxxxxxxx\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ChatCompletion(id='chatcmpl-B0sq6QkOKNMJ0dwP3x7OoMqk1jZcI', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Langfuse is a platform designed to monitor, observe, and troubleshoot AI and large language model (LLM) applications. It provides features that help developers gain insights into how their AI systems are performing, make debugging easier, and optimize the deployment of models. Langfuse allows for tracking of model interactions, collecting telemetry, and visualizing data, which is crucial for understanding the behavior of AI models in production environments. This kind of tool is particularly useful for developers working with language models who need to ensure reliability and efficiency in their applications.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739550502, model='gpt-4o-2024-08-06', object='chat.completion', service_tier='default', system_fingerprint='fp_523b9b6e5f', usage=CompletionUsage(completion_tokens=109, prompt_tokens=13, total_tokens=122, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import openai\n",
+    "client = openai.OpenAI(\n",
+    "    api_key=LITELLM_VIRTUAL_KEY,\n",
+    "    base_url=LITELLM_PROXY_BASE_URL\n",
+    ")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"gpt-4o\",\n",
+    "    messages = [\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": \"what is Langfuse?\"\n",
+    "        }\n",
+    "    ],\n",
+    ")\n",
+    "\n",
+    "response"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.3 View Traces on Langfuse\n",
+    "LiteLLM will send the request / response, model, tokens (input + output), cost to Langfuse.\n",
+    "\n",
+    "![image_description](litellm_proxy_langfuse.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.4 Call Anthropic, Bedrock models \n",
+    "\n",
+    "Now we can call the `us.amazon.nova-micro-v1:0` and `claude-3-5-sonnet-20241022` models defined in your config.yaml, both using the OpenAI request / response format."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ChatCompletion(id='chatcmpl-7756e509-e61f-4f5e-b5ae-b7a41013522a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Langfuse is an observability tool designed specifically for machine learning models and applications built with natural language processing (NLP) and large language models (LLMs). It focuses on providing detailed insights into how these models perform in real-world scenarios. Here are some key features and purposes of Langfuse:\\n\\n1. **Real-time Monitoring**: Langfuse allows developers to monitor the performance of their NLP and LLM applications in real time. This includes tracking the inputs and outputs of the models, as well as any errors or issues that arise during operation.\\n\\n2. **Error Tracking**: It helps in identifying and tracking errors in the models' outputs. By analyzing incorrect or unexpected responses, developers can pinpoint where and why errors occur, facilitating more effective debugging and improvement.\\n\\n3. **Performance Metrics**: Langfuse provides various performance metrics, such as latency, throughput, and error rates. These metrics help developers understand how well their models are performing under different conditions and workloads.\\n\\n4. **Traceability**: It offers detailed traceability of requests and responses, allowing developers to follow the path of a request through the system and see how it is processed by the model at each step.\\n\\n5. **User Feedback Integration**: Langfuse can integrate user feedback to provide context for model outputs. This helps in understanding how real users are interacting with the model and how its outputs align with user expectations.\\n\\n6. **Customizable Dashboards**: Users can create custom dashboards to visualize the data collected by Langfuse. These dashboards can be tailored to highlight the most important metrics and insights for a specific application or team.\\n\\n7. 
**Alerting and Notifications**: It can set up alerts for specific conditions or errors, notifying developers when something goes wrong or when performance metrics fall outside of acceptable ranges.\\n\\nBy providing comprehensive observability for NLP and LLM applications, Langfuse helps developers to build more reliable, accurate, and user-friendly models and services.\", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739554005, model='us.amazon.nova-micro-v1:0', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=380, prompt_tokens=5, total_tokens=385, completion_tokens_details=None, prompt_tokens_details=None))"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import openai\n",
+    "client = openai.OpenAI(\n",
+    "    api_key=LITELLM_VIRTUAL_KEY,\n",
+    "    base_url=LITELLM_PROXY_BASE_URL\n",
+    ")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"us.amazon.nova-micro-v1:0\",\n",
+    "    messages = [\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": \"what is Langfuse?\"\n",
+    "        }\n",
+    "    ],\n",
+    ")\n",
+    "\n",
+    "response"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Advanced - Set Langfuse Trace ID, Tags, Metadata \n",
+    "\n",
+    "Here is an example of how you can set Langfuse specific params on your client side request. See full list of supported langfuse params [here](https://docs.litellm.ai/docs/observability/langfuse_integration)\n",
+    "\n",
+    "You can view the logged trace of this request [here](https://us.cloud.langfuse.com/project/clvlhdfat0007vwb74m9lvfvi/traces/567890?timestamp=2025-02-14T17%3A30%3A26.709Z)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ChatCompletion(id='chatcmpl-789babd5-c064-4939-9093-46e4cd2e208a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Langfuse is an observability platform designed specifically for monitoring and improving the performance of natural language processing (NLP) models and applications. It provides developers with tools to track, analyze, and optimize how their language models interact with users and handle natural language inputs.\\n\\nHere are some key features and benefits of Langfuse:\\n\\n1. **Real-Time Monitoring**: Langfuse allows developers to monitor their NLP applications in real time. This includes tracking user interactions, model responses, and overall performance metrics.\\n\\n2. **Error Tracking**: It helps in identifying and tracking errors in the model's responses. This can include incorrect, irrelevant, or unsafe outputs.\\n\\n3. **User Feedback Integration**: Langfuse enables the collection of user feedback directly within the platform. This feedback can be used to identify areas for improvement in the model's performance.\\n\\n4. **Performance Metrics**: The platform provides detailed metrics and analytics on model performance, including latency, throughput, and accuracy.\\n\\n5. **Alerts and Notifications**: Developers can set up alerts to notify them of any significant issues or anomalies in model performance.\\n\\n6. **Debugging Tools**: Langfuse offers tools to help developers debug and refine their models by providing insights into how the model processes different types of inputs.\\n\\n7. **Integration with Development Workflows**: It integrates seamlessly with various development environments and CI/CD pipelines, making it easier to incorporate observability into the development process.\\n\\n8. 
**Customizable Dashboards**: Users can create custom dashboards to visualize the data in a way that best suits their needs.\\n\\nLangfuse aims to help developers build more reliable, accurate, and user-friendly NLP applications by providing them with the tools to observe and improve how their models perform in real-world scenarios.\", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739554281, model='us.amazon.nova-micro-v1:0', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=346, prompt_tokens=5, total_tokens=351, completion_tokens_details=None, prompt_tokens_details=None))"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import openai\n",
+    "client = openai.OpenAI(\n",
+    "    api_key=LITELLM_VIRTUAL_KEY,\n",
+    "    base_url=LITELLM_PROXY_BASE_URL\n",
+    ")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"us.amazon.nova-micro-v1:0\",\n",
+    "    messages = [\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": \"what is Langfuse?\"\n",
+    "        }\n",
+    "    ],\n",
+    "    extra_body={\n",
+    "        \"metadata\": {\n",
+    "            \"generation_id\": \"1234567890\",\n",
+    "            \"trace_id\": \"567890\",\n",
+    "            \"trace_user_id\": \"user_1234567890\",\n",
+    "            \"tags\": [\"tag1\", \"tag2\"]\n",
+    "        }\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "response"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## "
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/cookbook/logging_observability/litellm_proxy_langfuse.png
+++ b/cookbook/logging_observability/litellm_proxy_langfuse.png
--- a/deploy/charts/litellm-helm/Chart.yaml
+++ b/deploy/charts/litellm-helm/Chart.yaml
@ -18,7 +18,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.3.0
+version: 0.4.1

 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
--- a/deploy/charts/litellm-helm/templates/migrations-job.yaml
+++ b/deploy/charts/litellm-helm/templates/migrations-job.yaml
@ -48,6 +48,23 @@ spec:
            {{- end }}
            - name: DISABLE_SCHEMA_UPDATE
              value: "false" # always run the migration from the Helm PreSync hook, override the value set
+          {{- with .Values.volumeMounts }}
+          volumeMounts:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+      {{- with .Values.volumes }}
+      volumes:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
      restartPolicy: OnFailure
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+  ttlSecondsAfterFinished: {{ .Values.migrationJob.ttlSecondsAfterFinished }} # Job-level field (batch/v1 JobSpec); it is not valid inside the pod spec
  backoffLimit: {{ .Values.migrationJob.backoffLimit }}
 {{- end }}
--- a/deploy/charts/litellm-helm/values.yaml
+++ b/deploy/charts/litellm-helm/values.yaml
@ -187,6 +187,7 @@ migrationJob:
  backoffLimit: 4 # Backoff limit for Job restarts
  disableSchemaUpdate: false # Skip schema migrations for specific environments. When True, the job will exit with code 0.
  annotations: {}
+  ttlSecondsAfterFinished: 120

 # Additional environment variables to be added to the deployment
 envVars: {
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -29,6 +29,8 @@ services:
      POSTGRES_DB: litellm
      POSTGRES_USER: llmproxy
      POSTGRES_PASSWORD: dbpassword9090
+    ports:
+      - "5432:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
      interval: 1s
--- a/docker/Dockerfile.alpine
+++ b/docker/Dockerfile.alpine
@ -11,9 +11,7 @@ FROM $LITELLM_BUILD_IMAGE AS builder
 WORKDIR /app

 # Install build dependencies
-RUN apk update && \
-    apk add --no-cache gcc python3-dev musl-dev && \
-    rm -rf /var/cache/apk/*
+RUN apk add --no-cache gcc python3-dev musl-dev

 RUN pip install --upgrade pip && \
    pip install build
--- a/docs/my-website/docs/anthropic_unified.md
+++ b/docs/my-website/docs/anthropic_unified.md
@ -0,0 +1,92 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# [BETA] `/v1/messages`
+
+LiteLLM provides a BETA endpoint in the spec of Anthropic's `/v1/messages` endpoint. 
+
+This currently just supports the Anthropic API. 
+
+| Feature | Supported | Notes | 
+|-------|-------|-------|
+| Cost Tracking | ✅ |  |
+| Logging | ✅ | works across all integrations |
+| End-user Tracking | ✅ | |
+| Streaming | ✅ | |
+| Fallbacks | ✅ | between anthropic models |
+| Loadbalancing | ✅ | between anthropic models |
+
+Planned improvement:
+- Vertex AI Anthropic support
+- Bedrock Anthropic support
+
+## Usage 
+
+<Tabs>
+<TabItem label="PROXY" value="proxy">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+    - model_name: anthropic-claude
+      litellm_params:
+        model: claude-3-7-sonnet-latest
+```
+
+2. Start proxy 
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
+-H 'content-type: application/json' \
+-H "x-api-key: $LITELLM_API_KEY" \
+-H 'anthropic-version: 2023-06-01' \
+-d '{
+  "model": "anthropic-claude",
+  "messages": [
+    {
+      "role": "user",
+      "content": [
+        {
+          "type": "text",
+          "text": "List 5 important events in the XIX century"
+        }
+      ]
+    }
+  ],
+  "max_tokens": 4096
+}'
+```
+</TabItem>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm.llms.anthropic.experimental_pass_through.messages.handler import anthropic_messages
+import asyncio 
+import os 
+
+# set env 
+os.environ["ANTHROPIC_API_KEY"] = "my-api-key"
+
+messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
+
+# Call the handler
+async def call(): 
+    response = await anthropic_messages(
+        messages=messages,
+        api_key=os.environ["ANTHROPIC_API_KEY"],
+        model="claude-3-haiku-20240307",
+        max_tokens=100,
+    )
+
+asyncio.run(call())
+```
+
+</TabItem>
+</Tabs>
--- a/docs/my-website/docs/completion/function_call.md
+++ b/docs/my-website/docs/completion/function_call.md
@ -8,6 +8,7 @@ Use `litellm.supports_function_calling(model="")` -> returns `True` if model sup
 assert litellm.supports_function_calling(model="gpt-3.5-turbo") == True
 assert litellm.supports_function_calling(model="azure/gpt-4-1106-preview") == True
 assert litellm.supports_function_calling(model="palm/chat-bison") == False
+assert litellm.supports_function_calling(model="xai/grok-2-latest") == True
 assert litellm.supports_function_calling(model="ollama/llama2") == False
 ```

--- a/docs/my-website/docs/completion/input.md
+++ b/docs/my-website/docs/completion/input.md
@ -44,6 +44,7 @@ Use `litellm.get_supported_openai_params()` for an updated list of params for ea
 |Anthropic| ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |  |  |   |  |  |  |✅ | ✅ | | ✅ | ✅ |  |  | ✅ |
 |OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | ✅ |
 |Azure OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ |  |  | ✅ |
+|xAI| ✅ |  | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |  |  | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |  |
 |Replicate | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |  |   |  |   |
 |Anyscale | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 |Cohere| ✅ | ✅ | ✅ | ✅ |  ✅ | ✅ | ✅ | ✅ | ✅ |   |   |
--- a/docs/my-website/docs/completion/reliable_completions.md
+++ b/docs/my-website/docs/completion/reliable_completions.md
@ -46,7 +46,7 @@ from litellm import completion
 fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
 messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]

-completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict)
+completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=fallback_dict)
 ```

 ### Fallbacks - Switch Models/API Keys/API Bases (SDK)
--- a/docs/my-website/docs/completion/vision.md
+++ b/docs/my-website/docs/completion/vision.md
@ -190,3 +190,137 @@ Expected Response

 </TabItem>
 </Tabs>
+
+
+## Explicitly specify image type 
+
+If you have images without a mime-type, or if litellm is incorrectly inferring the mime type of your image (e.g. calling `gs://` URLs with vertex ai), you can set this explicitly via the `format` param.
+
+```python
+"image_url": {
+  "url": "gs://my-gs-image",
+  "format": "image/jpeg"
+}
+```
+
+LiteLLM will use this for any API endpoint, which supports specifying mime-type (e.g. anthropic/bedrock/vertex ai). 
+
+For others (e.g. openai), it will be ignored. 
+
+<Tabs>
+<TabItem label="SDK" value="sdk">
+
+```python
+import os 
+from litellm import completion
+
+os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
+
+# anthropic call
+response = completion(
+    model = "claude-3-7-sonnet-latest", 
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                            {
+                                "type": "text",
+                                "text": "What’s in this image?"
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                  "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+                                  "format": "image/jpeg"
+                                }
+                            }
+                        ]
+        }
+    ],
+)
+
+```
+
+</TabItem>
+<TabItem label="PROXY" value="proxy">
+
+1. Define vision models on config.yaml
+
+```yaml
+model_list:
+  - model_name: gpt-4-vision-preview # OpenAI gpt-4-vision-preview
+    litellm_params:
+      model: openai/gpt-4-vision-preview
+      api_key: os.environ/OPENAI_API_KEY
+  - model_name: llava-hf          # Custom OpenAI compatible model
+    litellm_params:
+      model: openai/llava-hf/llava-v1.6-vicuna-7b-hf
+      api_base: http://localhost:8000
+      api_key: fake-key
+    model_info:
+      supports_vision: True        # set supports_vision to True so /model/info returns this attribute as True
+
+```
+
+2. Run proxy server
+
+```bash
+litellm --config config.yaml
+```
+
+3. Test it using the OpenAI Python SDK
+
+
+```python
+import os 
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="sk-1234", # your litellm proxy api key
+    base_url="http://0.0.0.0:4000", # your litellm proxy base url
+)
+
+response = client.chat.completions.create(
+    model = "gpt-4-vision-preview",  # use model="llava-hf" to test your custom OpenAI endpoint
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                            {
+                                "type": "text",
+                                "text": "What’s in this image?"
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+                                "format": "image/jpeg"
+                                }
+                            }
+                        ]
+        }
+    ],
+)
+
+```
+
+
+
+
+</TabItem>
+</Tabs>
+
+
+
+## Spec 
+
+```
+"image_url": str
+
+OR 
+
+"image_url": {
+  "url": "url OR base64 encoded str",
+  "detail": "openai-only param", 
+  "format": "specify mime-type of image"
+}
+```
--- a/docs/my-website/docs/data_security.md
+++ b/docs/my-website/docs/data_security.md
@ -46,7 +46,7 @@ For security inquiries, please contact us at support@berri.ai
 |-------------------|-------------------------------------------------------------------------------------------------|
 | SOC 2 Type I      | Certified. Report available upon request on Enterprise plan.                                                           |
 | SOC 2 Type II     | In progress. Certificate available by April 15th, 2025                   |
-| ISO27001          | In progress. Certificate available by February 7th, 2025                                           |
+| ISO 27001          | Certified. Report available upon request on Enterprise plan.                        |


 ## Supported Data Regions for LiteLLM Cloud
@ -137,7 +137,7 @@ Point of contact email address for general security-related questions: krrish@be
 Has the Vendor been audited / certified? 
 - SOC 2 Type I. Certified. Report available upon request on Enterprise plan.
 - SOC 2 Type II. In progress. Certificate available by April 15th, 2025.
- ISO27001. In progress. Certificate available by February 7th, 2025.
+- ISO 27001. Certified. Report available upon request on Enterprise plan.

 Has an information security management system been implemented? 
 - Yes - [CodeQL](https://codeql.github.com/) and a comprehensive ISMS covering multiple security domains.
--- a/docs/my-website/docs/debugging/local_debugging.md
+++ b/docs/my-website/docs/debugging/local_debugging.md
@ -1,5 +1,5 @@
 # Local Debugging
-There's 2 ways to do local debugging - `litellm.set_verbose=True` and by passing in a custom function `completion(...logger_fn=<your_local_function>)`. Warning: Make sure to not use `set_verbose` in production. It logs API keys, which might end up in log files.
+There's 2 ways to do local debugging - `litellm._turn_on_debug()` and by passing in a custom function `completion(...logger_fn=<your_local_function>)`. Warning: Make sure to not use `_turn_on_debug()` in production. It logs API keys, which might end up in log files.

 ## Set Verbose 

@ -8,7 +8,7 @@ This is good for getting print statements for everything litellm is doing.
 import litellm
 from litellm import completion

-litellm.set_verbose=True # 👈 this is the 1-line change you need to make
+litellm._turn_on_debug() # 👈 this is the 1-line change you need to make

 ## set ENV variables
 os.environ["OPENAI_API_KEY"] = "openai key"
--- a/docs/my-website/docs/extras/contributing_code.md
+++ b/docs/my-website/docs/extras/contributing_code.md
@ -0,0 +1,96 @@
+# Contributing Code
+
+## **Checklist before submitting a PR**
+
+Here are the core requirements for any PR submitted to LiteLLM
+
+
+- [ ] Add testing, **Adding at least 1 test is a hard requirement** - [see details](#2-adding-testing-to-your-pr)
+- [ ] Ensure your PR passes the following tests:
+    - [ ] [Unit Tests](#3-running-unit-tests)
+    - [ ] Formatting / Linting Tests
+- [ ] Keep scope as isolated as possible. As a general rule, your changes should address 1 specific problem at a time
+
+
+
+## Quick start
+
+## 1. Setup your local dev environment
+
+
+Here's how to modify the repo locally:
+
+Step 1: Clone the repo
+
+```shell
+git clone https://github.com/BerriAI/litellm.git
+```
+
+Step 2: Install dev dependencies:
+
+```shell
+poetry install --with dev --extras proxy
+```
+
+That's it, your local dev environment is ready!
+
+## 2. Adding Testing to your PR
+
+- Add your test to the [`tests/litellm/` directory](https://github.com/BerriAI/litellm/tree/main/tests/litellm)
+
+- This directory maps 1:1 to the `litellm/` directory, and can only contain mocked tests.
+- Do not add real llm api calls to this directory.
+
+### 2.1 File Naming Convention for `tests/litellm/`
+
+The `tests/litellm/` directory follows the same directory structure as `litellm/`.
+
+- `tests/litellm/proxy/test_caching_routes.py` maps to `litellm/proxy/caching_routes.py`
+- `test_{filename}.py` maps to `litellm/{filename}.py`
+
+## 3. Running Unit Tests
+
+run the following command on the root of the litellm directory
+
+```shell
+make test-unit
+```
+
+## 4. Submit a PR with your changes!
+
+- push your fork to your GitHub repo
+- submit a PR from there
+
+
+## Advanced
+### Building LiteLLM Docker Image 
+
+Some people might want to build the LiteLLM docker image themselves. Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
+
+Step 1: Clone the repo
+
+```shell
+git clone https://github.com/BerriAI/litellm.git
+```
+
+Step 2: Build the Docker Image
+
+Build using Dockerfile.non_root
+
+```shell
+docker build -f docker/Dockerfile.non_root -t litellm_test_image .
+```
+
+Step 3: Run the Docker Image
+
+Make sure `proxy_config.yaml` is present in the root directory. This is your litellm proxy config file; the command below mounts it into the container as `/app/config.yaml`.
+
+```shell
+docker run \
+    -v $(pwd)/proxy_config.yaml:/app/config.yaml \
+    -e DATABASE_URL="postgresql://xxxxxxxx" \
+    -e LITELLM_MASTER_KEY="sk-1234" \
+    -p 4000:4000 \
+    litellm_test_image \
+    --config /app/config.yaml --detailed_debug
+```
--- a/docs/my-website/docs/observability/arize_integration.md
+++ b/docs/my-website/docs/observability/arize_integration.md
@ -19,6 +19,7 @@ Make an account on [Arize AI](https://app.arize.com/auth/login)
 ## Quick Start
 Use just 2 lines of code, to instantly log your responses **across all providers** with arize

+You can also use the instrumentor option instead of the callback, which you can find [here](https://docs.arize.com/arize/llm-tracing/tracing-integrations-auto/litellm).

 ```python
 litellm.callbacks = ["arize"]
@ -28,7 +29,7 @@ import litellm
 import os

 os.environ["ARIZE_SPACE_KEY"] = ""
-os.environ["ARIZE_API_KEY"] = "" # defaults to litellm-completion
+os.environ["ARIZE_API_KEY"] = ""

 # LLM API Keys
 os.environ['OPENAI_API_KEY']=""
--- a/docs/my-website/docs/observability/athina_integration.md
+++ b/docs/my-website/docs/observability/athina_integration.md
@ -78,7 +78,10 @@ Following are the allowed fields in metadata, their types, and their description
 * `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
 * `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
 * `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
-
+* `tags: Optional[list]` - This is a list of tags. This is useful for segmenting inference calls by tags.
+* `user_feedback: Optional[str]` - The end user’s feedback.
+* `model_options: Optional[dict]` - This is a dictionary of model options. This is useful for getting insights into how model behavior affects your end users.
+* `custom_attributes: Optional[dict]` - This is a dictionary of custom attributes. This is useful for additional information about the inference.

 ## Using a self hosted deployment of Athina

--- a/docs/my-website/docs/observability/opik_integration.md
+++ b/docs/my-website/docs/observability/opik_integration.md
@ -1,3 +1,5 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
 import Image from '@theme/IdealImage';

 # Comet Opik - Logging + Evals
@ -21,17 +23,16 @@ Use just 4 lines of code, to instantly log your responses **across all providers
 Get your Opik API Key by signing up [here](https://www.comet.com/signup?utm_source=litelllm&utm_medium=docs&utm_content=api_key_cell)!

 ```python
-from litellm.integrations.opik.opik import OpikLogger
 import litellm
-
-opik_logger = OpikLogger()
-litellm.callbacks = [opik_logger]
+litellm.callbacks = ["opik"]
 ```

 Full examples:

+<Tabs>
+<TabItem value="sdk" label="SDK">
+
 ```python
-from litellm.integrations.opik.opik import OpikLogger
 import litellm
 import os

@ -43,8 +44,7 @@ os.environ["OPIK_WORKSPACE"] = ""
 os.environ["OPENAI_API_KEY"] = ""

 # set "opik" as a callback, litellm will send the data to an Opik server (such as comet.com)
-opik_logger = OpikLogger()
-litellm.callbacks = [opik_logger]
+litellm.callbacks = ["opik"]

 # openai call
 response = litellm.completion(
@ -55,18 +55,16 @@ response = litellm.completion(
 )
 ```

-If you are liteLLM within a function tracked using Opik's `@track` decorator,
+If you are using liteLLM within a function tracked using Opik's `@track` decorator,
 you will need provide the `current_span_data` field in the metadata attribute
 so that the LLM call is assigned to the correct trace:

 ```python
 from opik import track
 from opik.opik_context import get_current_span_data
-from litellm.integrations.opik.opik import OpikLogger
 import litellm

-opik_logger = OpikLogger()
-litellm.callbacks = [opik_logger]
+litellm.callbacks = ["opik"]

@track()
 def streaming_function(input):
@ -87,6 +85,126 @@ response = streaming_function("Why is tracking and evaluation of LLMs important?
 chunks = list(response)
 ```

+</TabItem>
+<TabItem value="proxy" label="Proxy">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo-testing
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_key: os.environ/OPENAI_API_KEY
+
+litellm_settings:
+  callbacks: ["opik"]
+
+environment_variables:
+  OPIK_API_KEY: ""
+  OPIK_WORKSPACE: ""
+```
+
+2. Run proxy
+
+```bash
+litellm --config config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+  "model": "gpt-3.5-turbo-testing",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What's the weather like in Boston today?"
+    }
+  ]
+}'
+```
+
+</TabItem>
+</Tabs>
+
+## Opik-Specific Parameters
+
+These can be passed inside metadata with the `opik` key.
+
+### Fields 
+
+- `project_name` - Name of the Opik project to send data to.
+- `current_span_data` - The current span data to be used for tracing.
+- `tags` - Tags to be used for tracing.
+
+### Usage
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from opik import track
+from opik.opik_context import get_current_span_data
+import litellm
+
+litellm.callbacks = ["opik"]
+
+@track()
+def streaming_function(input):
+    messages = [{"role": "user", "content": input}]
+    response = litellm.completion(
+        model="gpt-3.5-turbo",
+        messages=messages,
+        metadata = {
+            "opik": {
+                "current_span_data": get_current_span_data(),
+                "tags": ["streaming-test"],
+            },
+        }
+    )
+    return response
+```
+</TabItem>
+<TabItem value="proxy" label="Proxy">
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+  "model": "gpt-3.5-turbo-testing",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What's the weather like in Boston today?"
+    }
+  ],
+  "metadata": {
+    "opik": {
+      "current_span_data": "...",
+      "tags": ["streaming-test"]
+    }
+  }
+}'
+``` 
+
+</TabItem>
+</Tabs>
+
+
+
+
+
+
+
+
+
+
+
+
+
 ## Support & Talk to Founders

 - [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
--- a/docs/my-website/docs/observability/phoenix_integration.md
+++ b/docs/my-website/docs/observability/phoenix_integration.md
@ -0,0 +1,75 @@
+import Image from '@theme/IdealImage';
+
+# Phoenix OSS
+
+Open source tracing and evaluation platform
+
+:::tip
+
+This is community maintained. Please open an issue if you run into a bug:
+https://github.com/BerriAI/litellm
+
+:::
+
+
+## Pre-Requisites
+Make an account on [Phoenix OSS](https://phoenix.arize.com)
+OR self-host your own instance of [Phoenix](https://docs.arize.com/phoenix/deployment)
+
+## Quick Start
+Use just 2 lines of code, to instantly log your responses **across all providers** with Phoenix
+
+You can also use the instrumentor option instead of the callback, which you can find [here](https://docs.arize.com/phoenix/tracing/integrations-tracing/litellm).
+
+```python
+litellm.callbacks = ["arize_phoenix"]
+```
+```python
+import litellm
+import os
+
+os.environ["PHOENIX_API_KEY"] = "" # Necessary only when using Phoenix Cloud
+os.environ["PHOENIX_COLLECTOR_HTTP_ENDPOINT"] = "" # The URL of your Phoenix OSS instance
+# This defaults to https://app.phoenix.arize.com/v1/traces for Phoenix Cloud
+
+# LLM API Keys
+os.environ['OPENAI_API_KEY']=""
+
+# set arize_phoenix as a callback, litellm will send the data to Phoenix
+litellm.callbacks = ["arize_phoenix"]
+ 
+# openai call
+response = litellm.completion(
+  model="gpt-3.5-turbo",
+  messages=[
+    {"role": "user", "content": "Hi 👋 - i'm openai"}
+  ]
+)
+```
+
+### Using with LiteLLM Proxy
+
+
+```yaml
+model_list:
+  - model_name: gpt-4o
+    litellm_params:
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+
+litellm_settings:
+  callbacks: ["arize_phoenix"]
+
+environment_variables:
+    PHOENIX_API_KEY: "d0*****"
+    PHOENIX_COLLECTOR_ENDPOINT: "https://app.phoenix.arize.com/v1/traces" # OPTIONAL, for setting the GRPC endpoint
+    PHOENIX_COLLECTOR_HTTP_ENDPOINT: "https://app.phoenix.arize.com/v1/traces" # OPTIONAL, for setting the HTTP endpoint
+```
+
+## Support & Talk to Founders
+
+- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
+- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
+- Our numbers 📞 +1 (770) 8783-106 / ‭+1 (412) 618-6238‬
+- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
--- a/docs/my-website/docs/pass_through/assembly_ai.md
+++ b/docs/my-website/docs/pass_through/assembly_ai.md
@ -12,6 +12,9 @@ Supports **ALL** Assembly AI Endpoints

 [**See All Assembly AI Endpoints**](https://www.assemblyai.com/docs/api-reference)

+
+<iframe width="840" height="500" src="https://www.loom.com/embed/aac3f4d74592448992254bfa79b9f62d?sid=267cd0ab-d92b-42fa-b97a-9f385ef8930c" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
+
 ## Quick Start

 Let's call the Assembly AI [`/v2/transcripts` endpoint](https://www.assemblyai.com/docs/api-reference/transcripts)
@ -35,6 +38,8 @@ litellm
 Let's call the Assembly AI `/v2/transcripts` endpoint

 ```python
+import assemblyai as aai
+
 LITELLM_VIRTUAL_KEY = "sk-1234" # <your-virtual-key>
 LITELLM_PROXY_BASE_URL = "http://0.0.0.0:4000/assemblyai" # <your-proxy-base-url>/assemblyai

@ -53,3 +58,28 @@ print(transcript)
 print(transcript.id)
 ```

+## Calling Assembly AI EU endpoints
+
+If you want to send your request to the Assembly AI EU endpoint, you can do so by setting the `LITELLM_PROXY_BASE_URL` to `<your-proxy-base-url>/eu.assemblyai`
+
+
+```python
+import assemblyai as aai
+
+LITELLM_VIRTUAL_KEY = "sk-1234" # <your-virtual-key>
+LITELLM_PROXY_BASE_URL = "http://0.0.0.0:4000/eu.assemblyai" # <your-proxy-base-url>/eu.assemblyai
+
+aai.settings.api_key = f"Bearer {LITELLM_VIRTUAL_KEY}"
+aai.settings.base_url = LITELLM_PROXY_BASE_URL
+
+# URL of the file to transcribe
+FILE_URL = "https://assembly.ai/wildfires.mp3"
+
+# You can also transcribe a local file by passing in a file path
+# FILE_URL = './path/to/file.mp3'
+
+transcriber = aai.Transcriber()
+transcript = transcriber.transcribe(FILE_URL)
+print(transcript)
+print(transcript.id)
+```
--- a/docs/my-website/docs/pass_through/openai_passthrough.md
+++ b/docs/my-website/docs/pass_through/openai_passthrough.md
@ -0,0 +1,95 @@
+# OpenAI Passthrough
+
+Pass-through endpoints for `/openai`
+
+## Overview
+
+| Feature | Supported | Notes | 
+|-------|-------|-------|
+| Cost Tracking | ❌ | Not supported |
+| Logging | ✅ | Works across all integrations |
+| Streaming | ✅ | Fully supported |
+
+### When to use this?
+
+- For 90% of your use cases, you should use the [native LiteLLM OpenAI Integration](https://docs.litellm.ai/docs/providers/openai) (`/chat/completions`, `/embeddings`, `/completions`, `/images`, `/batches`, etc.)
+- Use this passthrough to call less popular or newer OpenAI endpoints that LiteLLM doesn't fully support yet, such as `/assistants`, `/threads`, `/vector_stores`
+
+Simply replace `https://api.openai.com` with `LITELLM_PROXY_BASE_URL/openai`
+
+## Usage Examples
+
+### Assistants API
+
+#### Create OpenAI Client
+
+Make sure you do the following:
+- Point `base_url` to your `LITELLM_PROXY_BASE_URL/openai`
+- Use your `LITELLM_API_KEY` as the `api_key`
+
+```python
+import openai
+
+client = openai.OpenAI(
+    base_url="http://0.0.0.0:4000/openai",  # <your-proxy-url>/openai
+    api_key="sk-anything"  # <your-proxy-api-key>
+)
+```
+
+#### Create an Assistant
+
+```python
+# Create an assistant
+assistant = client.beta.assistants.create(
+    name="Math Tutor",
+    instructions="You are a math tutor. Help solve equations.",
+    model="gpt-4o",
+)
+```
+
+#### Create a Thread
+```python
+# Create a thread
+thread = client.beta.threads.create()
+```
+
+#### Add a Message to the Thread
+```python
+# Add a message
+message = client.beta.threads.messages.create(
+    thread_id=thread.id,
+    role="user",
+    content="Solve 3x + 11 = 14",
+)
+```
+
+#### Run the Assistant
+```python
+# Create a run to get the assistant's response
+run = client.beta.threads.runs.create(
+    thread_id=thread.id,
+    assistant_id=assistant.id,
+)
+
+# Check run status
+run_status = client.beta.threads.runs.retrieve(
+    thread_id=thread.id,
+    run_id=run.id
+)
+```
+
+#### Retrieve Messages
+```python
+# List messages after the run completes
+messages = client.beta.threads.messages.list(
+    thread_id=thread.id
+)
+```
+
+#### Delete the Assistant
+
+```python
+# Delete the assistant when done
+client.beta.assistants.delete(assistant.id)
+```
+
--- a/docs/my-website/docs/projects/Elroy.md
+++ b/docs/my-website/docs/projects/Elroy.md
@ -0,0 +1,14 @@
+# 🐕 Elroy
+
+Elroy is a scriptable AI assistant that remembers and sets goals.
+
+Interact through the command line, share memories via MCP, or build your own tools using Python.
+
+
+[![Static Badge][github-shield]][github-url]
+[![Discord][discord-shield]][discord-url]
+
+[github-shield]: https://img.shields.io/badge/Github-repo-white?logo=github
+[github-url]: https://github.com/elroy-bot/elroy
+[discord-shield]:https://img.shields.io/discord/1200684659277832293?color=7289DA&label=Discord&logo=discord&logoColor=white
+[discord-url]: https://discord.gg/5PJUY4eMce
--- a/docs/my-website/docs/projects/PDL.md
+++ b/docs/my-website/docs/projects/PDL.md
@ -0,0 +1,5 @@
+PDL - A YAML-based approach to prompt programming
+
+Github: https://github.com/IBM/prompt-declaration-language
+
+PDL is a declarative approach to prompt programming, helping users to accumulate messages implicitly, with support for model chaining and tool use.
--- a/docs/my-website/docs/projects/pgai.md
+++ b/docs/my-website/docs/projects/pgai.md
@ -0,0 +1,9 @@
+# pgai
+
+[pgai](https://github.com/timescale/pgai) is a suite of tools to develop RAG, semantic search, and other AI applications more easily with PostgreSQL.
+
+If you don't know what pgai is yet check out the [README](https://github.com/timescale/pgai)!
+
+If you're already familiar with pgai, you can find litellm specific docs here:
+- Litellm for [model calling](https://github.com/timescale/pgai/blob/main/docs/model_calling/litellm.md) in pgai
+- Use the [litellm provider](https://github.com/timescale/pgai/blob/main/docs/vectorizer/api-reference.md#aiembedding_litellm) to automatically create embeddings for your data via the pgai vectorizer.
--- a/docs/my-website/docs/providers/aiml.md
+++ b/docs/my-website/docs/providers/aiml.md
@ -0,0 +1,160 @@
+# AI/ML API  
+
+Getting started with the AI/ML API is simple. Follow these steps to set up your integration:
+
+### 1. Get Your API Key  
+To begin, you need an API key. You can obtain yours here:  
+🔑 [Get Your API Key](https://aimlapi.com/app/keys/?utm_source=aimlapi&utm_medium=github&utm_campaign=integration)  
+
+### 2. Explore Available Models  
+Looking for a different model? Browse the full list of supported models:  
+📚 [Full List of Models](https://docs.aimlapi.com/api-overview/model-database/text-models?utm_source=aimlapi&utm_medium=github&utm_campaign=integration)  
+
+### 3. Read the Documentation  
+For detailed setup instructions and usage guidelines, check out the official documentation:  
+📖 [AI/ML API Docs](https://docs.aimlapi.com/quickstart/setting-up?utm_source=aimlapi&utm_medium=github&utm_campaign=integration)  
+
+### 4. Need Help?  
+If you have any questions, feel free to reach out. We’re happy to assist! 🚀  [Discord](https://discord.gg/hvaUsJpVJf)
+
+## Usage
+You can choose from Llama, Qwen, Flux, and 200+ other open and closed-source models on aimlapi.com/models. For example:
+
+```python
+import litellm
+
+response = litellm.completion(
+    model="openai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", # The model name must include prefix "openai" + the model name from ai/ml api
+    api_key="", # your aiml api-key 
+    api_base="https://api.aimlapi.com/v2",
+    messages=[
+        {
+            "role": "user",
+            "content": "Hey, how's it going?",
+        }
+    ],
+)
+```
+
+## Streaming
+
+```python
+import litellm
+
+response = litellm.completion(
+    model="openai/Qwen/Qwen2-72B-Instruct",  # The model name must include prefix "openai" + the model name from ai/ml api
+    api_key="",  # your aiml api-key 
+    api_base="https://api.aimlapi.com/v2",
+    messages=[
+        {
+            "role": "user",
+            "content": "Hey, how's it going?",
+        }
+    ],
+    stream=True,
+)
+for chunk in response:
+    print(chunk)
+```
+
+## Async Completion
+
+```python
+import asyncio
+
+import litellm
+
+
+async def main():
+    response = await litellm.acompletion(
+        model="openai/anthropic/claude-3-5-haiku",  # The model name must include prefix "openai" + the model name from ai/ml api
+        api_key="",  # your aiml api-key
+        api_base="https://api.aimlapi.com/v2",
+        messages=[
+            {
+                "role": "user",
+                "content": "Hey, how's it going?",
+            }
+        ],
+    )
+    print(response)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Async Streaming
+
+```python
+import asyncio
+import traceback
+
+import litellm
+
+
+async def main():
+    try:
+        print("test acompletion + streaming")
+        response = await litellm.acompletion(
+            model="openai/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", # The model name must include prefix "openai" + the model name from ai/ml api
+            api_key="", # your aiml api-key
+            api_base="https://api.aimlapi.com/v2",
+            messages=[{"content": "Hey, how's it going?", "role": "user"}],
+            stream=True,
+        )
+        print(f"response: {response}")
+        async for chunk in response:
+            print(chunk)
+    except:
+        print(f"error occurred: {traceback.format_exc()}")
+        pass
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Async Embedding
+
+```python
+import asyncio
+
+import litellm
+
+
+async def main():
+    response = await litellm.aembedding(
+        model="openai/text-embedding-3-small", # The model name must include prefix "openai" + the model name from ai/ml api
+        api_key="",  # your aiml api-key
+        api_base="https://api.aimlapi.com/v1", # 👈 the URL has changed from v2 to v1
+        input="Your text string",
+    )
+    print(response)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Async Image Generation
+
+```python
+import asyncio
+
+import litellm
+
+
+async def main():
+    response = await litellm.aimage_generation(
+        model="openai/dall-e-3",  # The model name must include prefix "openai" + the model name from ai/ml api
+        api_key="",  # your aiml api-key
+        api_base="https://api.aimlapi.com/v1", # 👈 the URL has changed from v2 to v1
+        prompt="A cute baby sea otter",
+    )
+    print(response)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
--- a/docs/my-website/docs/providers/anthropic.md
+++ b/docs/my-website/docs/providers/anthropic.md
@ -819,6 +819,114 @@ resp = litellm.completion(
 print(f"\nResponse: {resp}")
 ```

+## Usage - Thinking / `reasoning_content`
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+
+resp = completion(
+    model="anthropic/claude-3-7-sonnet-20250219",
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    thinking={"type": "enabled", "budget_tokens": 1024},
+)
+
+```
+
+</TabItem>
+
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: claude-3-7-sonnet-20250219
+    litellm_params:
+      model: anthropic/claude-3-7-sonnet-20250219
+      api_key: os.environ/ANTHROPIC_API_KEY
+```
+
+2. Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
+  -d '{
+    "model": "claude-3-7-sonnet-20250219",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "thinking": {"type": "enabled", "budget_tokens": 1024}
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+
+**Expected Response**
+
+```python
+ModelResponse(
+    id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
+    created=1740470510,
+    model='claude-3-7-sonnet-20250219',
+    object='chat.completion',
+    system_fingerprint=None,
+    choices=[
+        Choices(
+            finish_reason='stop',
+            index=0,
+            message=Message(
+                content="The capital of France is Paris.",
+                role='assistant',
+                tool_calls=None,
+                function_call=None,
+                provider_specific_fields={
+                    'citations': None,
+                    'thinking_blocks': [
+                        {
+                            'type': 'thinking',
+                            'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
+                            'signature': 'EuYBCkQYAiJAy6...'
+                        }
+                    ]
+                }
+            ),
+            thinking_blocks=[
+                {
+                    'type': 'thinking',
+                    'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
+                    'signature': 'EuYBCkQYAiJAy6AGB...'
+                }
+            ],
+            reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
+        )
+    ],
+    usage=Usage(
+        completion_tokens=68,
+        prompt_tokens=42,
+        total_tokens=110,
+        completion_tokens_details=None,
+        prompt_tokens_details=PromptTokensDetailsWrapper(
+            audio_tokens=None,
+            cached_tokens=0,
+            text_tokens=None,
+            image_tokens=None
+        ),
+        cache_creation_input_tokens=0,
+        cache_read_input_tokens=0
+    )
+)
+```
+
 ## **Passing Extra Headers to Anthropic API**

 Pass `extra_headers: dict` to `litellm.completion`
@ -987,6 +1095,106 @@ curl http://0.0.0.0:4000/v1/chat/completions \
 </TabItem>
 </Tabs>

+## [BETA] Citations API 
+
+Pass `citations: {"enabled": true}` to Anthropic, to get citations on your document responses. 
+
+Note: This interface is in BETA. If you have feedback on how citations should be returned, please [tell us here](https://github.com/BerriAI/litellm/issues/7970#issuecomment-2644437943)
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+
+resp = completion(
+    model="claude-3-5-sonnet-20241022",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "document",
+                    "source": {
+                        "type": "text",
+                        "media_type": "text/plain",
+                        "data": "The grass is green. The sky is blue.",
+                    },
+                    "title": "My Document",
+                    "context": "This is a trustworthy document.",
+                    "citations": {"enabled": True},
+                },
+                {
+                    "type": "text",
+                    "text": "What color is the grass and sky?",
+                },
+            ],
+        }
+    ],
+)
+
+citations = resp.choices[0].message.provider_specific_fields["citations"]
+
+assert citations is not None
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+    - model_name: anthropic-claude
+      litellm_params:
+        model: anthropic/claude-3-5-sonnet-20241022
+        api_key: os.environ/ANTHROPIC_API_KEY
+```
+
+2. Start proxy 
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+3. Test it! 
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+  "model": "anthropic-claude",
+  "messages": [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "document",
+                "source": {
+                    "type": "text",
+                    "media_type": "text/plain",
+                    "data": "The grass is green. The sky is blue."
+                },
+                "title": "My Document",
+                "context": "This is a trustworthy document.",
+                "citations": {"enabled": true}
+            },
+            {
+                "type": "text",
+                "text": "What color is the grass and sky?"
+            }
+        ]
+    }
+  ]
+}'
+```
+
+</TabItem>
+</Tabs>
+
 ## Usage - passing 'user_id' to Anthropic

 LiteLLM translates the OpenAI `user` param to Anthropic's `metadata[user_id]` param.
@ -1035,3 +1243,4 @@ curl http://0.0.0.0:4000/v1/chat/completions \

 </TabItem>
 </Tabs>
+
--- a/docs/my-website/docs/providers/bedrock.md
+++ b/docs/my-website/docs/providers/bedrock.md
@ -7,9 +7,10 @@ ALL Bedrock models (Anthropic, Meta, Deepseek, Mistral, Amazon, etc.) are Suppor
 | Property | Details |
 |-------|-------|
 | Description | Amazon Bedrock is a fully managed service that offers a choice of high-performing foundation models (FMs). |
-| Provider Route on LiteLLM | `bedrock/`, [`bedrock/converse/`](#set-converse--invoke-route), [`bedrock/invoke/`](#set-invoke-route), [`bedrock/converse_like/`](#calling-via-internal-proxy), [`bedrock/llama/`](#bedrock-imported-models-deepseek) |
+| Provider Route on LiteLLM | `bedrock/`, [`bedrock/converse/`](#set-converse--invoke-route), [`bedrock/invoke/`](#set-invoke-route), [`bedrock/converse_like/`](#calling-via-internal-proxy), [`bedrock/llama/`](#deepseek-not-r1), [`bedrock/deepseek_r1/`](#deepseek-r1) |
 | Provider Doc | [Amazon Bedrock ↗](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html) |
 | Supported OpenAI Endpoints | `/chat/completions`, `/completions`, `/embeddings`, `/images/generations` |
+| Rerank Endpoint | `/rerank` |
 | Pass-through Endpoint | [Supported](../pass_through/bedrock.md) |


@ -285,9 +286,12 @@ print(response)
 </TabItem>
 </Tabs>

-## Usage - Function Calling 
+## Usage - Function Calling / Tool calling

-LiteLLM uses Bedrock's Converse API for making tool calls
+LiteLLM supports tool calling via Bedrock's Converse and Invoke APIs.
+
+<Tabs>
+<TabItem value="sdk" label="SDK">

 ```python
 from litellm import completion
@ -332,6 +336,69 @@ assert isinstance(
    response.choices[0].message.tool_calls[0].function.arguments, str
 )
 ```
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: bedrock-claude-3-7
+    litellm_params:
+      model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 # for bedrock invoke, specify `bedrock/invoke/<model>`
+```
+
+2. Start proxy 
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer $LITELLM_API_KEY" \
+-d '{
+  "model": "bedrock-claude-3-7",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What'\''s the weather like in Boston today?"
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "location": {
+              "type": "string",
+              "description": "The city and state, e.g. San Francisco, CA"
+            },
+            "unit": {
+              "type": "string",
+              "enum": ["celsius", "fahrenheit"]
+            }
+          },
+          "required": ["location"]
+        }
+      }
+    }
+  ],
+  "tool_choice": "auto"
+}'
+
+```
+
+
+</TabItem>
+</Tabs>


 ## Usage - Vision 
@ -376,6 +443,226 @@ print(f"\nResponse: {resp}")
 ```


+## Usage - 'thinking' / 'reasoning content'
+
+This is currently only supported for Anthropic's Claude 3.7 Sonnet + Deepseek R1.
+
+Works on v1.61.20+.
+
+Returns 2 new fields in `message` and `delta` object:
+- `reasoning_content` - string - The reasoning content of the response
+- `thinking_blocks` - list of objects (Anthropic only) - The thinking blocks of the response
+
+Each object has the following fields:
+- `type` - Literal["thinking"] - The type of thinking block
+- `thinking` - string - The thinking of the response. Also returned in `reasoning_content`
+- `signature` - string - A base64 encoded string, returned by Anthropic.
+
+The `signature` is required by Anthropic on subsequent calls, if 'thinking' content is passed in (only required to use `thinking` with tool calling). [Learn more](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks)
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+import os
+
+# set env
+os.environ["AWS_ACCESS_KEY_ID"] = ""
+os.environ["AWS_SECRET_ACCESS_KEY"] = ""
+os.environ["AWS_REGION_NAME"] = ""
+
+
+resp = completion(
+    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    thinking={"type": "enabled", "budget_tokens": 1024},
+)
+
+print(resp)
+```
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: bedrock-claude-3-7
+    litellm_params:
+      model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+      thinking: {"type": "enabled", "budget_tokens": 1024} # 👈 EITHER HERE OR ON REQUEST
+```
+
+2. Start proxy 
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+# 👇 set "thinking" EITHER in the request body (below) OR in config.yaml
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
+  -d '{
+    "model": "bedrock-claude-3-7",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "thinking": {"type": "enabled", "budget_tokens": 1024}
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+
+**Expected Response**
+
+Same as [Anthropic API response](../providers/anthropic#usage---thinking--reasoning_content).
+
+```json
+{
+    "id": "chatcmpl-c661dfd7-7530-49c9-b0cc-d5018ba4727d",
+    "created": 1740640366,
+    "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+    "object": "chat.completion",
+    "system_fingerprint": null,
+    "choices": [
+        {
+            "finish_reason": "stop",
+            "index": 0,
+            "message": {
+                "content": "The capital of France is Paris. It's not only the capital city but also the largest city in France, serving as the country's major cultural, economic, and political center.",
+                "role": "assistant",
+                "tool_calls": null,
+                "function_call": null,
+                "reasoning_content": "The capital of France is Paris. This is a straightforward factual question.",
+                "thinking_blocks": [
+                    {
+                        "type": "thinking",
+                        "thinking": "The capital of France is Paris. This is a straightforward factual question.",
+                        "signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+yCHpBY7U6FQW8/FcoLewocJQPa2HnmLM+NECy50y44F/kD4SULFXi57buI9fAvyBwtyjlOiO0SDE3+r3spdg6PLOo9PBoMma2ku5OTAoR46j9VIjDRlvNmBvff7YW4WI9oU8XagaOBSxLPxElrhyuxppEn7m6bfT40dqBSTDrfiw4FYB4qEPETTI6TA6wtjGAAqmFqKTo="
+                    }
+                ]
+            }
+        }
+    ],
+    "usage": {
+        "completion_tokens": 64,
+        "prompt_tokens": 42,
+        "total_tokens": 106,
+        "completion_tokens_details": null,
+        "prompt_tokens_details": null
+    }
+}
+```
+
+
+## Usage - Structured Output / JSON mode 
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+import os 
+from pydantic import BaseModel
+
+# set env
+os.environ["AWS_ACCESS_KEY_ID"] = ""
+os.environ["AWS_SECRET_ACCESS_KEY"] = ""
+os.environ["AWS_REGION_NAME"] = ""
+
+class CalendarEvent(BaseModel):
+  name: str
+  date: str
+  participants: list[str]
+
+class EventsList(BaseModel):
+    events: list[CalendarEvent]
+
+response = completion(
+  model="bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0", # specify invoke via `bedrock/invoke/anthropic.claude-3-7-sonnet-20250219-v1:0`
+  response_format=EventsList,
+  messages=[
+    {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
+    {"role": "user", "content": "Who won the world series in 2020?"}
+  ],
+)
+print(response.choices[0].message.content)
+```
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: bedrock-claude-3-7
+    litellm_params:
+      model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 # specify invoke via `bedrock/invoke/<model_name>` 
+      aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID
+      aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY
+      aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME
+```
+
+2. Start proxy 
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it!
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $LITELLM_KEY" \
+  -d '{
+    "model": "bedrock-claude-3-7",
+    "messages": [
+      {
+        "role": "system",
+        "content": "You are a helpful assistant designed to output JSON."
+      },
+      {
+        "role": "user",
+        "content": "Who won the world series in 2020?"
+      }
+    ],
+    "response_format": {
+      "type": "json_schema",
+      "json_schema": {
+        "name": "math_reasoning",
+        "description": "reason about maths",
+        "schema": {
+          "type": "object",
+          "properties": {
+            "steps": {
+              "type": "array",
+              "items": {
+                "type": "object",
+                "properties": {
+                  "explanation": { "type": "string" },
+                  "output": { "type": "string" }
+                },
+                "required": ["explanation", "output"],
+                "additionalProperties": false
+              }
+            },
+            "final_answer": { "type": "string" }
+          },
+          "required": ["steps", "final_answer"],
+          "additionalProperties": false
+        },
+        "strict": true
+      }
+    }
+  }'
+```
+</TabItem>
+</Tabs>
+
 ## Usage - Bedrock Guardrails

 Example of using [Bedrock Guardrails with LiteLLM](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-use-converse-api.html)
@ -1277,13 +1564,83 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
 https://some-api-url/models
 ```

-## Bedrock Imported Models (Deepseek)
+## Bedrock Imported Models (Deepseek, Deepseek R1)
+
+### Deepseek R1
+
+This is a separate route, as the chat template is different.
+
+| Property | Details |
+|----------|---------|
+| Provider Route | `bedrock/deepseek_r1/{model_arn}` |
+| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Deepseek Bedrock Imported Model](https://aws.amazon.com/blogs/machine-learning/deploy-deepseek-r1-distilled-llama-models-with-amazon-bedrock-custom-model-import/) |
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+import os
+
+response = completion(
+    model="bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n",  # bedrock/deepseek_r1/{your-model-arn}
+    messages=[{"role": "user", "content": "Tell me a joke"}],
+)
+```
+
+</TabItem>
+
+<TabItem value="proxy" label="Proxy">
+
+
+**1. Add to config**
+
+```yaml
+model_list:
+    - model_name: DeepSeek-R1-Distill-Llama-70B
+      litellm_params:
+        model: bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n
+
+```
+
+**2. Start proxy**
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING at http://0.0.0.0:4000
+```
+
+**3. Test it!**
+
+```bash
+# 👇 "model" must match the 'model_name' set in your config
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+      --header 'Authorization: Bearer sk-1234' \
+      --header 'Content-Type: application/json' \
+      --data '{
+            "model": "DeepSeek-R1-Distill-Llama-70B",
+            "messages": [
+                {
+                "role": "user",
+                "content": "what llm are you"
+                }
+            ]
+        }'
+```
+
+</TabItem>
+</Tabs>
+
+
+### Deepseek (not R1)

 | Property | Details |
 |----------|---------|
 | Provider Route | `bedrock/llama/{model_arn}` |
 | Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Deepseek Bedrock Imported Model](https://aws.amazon.com/blogs/machine-learning/deploy-deepseek-r1-distilled-llama-models-with-amazon-bedrock-custom-model-import/) |

+
+
 Use this route to call Bedrock Imported Models that follow the `llama` Invoke Request / Response spec


--- a/docs/my-website/docs/providers/cerebras.md
+++ b/docs/my-website/docs/providers/cerebras.md
@ -23,14 +23,16 @@ import os

 os.environ['CEREBRAS_API_KEY'] = ""
 response = completion(
-    model="cerebras/meta/llama3-70b-instruct",
+    model="cerebras/llama3-70b-instruct",
    messages=[
        {
            "role": "user",
-            "content": "What's the weather like in Boston today in Fahrenheit?",
+            "content": "What's the weather like in Boston today in Fahrenheit? (Write in JSON)",
        }
    ],
    max_tokens=10,
+        
+    # The prompt should include JSON if 'json_object' is selected; otherwise, you will get error code 400.
    response_format={ "type": "json_object" },
    seed=123,
    stop=["\n\n"],
@ -50,15 +52,17 @@ import os

 os.environ['CEREBRAS_API_KEY'] = ""
 response = completion(
-    model="cerebras/meta/llama3-70b-instruct",
+    model="cerebras/llama3-70b-instruct",
    messages=[
        {
            "role": "user",
-            "content": "What's the weather like in Boston today in Fahrenheit?",
+            "content": "What's the weather like in Boston today in Fahrenheit? (Write in JSON)",
        }
    ],
    stream=True,
    max_tokens=10,
+
+    # The prompt should include JSON if 'json_object' is selected; otherwise, you will get error code 400.
    response_format={ "type": "json_object" }, 
    seed=123,
    stop=["\n\n"],
--- a/docs/my-website/docs/providers/cohere.md
+++ b/docs/my-website/docs/providers/cohere.md
@ -108,7 +108,7 @@ response = embedding(

 ### Usage

-
+LiteLLM supports the v1 and v2 clients for Cohere rerank. By default, the `rerank` endpoint uses the v2 client, but you can specify the v1 client by explicitly calling `v1/rerank`.

 <Tabs>
 <TabItem value="sdk" label="LiteLLM SDK Usage">
--- a/docs/my-website/docs/providers/gemini.md
+++ b/docs/my-website/docs/providers/gemini.md
@ -688,7 +688,9 @@ response = litellm.completion(
 |-----------------------|--------------------------------------------------------|--------------------------------|
 | gemini-pro            | `completion(model='gemini/gemini-pro', messages)`            | `os.environ['GEMINI_API_KEY']` |
 | gemini-1.5-pro-latest | `completion(model='gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` |
-| gemini-pro-vision     | `completion(model='gemini/gemini-pro-vision', messages)`     | `os.environ['GEMINI_API_KEY']` |
+| gemini-2.0-flash     | `completion(model='gemini/gemini-2.0-flash', messages)`     | `os.environ['GEMINI_API_KEY']` |
+| gemini-2.0-flash-exp     | `completion(model='gemini/gemini-2.0-flash-exp', messages)`     | `os.environ['GEMINI_API_KEY']` |
+| gemini-2.0-flash-lite-preview-02-05	     | `completion(model='gemini/gemini-2.0-flash-lite-preview-02-05', messages)`     | `os.environ['GEMINI_API_KEY']` |



--- a/docs/my-website/docs/providers/infinity.md
+++ b/docs/my-website/docs/providers/infinity.md
@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Infinity

 | Property | Details |
@ -12,6 +15,9 @@

 ```python
 from litellm import rerank
+import os
+
+os.environ["INFINITY_API_BASE"] = "http://localhost:8080"

 response = rerank(
    model="infinity/rerank",
@ -65,3 +71,114 @@ curl http://0.0.0.0:4000/rerank \
 ```


+## Supported Cohere Rerank API Params
+
+| Param | Type | Description |
+|-------|-------|-------|
+| `query` | `str` | The query to rerank the documents against |
+| `documents` | `list[str]` | The documents to rerank |
+| `top_n` | `int` | The number of documents to return |
+| `return_documents` | `bool` | Whether to return the documents in the response |
+
+### Usage - Return Documents
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+response = rerank(
+    model="infinity/rerank",
+    query="What is the capital of France?",
+    documents=["Paris", "London", "Berlin", "Madrid"],
+    return_documents=True,
+)
+```
+
+</TabItem>
+
+<TabItem value="proxy" label="PROXY">
+
+```bash
+curl http://0.0.0.0:4000/rerank \
+  -H "Authorization: Bearer sk-1234" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "custom-infinity-rerank",
+    "query": "What is the capital of France?",
+    "documents": [
+        "Paris",
+        "London",
+        "Berlin",
+        "Madrid"
+    ],
+    "return_documents": true
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+## Pass Provider-specific Params
+
+Any unmapped params will be passed to the provider as-is.
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import rerank
+import os
+
+os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
+
+response = rerank(
+    model="infinity/rerank",
+    query="What is the capital of France?",
+    documents=["Paris", "London", "Berlin", "Madrid"],
+    raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM
+)
+```
+</TabItem>
+
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: custom-infinity-rerank
+    litellm_params:
+      model: infinity/rerank
+      api_base: http://localhost:8080
+      raw_scores: true # 👈 EITHER SET PROVIDER-SPECIFIC PARAMS HERE OR IN REQUEST BODY
+```
+
+2. Start litellm
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+3. Test it!  
+
+```bash
+# 👇 "raw_scores" is a PROVIDER-SPECIFIC PARAM
+curl http://0.0.0.0:4000/rerank \
+  -H "Authorization: Bearer sk-1234" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "custom-infinity-rerank",
+    "query": "What is the capital of the United States?",
+    "documents": [
+        "Carson City is the capital city of the American state of Nevada.",
+        "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
+        "Washington, D.C. is the capital of the United States.",
+        "Capital punishment has existed in the United States since before it was a country."
+    ],
+    "raw_scores": true
+  }'
+```
+</TabItem>
+
+</Tabs>
--- a/docs/my-website/docs/providers/litellm_proxy.md
+++ b/docs/my-website/docs/providers/litellm_proxy.md
@ -3,13 +3,15 @@ import TabItem from '@theme/TabItem';

 # LiteLLM Proxy (LLM Gateway)

-:::tip

-[LiteLLM Providers a **self hosted** proxy server (AI Gateway)](../simple_proxy) to call all the LLMs in the OpenAI format
+| Property | Details |
+|-------|-------|
+| Description | LiteLLM Proxy is an OpenAI-compatible gateway that allows you to interact with multiple LLM providers through a unified API. Simply use the `litellm_proxy/` prefix before the model name to route your requests through the proxy. |
+| Provider Route on LiteLLM | `litellm_proxy/` (add this prefix to the model name, to route any requests to litellm_proxy - e.g. `litellm_proxy/your-model-name`) |
+| Setup LiteLLM Gateway | [LiteLLM Gateway ↗](../simple_proxy) |
+| Supported Endpoints |`/chat/completions`, `/completions`, `/embeddings`, `/audio/speech`, `/audio/transcriptions`, `/images`, `/rerank` |

-:::

-**[LiteLLM Proxy](../simple_proxy) is OpenAI compatible**, you just need the `litellm_proxy/` prefix before the model

 ## Required Variables

@ -83,7 +85,76 @@ for chunk in response:
    print(chunk)
 ```

+## Embeddings

+```python
+import litellm
+
+response = litellm.embedding(
+    model="litellm_proxy/your-embedding-model",
+    input="Hello world",
+    api_base="your-litellm-proxy-url",
+    api_key="your-litellm-proxy-api-key"
+)
+```
+
+## Image Generation
+
+```python
+import litellm
+
+response = litellm.image_generation(
+    model="litellm_proxy/dall-e-3",
+    prompt="A beautiful sunset over mountains",
+    api_base="your-litellm-proxy-url",
+    api_key="your-litellm-proxy-api-key"
+)
+```
+
+## Audio Transcription
+
+```python
+import litellm
+
+response = litellm.transcription(
+    model="litellm_proxy/whisper-1",
+    file="your-audio-file",
+    api_base="your-litellm-proxy-url",
+    api_key="your-litellm-proxy-api-key"
+)
+```
+
+## Text to Speech
+
+```python
+import litellm
+
+response = litellm.speech(
+    model="litellm_proxy/tts-1",
+    input="Hello world",
+    api_base="your-litellm-proxy-url",
+    api_key="your-litellm-proxy-api-key"
+)
+``` 
+
+## Rerank
+
+```python
+import litellm
+
+response = litellm.rerank(
+    model="litellm_proxy/rerank-english-v2.0",
+    query="What is machine learning?",
+    documents=[
+        "Machine learning is a field of study in artificial intelligence",
+        "Biology is the study of living organisms"
+    ],
+    api_base="your-litellm-proxy-url",
+    api_key="your-litellm-proxy-api-key"
+)
+```
 ## **Usage with Langchain, LLamaindex, OpenAI Js, Anthropic SDK, Instructor**

 #### [Follow this doc to see how to use litellm proxy with langchain, llamaindex, anthropic etc](../proxy/user_keys)
--- a/docs/my-website/docs/providers/perplexity.md
+++ b/docs/my-website/docs/providers/perplexity.md
@ -64,71 +64,7 @@ All models listed here https://docs.perplexity.ai/docs/model-cards are supported



-## Return citations 
-
-Perplexity supports returning citations via `return_citations=True`. [Perplexity Docs](https://docs.perplexity.ai/reference/post_chat_completions). Note: Perplexity has this feature in **closed beta**, so you need them to grant you access to get citations from their API. 
-
-If perplexity returns citations, LiteLLM will pass it straight through. 
-
 :::info

-For passing more provider-specific, [go here](../completion/provider_specific_params.md)
+For more information about passing provider-specific parameters, [go here](../completion/provider_specific_params.md)
 :::
-
-<Tabs>
-<TabItem value="sdk" label="SDK">
-
-```python
-from litellm import completion
-import os
-
-os.environ['PERPLEXITYAI_API_KEY'] = ""
-response = completion(
-    model="perplexity/mistral-7b-instruct", 
-    messages=messages,
-    return_citations=True
-)
-print(response)
-```
-
-</TabItem>
-<TabItem value="proxy" label="PROXY">
-
-1. Add perplexity to config.yaml
-
-```yaml
-model_list:
-  - model_name: "perplexity-model"
-    litellm_params:
-      model: "llama-3.1-sonar-small-128k-online"
-      api_key: os.environ/PERPLEXITY_API_KEY
-```
-
-2. Start proxy 
-
-```bash
-litellm --config /path/to/config.yaml
-```
-
-3. Test it! 
-
-```bash
-curl -L -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
-    "model": "perplexity-model",
-    "messages": [
-      {
-        "role": "user",
-        "content": "Who won the world cup in 2022?"
-      }
-    ],
-    "return_citations": true
-}'
-```
-
-[**Call w/ OpenAI SDK, Langchain, Instructor, etc.**](../proxy/user_keys.md#chatcompletions)
-
-</TabItem>
-</Tabs>
--- a/docs/my-website/docs/providers/sambanova.md
+++ b/docs/my-website/docs/providers/sambanova.md
@ -2,11 +2,11 @@ import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

 # Sambanova
-https://community.sambanova.ai/t/create-chat-completion-api/
+https://cloud.sambanova.ai/

 :::tip

-**We support ALL Sambanova models, just set `model=sambanova/<any-model-on-sambanova>` as a prefix when sending litellm requests. For the complete supported model list, visit https://sambanova.ai/technology/models **
+**We support ALL Sambanova models. Just set `model=sambanova/<any-model-on-sambanova>` as a prefix when sending litellm requests. For the complete list of supported models, see the [Sambanova docs](https://docs.sambanova.ai/cloud/docs/get-started/supported-models).**

 :::

@ -27,12 +27,11 @@ response = completion(
    messages=[
        {
            "role": "user",
-            "content": "What do you know about sambanova.ai",
+            "content": "What do you know about sambanova.ai. Give your response in json format",
        }
    ],
    max_tokens=10,
    response_format={ "type": "json_object" },
-    seed=123,
    stop=["\n\n"],
    temperature=0.2,
    top_p=0.9,
@ -54,13 +53,12 @@ response = completion(
    messages=[
        {
            "role": "user",
-            "content": "What do you know about sambanova.ai",
+            "content": "What do you know about sambanova.ai. Give your response in json format",
        }
    ],
    stream=True,
    max_tokens=10,
    response_format={ "type": "json_object" },
-    seed=123,
    stop=["\n\n"],
    temperature=0.2,
    top_p=0.9,
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@ -405,13 +405,15 @@ If this was your initial VertexAI Grounding code,

 ```python
 import vertexai
+from vertexai.generative_models import GenerativeModel, GenerationConfig, Tool, grounding
+

 vertexai.init(project=project_id, location="us-central1")

 model = GenerativeModel("gemini-1.5-flash-001")

 # Use Google Search for grounding
-tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval(disable_attributon=False))
+tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval())

 prompt = "When is the next total solar eclipse in US?"
 response = model.generate_content(
@ -852,6 +854,7 @@ litellm.vertex_location = "us-central1 # Your Location
 | claude-3-5-sonnet@20240620  | `completion('vertex_ai/claude-3-5-sonnet@20240620', messages)` |
 | claude-3-sonnet@20240229   | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
 | claude-3-haiku@20240307   | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |
+| claude-3-7-sonnet@20250219   | `completion('vertex_ai/claude-3-7-sonnet@20250219', messages)` |

 ### Usage

@ -926,6 +929,119 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 </Tabs>


+
+### Usage - `thinking` / `reasoning_content`
+
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+
+resp = completion(
+    model="vertex_ai/claude-3-7-sonnet-20250219",
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    thinking={"type": "enabled", "budget_tokens": 1024},
+)
+
+```
+
+</TabItem>
+
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+- model_name: claude-3-7-sonnet-20250219
+  litellm_params:
+    model: vertex_ai/claude-3-7-sonnet-20250219
+    vertex_ai_project: "my-test-project"
+    vertex_ai_location: "us-west-1"
+```
+
+2. Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
+  -d '{
+    "model": "claude-3-7-sonnet-20250219",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "thinking": {"type": "enabled", "budget_tokens": 1024}
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+
+**Expected Response**
+
+```python
+ModelResponse(
+    id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
+    created=1740470510,
+    model='claude-3-7-sonnet-20250219',
+    object='chat.completion',
+    system_fingerprint=None,
+    choices=[
+        Choices(
+            finish_reason='stop',
+            index=0,
+            message=Message(
+                content="The capital of France is Paris.",
+                role='assistant',
+                tool_calls=None,
+                function_call=None,
+                provider_specific_fields={
+                    'citations': None,
+                    'thinking_blocks': [
+                        {
+                            'type': 'thinking',
+                            'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
+                            'signature': 'EuYBCkQYAiJAy6...'
+                        }
+                    ]
+                }
+            ),
+            thinking_blocks=[
+                {
+                    'type': 'thinking',
+                    'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
+                    'signature': 'EuYBCkQYAiJAy6AGB...'
+                }
+            ],
+            reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
+        )
+    ],
+    usage=Usage(
+        completion_tokens=68,
+        prompt_tokens=42,
+        total_tokens=110,
+        completion_tokens_details=None,
+        prompt_tokens_details=PromptTokensDetailsWrapper(
+            audio_tokens=None,
+            cached_tokens=0,
+            text_tokens=None,
+            image_tokens=None
+        ),
+        cache_creation_input_tokens=0,
+        cache_read_input_tokens=0
+    )
+)
+```
+
+
+
 ## Llama 3 API
 
 | Model Name       | Function Call                        |
@ -1572,6 +1688,14 @@ assert isinstance(

 Pass any file supported by Vertex AI, through LiteLLM. 

+LiteLLM supports the following image and video types passed in the url:
+
+```
+Images with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
+Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
+Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4
+Base64 Encoded Local Images
+```

 <Tabs>
 <TabItem value="sdk" label="SDK">
--- a/docs/my-website/docs/providers/vllm.md
+++ b/docs/my-website/docs/providers/vllm.md
@ -157,6 +157,98 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
 </TabItem>
 </Tabs>

+## Send Video URL to VLLM
+
+Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)
+
+There are two ways to send a video url to VLLM:
+
+1. Pass the video url directly
+
+```
+{"type": "video_url", "video_url": {"url": video_url}},
+```
+
+2. Pass the video data as base64
+
+```
+{"type": "video_url", "video_url": {"url": f"data:video/mp4;base64,{video_data_base64}"}}
+```
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+
+response = completion(
+            model="hosted_vllm/qwen", # pass the vllm model name
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Summarize the following video"
+                        },
+                        {
+                            "type": "video_url",
+                            "video_url": {
+                                "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
+                            }
+                        }
+                    ]
+                }
+            ],
+            api_base="https://hosted-vllm-api.co")
+
+print(response)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+    - model_name: my-model
+      litellm_params:
+        model: hosted_vllm/qwen  # add hosted_vllm/ prefix to route as OpenAI provider
+        api_base: https://hosted-vllm-api.co      # add api base for OpenAI compatible provider
+```
+
+2. Start the proxy 
+
+```bash
+$ litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+3. Test it! 
+
+```bash
+curl -X POST http://0.0.0.0:4000/chat/completions \
+-H "Authorization: Bearer sk-1234" \
+-H "Content-Type: application/json" \
+-d '{
+    "model": "my-model",
+    "messages": [
+        {"role": "user", "content": 
+            [
+                {"type": "text", "text": "Summarize the following video"},
+                {"type": "video_url", "video_url": {"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
+            ]
+        }
+    ]
+}'
+```
+
+</TabItem>
+</Tabs>
+
+
 ## (Deprecated) for `vllm pip package` 
 ### Using - `litellm.completion`

--- a/docs/my-website/docs/providers/xai.md
+++ b/docs/my-website/docs/providers/xai.md
@ -1,13 +1,13 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-# XAI
+# xAI

 https://docs.x.ai/docs

 :::tip

-**We support ALL XAI models, just set `model=xai/<any-model-on-xai>` as a prefix when sending litellm requests**
+**We support ALL xAI models, just set `model=xai/<any-model-on-xai>` as a prefix when sending litellm requests**

 :::

--- a/docs/my-website/docs/proxy/architecture.md
+++ b/docs/my-website/docs/proxy/architecture.md
@ -36,7 +36,7 @@ import TabItem from '@theme/TabItem';
        - Virtual Key Rate Limit
        - User Rate Limit
        - Team Limit
-    - The `_PROXY_track_cost_callback` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172)
+    - The `_ProxyDBLogger` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172)

 ## Frequently Asked Questions

--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@ -2,7 +2,6 @@ import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

 # Caching 
-Cache LLM Responses

 :::note 

@ -10,14 +9,19 @@ For OpenAI/Anthropic Prompt Caching, go [here](../completion/prompt_caching.md)

 :::

-LiteLLM supports:
+Cache LLM Responses. LiteLLM's caching system stores and reuses LLM responses to save costs and reduce latency. When you make the same request twice, the cached response is returned instead of calling the LLM API again.
+
+
+
+### Supported Caches
+
 - In Memory Cache
 - Redis Cache 
 - Qdrant Semantic Cache
 - Redis Semantic Cache
 - s3 Bucket Cache 

-## Quick Start - Redis, s3 Cache, Semantic Cache
+## Quick Start
 <Tabs>

 <TabItem value="redis" label="redis cache">
@ -369,9 +373,9 @@ $ litellm --config /path/to/config.yaml
 </Tabs>


+## Usage

-
-## Using Caching - /chat/completions
+### Basic

 <Tabs>
 <TabItem value="chat_completions" label="/chat/completions">
@ -416,6 +420,239 @@ curl --location 'http://0.0.0.0:4000/embeddings' \
 </TabItem>
 </Tabs>

+### Dynamic Cache Controls
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `ttl` | *Optional(int)* | Will cache the response for the user-defined amount of time (in seconds) |
+| `s-maxage` | *Optional(int)* | Will only accept cached responses that are within user-defined range (in seconds) |
+| `no-cache` | *Optional(bool)* | Will not return a cached response, but instead call the actual endpoint. |
+| `no-store` | *Optional(bool)* | Will not cache the response |
+| `namespace` | *Optional(str)* | Will cache the response under a user-defined namespace |
+
+Each cache parameter can be controlled on a per-request basis. Here are examples for each parameter:
+
+### `ttl`
+
+Set how long (in seconds) to cache a response.
+
+<Tabs>
+<TabItem value="openai" label="OpenAI Python SDK">
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="your-api-key",
+    base_url="http://0.0.0.0:4000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[{"role": "user", "content": "Hello"}],
+    model="gpt-3.5-turbo",
+    extra_body={
+        "cache": {
+            "ttl": 300  # Cache response for 5 minutes
+        }
+    }
+)
+```
+</TabItem>
+
+<TabItem value="curl" label="curl">
+
+```shell
+curl http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "gpt-3.5-turbo",
+    "cache": {"ttl": 300},
+    "messages": [
+      {"role": "user", "content": "Hello"}
+    ]
+  }'
+```
+</TabItem>
+</Tabs>
+
+### `s-maxage`
+
+Only accept cached responses that are within the specified age (in seconds).
+
+<Tabs>
+<TabItem value="openai" label="OpenAI Python SDK">
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="your-api-key",
+    base_url="http://0.0.0.0:4000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[{"role": "user", "content": "Hello"}],
+    model="gpt-3.5-turbo",
+    extra_body={
+        "cache": {
+            "s-maxage": 600  # Only use cache if less than 10 minutes old
+        }
+    }
+)
+```
+</TabItem>
+
+<TabItem value="curl" label="curl">
+
+```shell
+curl http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "gpt-3.5-turbo",
+    "cache": {"s-maxage": 600},
+    "messages": [
+      {"role": "user", "content": "Hello"}
+    ]
+  }'
+```
+</TabItem>
+</Tabs>
+
+### `no-cache`
+Force a fresh response, bypassing the cache.
+
+<Tabs>
+<TabItem value="openai" label="OpenAI Python SDK">
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="your-api-key",
+    base_url="http://0.0.0.0:4000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[{"role": "user", "content": "Hello"}],
+    model="gpt-3.5-turbo",
+    extra_body={
+        "cache": {
+            "no-cache": True  # Skip cache check, get fresh response
+        }
+    }
+)
+```
+</TabItem>
+
+<TabItem value="curl" label="curl">
+
+```shell
+curl http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "gpt-3.5-turbo",
+    "cache": {"no-cache": true},
+    "messages": [
+      {"role": "user", "content": "Hello"}
+    ]
+  }'
+```
+</TabItem>
+</Tabs>
+
+### `no-store`
+
+Will not store the response in cache.
+
+
+<Tabs>
+<TabItem value="openai" label="OpenAI Python SDK">
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="your-api-key",
+    base_url="http://0.0.0.0:4000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[{"role": "user", "content": "Hello"}],
+    model="gpt-3.5-turbo",
+    extra_body={
+        "cache": {
+            "no-store": True  # Don't cache this response
+        }
+    }
+)
+```
+</TabItem>
+
+<TabItem value="curl" label="curl">
+
+```shell
+curl http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "gpt-3.5-turbo",
+    "cache": {"no-store": true},
+    "messages": [
+      {"role": "user", "content": "Hello"}
+    ]
+  }'
+```
+</TabItem>
+</Tabs>
+
+### `namespace`
+Store the response under a specific cache namespace.
+
+<Tabs>
+<TabItem value="openai" label="OpenAI Python SDK">
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="your-api-key",
+    base_url="http://0.0.0.0:4000"
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[{"role": "user", "content": "Hello"}],
+    model="gpt-3.5-turbo",
+    extra_body={
+        "cache": {
+            "namespace": "my-custom-namespace"  # Store in custom namespace
+        }
+    }
+)
+```
+</TabItem>
+
+<TabItem value="curl" label="curl">
+
+```shell
+curl http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "gpt-3.5-turbo",
+    "cache": {"namespace": "my-custom-namespace"},
+    "messages": [
+      {"role": "user", "content": "Hello"}
+    ]
+  }'
+```
+</TabItem>
+</Tabs>
+
+
+
 ## Set cache for proxy, but not on the actual llm api call

 Use this if you just want to enable features like rate limiting, and loadbalancing across multiple instances.
@ -501,253 +738,6 @@ litellm_settings:
                      # /chat/completions, /completions, /embeddings, /audio/transcriptions
 ```

-### **Turn on / off caching per request. **
-
-The proxy support 4 cache-controls:
-
- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds).
- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint. 
- `no-store`: *Optional(bool)* Will not cache the response. 
-
-[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)
-
-**Turn off caching**
-
-Set `no-cache=True`, this will not return a cached response
-
-<Tabs>
-<TabItem value="openai" label="OpenAI Python SDK">
-
-```python
-import os
-from openai import OpenAI
-
-client = OpenAI(
-    # This is the default and can be omitted
-    api_key=os.environ.get("OPENAI_API_KEY"),
-		base_url="http://0.0.0.0:4000"
-)
-
-chat_completion = client.chat.completions.create(
-    messages=[
-        {
-            "role": "user",
-            "content": "Say this is a test",
-        }
-    ],
-    model="gpt-3.5-turbo",
-    extra_body = {        # OpenAI python accepts extra args in extra_body
-        cache: {
-          "no-cache": True # will not return a cached response 
-      }
-    }
-)
-```
-</TabItem>
-
-<TabItem value="curl" label="curl">
-
-```shell
-curl http://localhost:4000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer sk-1234" \
-  -d '{
-    "model": "gpt-3.5-turbo",
-    "cache": {"no-cache": True},
-    "messages": [
-      {"role": "user", "content": "Say this is a test"}
-    ]
-  }'
-```
-
-</TabItem>
-
-</Tabs>
-
-**Turn on caching**
-
-By default cache is always on
-
-<Tabs>
-<TabItem value="openai" label="OpenAI Python SDK">
-
-```python
-import os
-from openai import OpenAI
-
-client = OpenAI(
-    # This is the default and can be omitted
-    api_key=os.environ.get("OPENAI_API_KEY"),
-		base_url="http://0.0.0.0:4000"
-)
-
-chat_completion = client.chat.completions.create(
-    messages=[
-        {
-            "role": "user",
-            "content": "Say this is a test",
-        }
-    ],
-    model="gpt-3.5-turbo"
-)
-```
-</TabItem>
-
-<TabItem value="curl on" label="curl">
-
-```shell
-curl http://localhost:4000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer sk-1234" \
-  -d '{
-    "model": "gpt-3.5-turbo",
-    "messages": [
-      {"role": "user", "content": "Say this is a test"}
-    ]
-  }'
-```
-
-</TabItem>
-
-</Tabs>
-
-**Set `ttl`**
-
-Set `ttl=600`, this will caches response for 10 minutes (600 seconds)
-
-<Tabs>
-<TabItem value="openai" label="OpenAI Python SDK">
-
-```python
-import os
-from openai import OpenAI
-
-client = OpenAI(
-    # This is the default and can be omitted
-    api_key=os.environ.get("OPENAI_API_KEY"),
-		base_url="http://0.0.0.0:4000"
-)
-
-chat_completion = client.chat.completions.create(
-    messages=[
-        {
-            "role": "user",
-            "content": "Say this is a test",
-        }
-    ],
-    model="gpt-3.5-turbo",
-    extra_body = {        # OpenAI python accepts extra args in extra_body
-        cache: {
-          "ttl": 600 # caches response for 10 minutes 
-      }
-    }
-)
-```
-</TabItem>
-
-<TabItem value="curl on" label="curl">
-
-```shell
-curl http://localhost:4000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer sk-1234" \
-  -d '{
-    "model": "gpt-3.5-turbo",
-    "cache": {"ttl": 600},
-    "messages": [
-      {"role": "user", "content": "Say this is a test"}
-    ]
-  }'
-```
-
-</TabItem>
-
-</Tabs>
-
-
-
-**Set `s-maxage`**
-
-Set `s-maxage`, this will only get responses cached within last 10 minutes 
-
-<Tabs>
-<TabItem value="openai" label="OpenAI Python SDK">
-
-```python
-import os
-from openai import OpenAI
-
-client = OpenAI(
-    # This is the default and can be omitted
-    api_key=os.environ.get("OPENAI_API_KEY"),
-		base_url="http://0.0.0.0:4000"
-)
-
-chat_completion = client.chat.completions.create(
-    messages=[
-        {
-            "role": "user",
-            "content": "Say this is a test",
-        }
-    ],
-    model="gpt-3.5-turbo",
-    extra_body = {        # OpenAI python accepts extra args in extra_body
-        cache: {
-          "s-maxage": 600 # only get responses cached within last 10 minutes 
-      }
-    }
-)
-```
-</TabItem>
-
-<TabItem value="curl on" label="curl">
-
-```shell
-curl http://localhost:4000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer sk-1234" \
-  -d '{
-    "model": "gpt-3.5-turbo",
-    "cache": {"s-maxage": 600},
-    "messages": [
-      {"role": "user", "content": "Say this is a test"}
-    ]
-  }'
-```
-
-</TabItem>
-
-</Tabs>
-
-
-### Turn on / off caching per Key.
-
-1. Add cache params when creating a key [full list](#turn-on--off-caching-per-key)
-
-```bash 
-curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
-    "user_id": "222",
-    "metadata": {
-        "cache": {
-            "no-cache": true
-        }
-    }
-}'
-```
-
-2. Test it! 
-
-```bash 
-curl -X POST 'http://localhost:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer <YOUR_NEW_KEY>' \
-d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "bom dia"}]}'
-```
-
 ### Deleting Cache Keys - `/cache/delete` 
 In order to delete a cache key, send a request to `/cache/delete` with the `keys` you want to delete

--- a/docs/my-website/docs/proxy/config_settings.md
+++ b/docs/my-website/docs/proxy/config_settings.md
@ -466,6 +466,9 @@ router_settings:
 | OTEL_SERVICE_NAME | Service name identifier for OpenTelemetry
 | OTEL_TRACER_NAME | Tracer name for OpenTelemetry tracing
 | PAGERDUTY_API_KEY | API key for PagerDuty Alerting
+| PHOENIX_API_KEY | API key for Arize Phoenix
+| PHOENIX_COLLECTOR_ENDPOINT | API endpoint for Arize Phoenix
+| PHOENIX_COLLECTOR_HTTP_ENDPOINT | API http endpoint for Arize Phoenix
 | POD_NAME | Pod name for the server, this will be [emitted to `datadog` logs](https://docs.litellm.ai/docs/proxy/logging#datadog) as `POD_NAME` 
 | PREDIBASE_API_BASE | Base URL for Predibase API
 | PRESIDIO_ANALYZER_API_BASE | Base URL for Presidio Analyzer service
@ -488,12 +491,12 @@ router_settings:
 | SLACK_DAILY_REPORT_FREQUENCY | Frequency of daily Slack reports (e.g., daily, weekly)
 | SLACK_WEBHOOK_URL | Webhook URL for Slack integration
 | SMTP_HOST | Hostname for the SMTP server
-| SMTP_PASSWORD | Password for SMTP authentication
+| SMTP_PASSWORD | Password for SMTP authentication (do not set if SMTP does not require auth)
 | SMTP_PORT | Port number for SMTP server
 | SMTP_SENDER_EMAIL | Email address used as the sender in SMTP transactions
 | SMTP_SENDER_LOGO | Logo used in emails sent via SMTP
 | SMTP_TLS | Flag to enable or disable TLS for SMTP connections
-| SMTP_USERNAME | Username for SMTP authentication
+| SMTP_USERNAME | Username for SMTP authentication (do not set if SMTP does not require auth)
 | SPEND_LOGS_URL | URL for retrieving spend logs
 | SSL_CERTIFICATE | Path to the SSL certificate file
 | SSL_VERIFY | Flag to enable or disable SSL certificate verification
--- a/docs/my-website/docs/proxy/db_info.md
+++ b/docs/my-website/docs/proxy/db_info.md
@ -46,18 +46,17 @@ You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/ma

 | Table Name | Description | Row Insert Frequency |
 |------------|-------------|---------------------|
-| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request** |
-| LiteLLM_ErrorLogs | Captures failed requests and errors. Stores exception details and request information. Helps with debugging and monitoring. | **Medium - on errors only** |
+| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request - Success or Failure** |
 | LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** |

-## Disable `LiteLLM_SpendLogs` & `LiteLLM_ErrorLogs`
+## Disable `LiteLLM_SpendLogs`

 You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` on the `general_settings` section of your proxy_config.yaml file.

 ```yaml
 general_settings:
  disable_spend_logs: True   # Disable writing spend logs to DB
-  disable_error_logs: True   # Disable writing error logs to DB
+  disable_error_logs: True   # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
 ```

 ### What is the impact of disabling these logs?
--- a/docs/my-website/docs/proxy/enterprise.md
+++ b/docs/my-website/docs/proxy/enterprise.md
@ -14,7 +14,7 @@ Features:
 - **Security**
    - ✅ [SSO for Admin UI](./ui.md#✨-enterprise-features)
    - ✅ [Audit Logs with retention policy](#audit-logs)
-    - ✅ [JWT-Auth](../docs/proxy/token_auth.md)
+    - ✅ [JWT-Auth](./token_auth.md)
    - ✅ [Control available public, private routes (Restrict certain endpoints on proxy)](#control-available-public-private-routes)
    - ✅ [Control available public, private routes](#control-available-public-private-routes)
    - ✅ [Secret Managers - AWS Key Manager, Google Secret Manager, Azure Key, Hashicorp Vault](../secret)
@ -40,8 +40,8 @@ Features:
 - **Control Guardrails per API Key**
 - **Custom Branding**
    - ✅ [Custom Branding + Routes on Swagger Docs](#swagger-docs---custom-routes--branding)
-    - ✅ [Public Model Hub](../docs/proxy/enterprise.md#public-model-hub)
-    - ✅ [Custom Email Branding](../docs/proxy/email.md#customizing-email-branding)
+    - ✅ [Public Model Hub](#public-model-hub)
+    - ✅ [Custom Email Branding](./email.md#customizing-email-branding)

 ## Audit Logs

--- a/docs/my-website/docs/proxy/guardrails/aim_security.md
+++ b/docs/my-website/docs/proxy/guardrails/aim_security.md
@ -37,7 +37,7 @@ guardrails:
  - guardrail_name: aim-protected-app
    litellm_params:
      guardrail: aim
-      mode: pre_call
+      mode: pre_call # 'during_call' is also available
      api_key: os.environ/AIM_API_KEY
      api_base: os.environ/AIM_API_BASE # Optional, use only when using a self-hosted Aim Outpost
 ```
--- a/docs/my-website/docs/proxy/logging.md
+++ b/docs/my-website/docs/proxy/logging.md
@ -1,3 +1,7 @@
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Logging

 Log Proxy input, output, and exceptions using:
@ -13,9 +17,7 @@ Log Proxy input, output, and exceptions using:
 - DynamoDB
 - etc.

-import Image from '@theme/IdealImage';
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
+

 ## Getting the LiteLLM Call ID

@ -77,10 +79,13 @@ litellm_settings:

 ### Redact Messages, Response Content

-Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to your logging provider, but request metadata will still be logged.
+Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to your logging provider, but request metadata (e.g. spend) will still be tracked.

+<Tabs>

-Example config.yaml
+<TabItem value="global" label="Global">
+
+**1. Setup config.yaml**
 ```yaml
 model_list:
 - model_name: gpt-3.5-turbo
@ -91,9 +96,87 @@ litellm_settings:
  turn_off_message_logging: True # 👈 Key Change
 ```

-If you have this feature turned on, you can override it for specific requests by
+**2. Send request**
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+    --header 'Content-Type: application/json' \
+    --data '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+        {
+        "role": "user",
+        "content": "what llm are you"
+        }
+    ]
+}'
+```
+
+
+
+</TabItem>
+<TabItem value="dynamic" label="Per Request">
+
+:::info
+
+Dynamic request message redaction is in BETA. 
+
+:::
+
+Pass in a request header to enable message redaction for a request.
+
+```
+x-litellm-enable-message-redaction: true
+```
+
+Example config.yaml
+
+**1. Setup config.yaml**
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+```
+
+**2. Setup per request header**
+
+```shell
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-zV5HlSIm8ihj1F9C_ZbB1g' \
+-H 'x-litellm-enable-message-redaction: true' \
+-d '{
+  "model": "gpt-3.5-turbo",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Hey, how'\''s it going 1234?"
+    }
+  ]
+}'
+```
+
+</TabItem>
+</Tabs>
+
+**3. Check Logging Tool + Spend Logs**
+
+**Logging Tool**
+
+<Image img={require('../../img/message_redaction_logging.png')}/>
+
+**Spend Logs**
+
+<Image img={require('../../img/message_redaction_spend_logs.png')} />
+
+
+### Disable Message Redaction
+
+If you have `litellm.turn_off_message_logging` turned on, you can override it for specific requests by
 setting a request header `LiteLLM-Disable-Message-Redaction: true`.

+
 ```shell
 curl --location 'http://0.0.0.0:4000/chat/completions' \
    --header 'Content-Type: application/json' \
@ -109,8 +192,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 }'
 ```

-Removes any field with `user_api_key_*` from metadata.
-

 ### Turn off all tracking/logging

--- a/docs/my-website/docs/proxy/logging_spec.md
+++ b/docs/my-website/docs/proxy/logging_spec.md
@ -78,6 +78,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
 | `api_base` | `Optional[str]` | Optional API base URL |
 | `response_cost` | `Optional[str]` | Optional response cost |
 | `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
+| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |

 ## StandardLoggingModelInformation

--- a/docs/my-website/docs/proxy/master_key_rotations.md
+++ b/docs/my-website/docs/proxy/master_key_rotations.md
@ -0,0 +1,53 @@
+# Rotating Master Key
+
+Here are our recommended steps for rotating your master key.
+
+
+**1. Backup your DB**
+In case of any errors during the encryption/decryption process, this will allow you to revert to the current state without issues.
+
+**2. Call `/key/regenerate` with the new master key**
+
+```bash
+curl -L -X POST 'http://localhost:4000/key/regenerate' \
+-H 'Authorization: Bearer sk-1234' \
+-H 'Content-Type: application/json' \
+-d '{
+  "key": "sk-1234",
+  "new_master_key": "sk-PIp1h0RekR"
+}'
+```
+
+This will re-encrypt any models in your Proxy_ModelTable with the new master key.
+
+Expect to start seeing decryption errors in logs, as your old master key is no longer able to decrypt the new values.
+
+```bash
+   raise Exception("Unable to decrypt value={}".format(v))
+Exception: Unable to decrypt value=<new-encrypted-value>
+```
+
+**3. Update LITELLM_MASTER_KEY**
+
+In your environment variables update the value of LITELLM_MASTER_KEY to the new_master_key from Step 2.
+
+This ensures the key used for decryption from db is the new key.
+
+**4. Test it**
+
+Make a test request to a model stored on proxy with a litellm key (new master key or virtual key) and see if it works. Replace `gpt-4o-mini` below with the 'public model name' of any db-model.
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-PIp1h0RekR' \
+-d '{
+    "model": "gpt-4o-mini",
+    "messages": [
+        {
+            "content": "Hey, how'\''s it going",
+            "role": "user"
+        }
+    ]
+}'
+```
--- a/docs/my-website/docs/proxy/prod.md
+++ b/docs/my-website/docs/proxy/prod.md
@ -107,9 +107,9 @@ general_settings:

 By default, LiteLLM writes several types of logs to the database:
 - Every LLM API request to the `LiteLLM_SpendLogs` table
- LLM Exceptions to the `LiteLLM_LogsErrors` table
+- LLM Exceptions to the `LiteLLM_SpendLogs` table

-If you're not viewing these logs on the LiteLLM UI (most users use Prometheus for monitoring), you can disable them by setting the following flags to `True`:
+If you're not viewing these logs on the LiteLLM UI, you can disable them by setting the following flags to `True`:

 ```yaml
 general_settings:
--- a/docs/my-website/docs/proxy/release_cycle.md
+++ b/docs/my-website/docs/proxy/release_cycle.md
@ -0,0 +1,12 @@
+# Release Cycle
+
+LiteLLM Proxy has the following release cycle:
+
+- `v1.x.x-nightly`: These are releases which pass ci/cd. 
+- `v1.x.x.rc`: These are releases which pass ci/cd + [manual review](https://github.com/BerriAI/litellm/discussions/8495#discussioncomment-12180711).
+- `v1.x.x` OR `v1.x.x-stable`: These are releases which pass ci/cd + manual review + 3 days of production testing.
+
+In production, we recommend using the latest `v1.x.x` release.
+
+
+Follow our release notes [here](https://github.com/BerriAI/litellm/releases).
--- a/docs/my-website/docs/proxy/request_headers.md
+++ b/docs/my-website/docs/proxy/request_headers.md
@ -6,7 +6,18 @@ Special headers that are supported by LiteLLM.

 `x-litellm-timeout` Optional[float]: The timeout for the request in seconds.

+`x-litellm-enable-message-redaction` Optional[bool]: Don't log the message content to logging integrations. Just track spend. [Learn More](./logging#redact-messages-response-content)
+
+`x-litellm-tags` Optional[str]: A comma-separated list (e.g. `tag1,tag2,tag3`) of tags to use for [tag-based routing](./tag_routing) **OR** [spend-tracking](./enterprise.md#tracking-spend-for-custom-tags).
+
 ## Anthropic Headers

 `anthropic-version` Optional[str]: The version of the Anthropic API to use.  
 `anthropic-beta` Optional[str]: The beta version of the Anthropic API to use.
+
+## OpenAI Headers
+
+`openai-organization` Optional[str]: The organization to use for the OpenAI API. (currently needs to be enabled via `general_settings::forward_openai_org_id: true`)
+
+
+
--- a/docs/my-website/docs/proxy/response_headers.md
+++ b/docs/my-website/docs/proxy/response_headers.md
@ -1,17 +1,20 @@
-# Rate Limit Headers
+# Response Headers

-When you make a request to the proxy, the proxy will return the following [OpenAI-compatible headers](https://platform.openai.com/docs/guides/rate-limits/rate-limits-in-headers):
+When you make a request to the proxy, the proxy will return the following headers:

- `x-ratelimit-remaining-requests` - Optional[int]: The remaining number of requests that are permitted before exhausting the rate limit.
- `x-ratelimit-remaining-tokens` - Optional[int]: The remaining number of tokens that are permitted before exhausting the rate limit.
- `x-ratelimit-limit-requests` - Optional[int]: The maximum number of requests that are permitted before exhausting the rate limit.
- `x-ratelimit-limit-tokens` - Optional[int]: The maximum number of tokens that are permitted before exhausting the rate limit.
- `x-ratelimit-reset-requests` - Optional[int]: The time at which the rate limit will reset.    
- `x-ratelimit-reset-tokens` - Optional[int]: The time at which the rate limit will reset.
+## Rate Limit Headers
+[OpenAI-compatible headers](https://platform.openai.com/docs/guides/rate-limits/rate-limits-in-headers):

-These headers are useful for clients to understand the current rate limit status and adjust their request rate accordingly.
+| Header | Type | Description |
+|--------|------|-------------|
+| `x-ratelimit-remaining-requests` | Optional[int] | The remaining number of requests that are permitted before exhausting the rate limit |
+| `x-ratelimit-remaining-tokens` | Optional[int] | The remaining number of tokens that are permitted before exhausting the rate limit |
+| `x-ratelimit-limit-requests` | Optional[int] | The maximum number of requests that are permitted before exhausting the rate limit |
+| `x-ratelimit-limit-tokens` | Optional[int] | The maximum number of tokens that are permitted before exhausting the rate limit |
+| `x-ratelimit-reset-requests` | Optional[int] | The time at which the request rate limit will reset |
+| `x-ratelimit-reset-tokens` | Optional[int] | The time at which the token rate limit will reset |

-## How are these headers calculated?
+### How Rate Limit Headers work

 **If key has rate limits set**

@ -19,6 +22,50 @@ The proxy will return the [remaining rate limits for that key](https://github.co

 **If key does not have rate limits set**

-The proxy returns the remaining requests/tokens returned by the backend provider. 
+The proxy returns the remaining requests/tokens returned by the backend provider. (LiteLLM will standardize the backend provider's response headers to match the OpenAI format)

 If the backend provider does not return these headers, the value will be `None`.
+
+These headers are useful for clients to understand the current rate limit status and adjust their request rate accordingly.
+
+
+## Latency Headers
+| Header | Type | Description |
+|--------|------|-------------|
+| `x-litellm-response-duration-ms` | float | Total duration of the API response in milliseconds |
+| `x-litellm-overhead-duration-ms` | float | LiteLLM processing overhead in milliseconds |
+
+## Retry, Fallback Headers
+| Header | Type | Description |
+|--------|------|-------------|
+| `x-litellm-attempted-retries` | int | Number of retry attempts made |
+| `x-litellm-attempted-fallbacks` | int | Number of fallback attempts made |
+| `x-litellm-max-fallbacks` | int | Maximum number of fallback attempts allowed |
+
+## Cost Tracking Headers
+| Header | Type | Description |
+|--------|------|-------------|
+| `x-litellm-response-cost` | float | Cost of the API call |
+| `x-litellm-key-spend` | float | Total spend for the API key |
+
+## LiteLLM Specific Headers
+| Header | Type | Description |
+|--------|------|-------------|
+| `x-litellm-call-id` | string | Unique identifier for the API call |
+| `x-litellm-model-id` | string | Unique identifier for the model used |
+| `x-litellm-model-api-base` | string | Base URL of the API endpoint |
+| `x-litellm-version` | string | Version of LiteLLM being used |
+| `x-litellm-model-group` | string | Model group identifier |
+
+## Response headers from LLM providers
+
+LiteLLM also returns the original response headers from the LLM provider. These headers are prefixed with `llm_provider-` to distinguish them from LiteLLM's headers.
+
+Example response headers:
+```
+llm_provider-openai-processing-ms: 256
+llm_provider-openai-version: 2020-10-01
+llm_provider-x-ratelimit-limit-requests: 30000
+llm_provider-x-ratelimit-limit-tokens: 150000000
+```
+
--- a/docs/my-website/docs/proxy/tag_routing.md
+++ b/docs/my-website/docs/proxy/tag_routing.md
@ -143,6 +143,26 @@ Response
 }
 ```

+## Calling via Request Header
+
+You can also call via request header `x-litellm-tags`
+
+```shell
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-H 'x-litellm-tags: free,my-custom-tag' \
+-d '{
+  "model": "gpt-4",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Hey, how'\''s it going 123456?"
+    }
+  ]
+}'
+```
+
 ## Setting Default Tags 

 Use this if you want all untagged requests to be routed to specific deployments
--- a/docs/my-website/docs/proxy/timeout.md
+++ b/docs/my-website/docs/proxy/timeout.md
@ -166,7 +166,7 @@ response = client.chat.completions.create(
        {"role": "user", "content": "what color is red"}
    ],
    logit_bias={12481: 100},
-    timeout=1
+    extra_body={"timeout": 1} # 👈 KEY CHANGE
 )

 print(response)
--- a/docs/my-website/docs/proxy/token_auth.md
+++ b/docs/my-website/docs/proxy/token_auth.md
@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem';

 # OIDC - JWT-based Auth 

-Use JWT's to auth admins / projects into the proxy.
+Use JWTs to auth admins / users / projects into the proxy.

 :::info

@ -156,27 +156,6 @@ scope: ["litellm-proxy-admin",...]
 scope: "litellm-proxy-admin ..."
 ```

-## Control Model Access with Roles
-
-Reject a JWT token if it's valid but doesn't have the required scopes / fields.
-
-Only tokens which with valid Admin (`admin_jwt_scope`), User (`user_id_jwt_field`), Team (`team_id_jwt_field`) are allowed.
-
-```yaml
-general_settings:
-  enable_jwt_auth: True 
-  litellm_jwtauth:
-    user_roles_jwt_field: "resource_access.litellm-test-client-id.roles"
-    user_allowed_roles: ["basic_user"] # roles that map to an 'internal_user' role on LiteLLM 
-    enforce_rbac: true # if true, will check if the user has the correct role to access the model + endpoint
-  
-  role_permissions: # control what models + endpointsare allowed for each role
-    - role: internal_user
-      models: ["anthropic-claude"]
-```
-
-**[Architecture Diagram (Control Model Access)](./jwt_auth_arch)**
-
 ## Control model access with Teams


@ -184,10 +163,12 @@ general_settings:

 ```yaml
 general_settings:
-  master_key: sk-1234
+  enable_jwt_auth: True
  litellm_jwtauth:
    user_id_jwt_field: "sub"
    team_ids_jwt_field: "groups" 
+    user_id_upsert: true # add user_id to the db if they don't exist
+    enforce_team_based_model_access: true # don't allow users to access models unless the team has access
 ```

 This is assuming your token looks like this:
@ -226,6 +207,64 @@ OIDC Auth for API: [**See Walkthrough**](https://www.loom.com/share/00fe2deab59a
 - If all checks pass, allow the request


+## Advanced - Custom Validate
+
+Validate a JWT Token using custom logic, if you need an extra way to verify if tokens are valid for LiteLLM Proxy.
+
+### 1. Setup custom validate function
+
+```python
+from typing import Literal
+
+def my_custom_validate(token: dict) -> Literal[True]:
+  """
+  Only allow tokens with tenant-id == "my-unique-tenant", and claims == ["proxy-admin"]
+  """
+  allowed_tenants = ["my-unique-tenant"]
+  allowed_claims = ["proxy-admin"]
+
+  if token["tenant_id"] not in allowed_tenants:
+    raise Exception("Invalid JWT token")
+  if not any(claim in allowed_claims for claim in token["claims"]):
+    raise Exception("Invalid JWT token")
+  return True
+```
+
+### 2. Setup config.yaml
+
+```yaml
+general_settings:
+  master_key: sk-1234
+  enable_jwt_auth: True
+  litellm_jwtauth:
+    user_id_jwt_field: "sub"
+    team_id_jwt_field: "tenant_id"
+    user_id_upsert: True
+    custom_validate: custom_validate.my_custom_validate # 👈 custom validate function
+```
+
+### 3. Test the flow
+
+**Expected JWT**
+
+```
+{
+  "sub": "my-unique-user",
+  "tenant_id": "INVALID_TENANT",
+  "claims": ["proxy-admin"]
+}
+```
+
+**Expected Response**
+
+```
+{
+  "error": "Invalid JWT token"
+}
+```
+
+
+
 ## Advanced - Allowed Routes 

 Configure which routes a JWT can access via the config.
@ -331,3 +370,128 @@ general_settings:
    user_allowed_email_domain: "my-co.com" # allows user@my-co.com to call proxy
    user_id_upsert: true # 👈 upserts the user to db, if valid email but not in db
 ```
+
+## [BETA] Control Access with OIDC Roles
+
+Allow JWT tokens with supported roles to access the proxy.
+
+Let users and teams access the proxy, without needing to add them to the DB.
+
+
+Very important, set `enforce_rbac: true` to ensure that the RBAC system is enabled.
+
+**Note:** This is in beta and might change unexpectedly.
+
+```yaml
+general_settings:
+  enable_jwt_auth: True
+  litellm_jwtauth:
+    object_id_jwt_field: "oid" # can be either user / team, inferred from the role mapping
+    roles_jwt_field: "roles"
+    role_mappings:
+      - role: litellm.api.consumer
+        internal_role: "team"
+    enforce_rbac: true # 👈 VERY IMPORTANT
+
+  role_permissions: # default model + endpoint permissions for a role. 
+    - role: team
+      models: ["anthropic-claude"]
+      routes: ["/v1/chat/completions"]
+
+environment_variables:
+  JWT_AUDIENCE: "api://LiteLLM_Proxy" # ensures audience is validated
+```
+
+- `object_id_jwt_field`: The field in the JWT token that contains the object id. This id can be either a user id or a team id. Use this instead of `user_id_jwt_field` and `team_id_jwt_field` when the same field can hold either kind of id.
+
+- `roles_jwt_field`: The field in the JWT token that contains the roles. This field is a list of roles that the user has. To index into a nested field, use dot notation - eg. `resource_access.litellm-test-client-id.roles`.
+
+- `role_mappings`: A list of role mappings. Map the received role in the JWT token to an internal role on LiteLLM.
+
+- `JWT_AUDIENCE`: The audience of the JWT token. This is used to validate the audience of the JWT token. Set via an environment variable.
+
+### Example Token 
+
+```bash
+{
+  "aud": "api://LiteLLM_Proxy",
+  "oid": "eec236bd-0135-4b28-9354-8fc4032d543e",
+  "roles": ["litellm.api.consumer"] 
+}
+```
+
+### Role Mapping Spec 
+
+- `role`: The expected role in the JWT token. 
+- `internal_role`: The internal role on LiteLLM that will be used to control access. 
+
+Supported internal roles:
+- `team`: Team object will be used for RBAC spend tracking. Use this for tracking spend for a 'use case'. 
+- `internal_user`: User object will be used for RBAC spend tracking. Use this for tracking spend for an 'individual user'.
+- `proxy_admin`: Proxy admin will be used for RBAC spend tracking. Use this for granting admin access to a token.
+
+### [Architecture Diagram (Control Model Access)](./jwt_auth_arch)
+
+## [BETA] Control Model Access with Scopes
+
+Control which models a JWT can access. Set `enforce_scope_based_access: true` to enforce scope-based access control.
+
+### 1. Setup config.yaml with scope mappings.
+
+
+```yaml
+model_list:
+  - model_name: anthropic-claude
+    litellm_params:
+      model: anthropic/claude-3-5-sonnet
+      api_key: os.environ/ANTHROPIC_API_KEY
+  - model_name: gpt-3.5-turbo-testing
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_key: os.environ/OPENAI_API_KEY
+
+general_settings:
+  enable_jwt_auth: True
+  litellm_jwtauth:
+    team_id_jwt_field: "client_id" # 👈 set the field in the JWT token that contains the team id
+    team_id_upsert: true # 👈 upsert the team to db, if team id is not found in db
+    scope_mappings:
+      - scope: litellm.api.consumer
+        models: ["anthropic-claude"]
+      - scope: litellm.api.gpt_3_5_turbo
+        models: ["gpt-3.5-turbo-testing"]
+    enforce_scope_based_access: true # 👈 enforce scope-based access control
+    enforce_rbac: true # 👈 enforces only a Team/User/ProxyAdmin can access the proxy.
+```
+
+#### Scope Mapping Spec 
+
+- `scope`: The scope to be used for the JWT token.
+- `models`: The models that the JWT token can access. Value is the `model_name` in `model_list`. Note: Wildcard routes are not currently supported.
+
+### 2. Create a JWT with the correct scopes.
+
+Expected Token:
+
+```bash
+{
+  "scope": ["litellm.api.consumer", "litellm.api.gpt_3_5_turbo"] # can be a list or a space-separated string
+}
+```
+
+### 3. Test the flow.
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer eyJhbGci...' \
+-d '{
+  "model": "gpt-3.5-turbo-testing",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Hey, how'\''s it going 1234?"
+    }
+  ]
+}'
+```
--- a/docs/my-website/docs/proxy/user_management_heirarchy.md
+++ b/docs/my-website/docs/proxy/user_management_heirarchy.md
@ -1,11 +1,11 @@
 import Image from '@theme/IdealImage';


-# User Management Heirarchy
+# User Management Hierarchy

 <Image img={require('../../img/litellm_user_heirarchy.png')} style={{ width: '100%', maxWidth: '4000px' }} />

-LiteLLM supports a heirarchy of users, teams, organizations, and budgets.
+LiteLLM supports a hierarchy of users, teams, organizations, and budgets.

 - Organizations can have multiple teams. [API Reference](https://litellm-api.up.railway.app/#/organization%20management)
 - Teams can have multiple users. [API Reference](https://litellm-api.up.railway.app/#/team%20management)
--- a/docs/my-website/docs/reasoning_content.md
+++ b/docs/my-website/docs/reasoning_content.md
@ -0,0 +1,357 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# 'Thinking' / 'Reasoning Content'
+
+Supported Providers:
+- Deepseek (`deepseek/`)
+- Anthropic API (`anthropic/`)
+- Bedrock (Anthropic + Deepseek) (`bedrock/`)
+- Vertex AI (Anthropic) (`vertex_ai/`)
+
+```python
+"message": {
+    ...
+    "reasoning_content": "The capital of France is Paris.",
+    "thinking_blocks": [
+        {
+            "type": "thinking",
+            "thinking": "The capital of France is Paris.",
+            "signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..."
+        }
+    ]
+}
+```
+
+## Quick Start 
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+import os 
+
+os.environ["ANTHROPIC_API_KEY"] = ""
+
+response = completion(
+  model="anthropic/claude-3-7-sonnet-20250219",
+  messages=[
+    {"role": "user", "content": "What is the capital of France?"},
+  ],
+  thinking={"type": "enabled", "budget_tokens": 1024} # 👈 REQUIRED FOR ANTHROPIC models (on `anthropic/`, `bedrock/`, `vertex_ai/`)
+)
+print(response.choices[0].message.content)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $LITELLM_KEY" \
+  -d '{
+    "model": "anthropic/claude-3-7-sonnet-20250219",
+    "messages": [
+      {
+        "role": "user",
+        "content": "What is the capital of France?"
+      }
+    ],
+    "thinking": {"type": "enabled", "budget_tokens": 1024}
+}'
+```
+</TabItem>
+</Tabs>
+
+**Expected Response**
+
+```bash
+{
+    "id": "3b66124d79a708e10c603496b363574c",
+    "choices": [
+        {
+            "finish_reason": "stop",
+            "index": 0,
+            "message": {
+                "content": "The capital of France is Paris.",
+                "role": "assistant",
+                "tool_calls": null,
+                "function_call": null
+            }
+        }
+    ],
+    "created": 1723323084,
+    "model": "deepseek/deepseek-chat",
+    "object": "chat.completion",
+    "system_fingerprint": "fp_7e0991cad4",
+    "usage": {
+        "completion_tokens": 12,
+        "prompt_tokens": 16,
+        "total_tokens": 28
+    },
+    "service_tier": null
+}
+```
+
+## Tool Calling with `thinking`
+
+Here's how to use `thinking` blocks by Anthropic with tool calling.
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+litellm._turn_on_debug()
+litellm.modify_params = True
+model = "anthropic/claude-3-7-sonnet-20250219" # works across Anthropic, Bedrock, Vertex AI
+# Step 1: send the conversation and available functions to the model
+messages = [
+    {
+        "role": "user",
+        "content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses",
+    }
+]
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["location"],
+            },
+        },
+    }
+]
+response = litellm.completion(
+    model=model,
+    messages=messages,
+    tools=tools,
+    tool_choice="auto",  # auto is default, but we'll be explicit
+    thinking={"type": "enabled", "budget_tokens": 1024},
+)
+print("Response\n", response)
+response_message = response.choices[0].message
+tool_calls = response_message.tool_calls
+
+print("Expecting there to be 3 tool calls")
+assert (
+    len(tool_calls) > 0
+)  # this has to call the function for SF, Tokyo and paris
+
+# Step 2: check if the model wanted to call a function
+print(f"tool_calls: {tool_calls}")
+if tool_calls:
+    # Step 3: call the function
+    # Note: the JSON response may not always be valid; be sure to handle errors
+    available_functions = {
+        "get_current_weather": get_current_weather,
+    }  # only one function in this example, but you can have multiple
+    messages.append(
+        response_message
+    )  # extend conversation with assistant's reply
+    print("Response message\n", response_message)
+    # Step 4: send the info for each function call and function response to the model
+    for tool_call in tool_calls:
+        function_name = tool_call.function.name
+        if function_name not in available_functions:
+            # the model called a function that does not exist in available_functions - don't try calling anything
+            return
+        function_to_call = available_functions[function_name]
+        function_args = json.loads(tool_call.function.arguments)
+        function_response = function_to_call(
+            location=function_args.get("location"),
+            unit=function_args.get("unit"),
+        )
+        messages.append(
+            {
+                "tool_call_id": tool_call.id,
+                "role": "tool",
+                "name": function_name,
+                "content": function_response,
+            }
+        )  # extend conversation with function response
+    print(f"messages: {messages}")
+    second_response = litellm.completion(
+        model=model,
+        messages=messages,
+        seed=22,
+        # tools=tools,
+        drop_params=True,
+        thinking={"type": "enabled", "budget_tokens": 1024},
+    )  # get a new response from the model where it can see the function response
+    print("second response\n", second_response)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: claude-3-7-sonnet-thinking
+    litellm_params:
+      model: anthropic/claude-3-7-sonnet-20250219
+      api_key: os.environ/ANTHROPIC_API_KEY
+      thinking: {
+        "type": "enabled",
+        "budget_tokens": 1024
+      }
+```
+
+2. Run proxy
+
+```bash
+litellm --config config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+3. Make 1st call
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $LITELLM_KEY" \
+  -d '{
+    "model": "claude-3-7-sonnet-thinking",
+    "messages": [
+      {"role": "user", "content": "What'\''s the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"}
+    ],
+    "tools": [
+        {
+          "type": "function",
+          "function": {
+              "name": "get_current_weather",
+              "description": "Get the current weather in a given location",
+              "parameters": {
+                  "type": "object",
+                  "properties": {
+                      "location": {
+                          "type": "string",
+                          "description": "The city and state"
+                      },
+                      "unit": {
+                          "type": "string",
+                          "enum": ["celsius", "fahrenheit"]
+                      }
+                  },
+                  "required": ["location"]
+              }
+          }
+        }
+    ],
+    "tool_choice": "auto"
+  }'
+```
+
+4. Make 2nd call with tool call results
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $LITELLM_KEY" \
+  -d '{
+    "model": "claude-3-7-sonnet-thinking",
+    "messages": [
+      {
+        "role": "user",
+        "content": "What'\''s the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"
+      },
+      {
+        "role": "assistant",
+        "content": "I'\''ll check the current weather for these three cities for you:",
+        "tool_calls": [
+          {
+            "index": 2,
+            "function": {
+              "arguments": "{\"location\": \"San Francisco\"}",
+              "name": "get_current_weather"
+            },
+            "id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
+            "type": "function"
+          }
+        ],
+        "function_call": null,
+        "reasoning_content": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn'\''t specified which unit they prefer (celsius or fahrenheit), so I'\''ll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'\''ll compile the results into a response with three distinct weather reports as requested by the user.",
+        "thinking_blocks": [
+          {
+            "type": "thinking",
+            "thinking": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn'\''t specified which unit they prefer (celsius or fahrenheit), so I'\''ll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'\''ll compile the results into a response with three distinct weather reports as requested by the user.",
+            "signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c="
+          }
+        ],
+        "provider_specific_fields": {
+          "reasoningContentBlocks": [
+            {
+              "reasoningText": {
+                "signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c=",
+                "text": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn'\''t specified which unit they prefer (celsius or fahrenheit), so I'\''ll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'\''ll compile the results into a response with three distinct weather reports as requested by the user."
+              }
+            }
+          ]
+        }
+      },
+      {
+        "tool_call_id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
+        "role": "tool",
+        "name": "get_current_weather",
+        "content": "{\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": \"fahrenheit\"}"
+      }
+    ]
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+## Switching between Anthropic + Deepseek models 
+
+Set `drop_params=True` to drop the 'thinking' blocks when swapping from Anthropic to Deepseek models. Suggest improvements to this approach [here](https://github.com/BerriAI/litellm/discussions/8927).
+
+```python
+litellm.drop_params = True # 👈 EITHER GLOBALLY or per request
+
+# or per request
+## Anthropic
+response = litellm.completion(
+  model="anthropic/claude-3-7-sonnet-20250219",
+  messages=[{"role": "user", "content": "What is the capital of France?"}],
+  thinking={"type": "enabled", "budget_tokens": 1024},
+  drop_params=True,
+)
+
+## Deepseek
+response = litellm.completion(
+  model="deepseek/deepseek-chat",
+  messages=[{"role": "user", "content": "What is the capital of France?"}],
+  thinking={"type": "enabled", "budget_tokens": 1024},
+  drop_params=True,
+)
+```
+
+## Spec 
+
+
+These fields can be accessed via `response.choices[0].message.reasoning_content` and `response.choices[0].message.thinking_blocks`.
+
+- `reasoning_content` - str: The reasoning content from the model. Returned across all providers.
+- `thinking_blocks` - Optional[List[Dict[str, str]]]: A list of thinking blocks from the model. Only returned for Anthropic models.
+  - `type` - str: The type of thinking block.
+  - `thinking` - str: The thinking from the model.
+  - `signature` - str: The signature delta from the model.
+
--- a/docs/my-website/docs/rerank.md
+++ b/docs/my-website/docs/rerank.md
@ -111,7 +111,7 @@ curl http://0.0.0.0:4000/rerank \

 | Provider    | Link to Usage      |
 |-------------|--------------------|
-| Cohere      |   [Usage](#quick-start)                 |
+| Cohere (v1 + v2 clients)      |   [Usage](#quick-start)                 |
 | Together AI|   [Usage](../docs/providers/togetherai)                 |  
 | Azure AI|   [Usage](../docs/providers/azure_ai)                 |  
 | Jina AI|   [Usage](../docs/providers/jina_ai)                 |  
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@ -826,6 +826,65 @@ asyncio.run(router_acompletion())

 ## Basic Reliability

+### Weighted Deployments 
+
+Set `weight` on a deployment to pick one deployment more often than others. 
+
+This works across **ALL** routing strategies. 
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import Router 
+
+model_list = [
+	{
+		"model_name": "o1",
+		"litellm_params": {
+			"model": "o1-preview", 
+			"api_key": os.getenv("OPENAI_API_KEY"), 
+			"weight": 1
+		},
+	},
+	{
+		"model_name": "o1",
+		"litellm_params": {
+			"model": "o1-preview", 
+			"api_key": os.getenv("OPENAI_API_KEY"), 
+			"weight": 2 # 👈 PICK THIS DEPLOYMENT 2x MORE OFTEN THAN THE DEPLOYMENT ABOVE
+		},
+	},
+]
+
+router = Router(model_list=model_list, routing_strategy="cost-based-routing")
+
+response = await router.acompletion(
+	model="o1", 
+	messages=[{"role": "user", "content": "Hey, how's it going?"}]
+)
+print(response)
+```
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+```yaml
+model_list:
+  - model_name: o1
+    litellm_params:
+      model: o1
+      api_key: os.environ/OPENAI_API_KEY
+      weight: 1
+  - model_name: o1
+    litellm_params:
+      model: o1-preview
+      api_key: os.environ/OPENAI_API_KEY
+      weight: 2 # 👈 PICK THIS DEPLOYMENT 2x MORE OFTEN THAN o1
+```
+
+</TabItem>
+</Tabs>
+
 ### Max Parallel Requests (ASYNC)

 Used in semaphore for async requests on router. Limit the max concurrent calls made to a deployment. Useful in high-traffic scenarios. 
@ -893,8 +952,8 @@ router_settings:
 ```

 Defaults:
- allowed_fails: 0
- cooldown_time: 60s 
+- allowed_fails: 3
+- cooldown_time: 5s (`DEFAULT_COOLDOWN_TIME_SECONDS` in constants.py)

 **Set Per Model**

--- a/docs/my-website/docs/secret.md
+++ b/docs/my-website/docs/secret.md
@ -96,6 +96,33 @@ litellm --config /path/to/config.yaml
 ```


+### Using K/V pairs in 1 AWS Secret
+
+You can read multiple keys from a single AWS Secret using the `primary_secret_name` parameter:
+
+```yaml
+general_settings:
+  key_management_system: "aws_secret_manager"
+  key_management_settings:
+    hosted_keys: [
+      "OPENAI_API_KEY_MODEL_1",
+      "OPENAI_API_KEY_MODEL_2",
+    ]
+    primary_secret_name: "litellm_secrets" # 👈 Read multiple keys from one JSON secret
+```
+
+The `primary_secret_name` allows you to read multiple keys from a single AWS Secret as a JSON object. For example, the "litellm_secrets" would contain:
+
+```json
+{
+  "OPENAI_API_KEY_MODEL_1": "sk-key1...",
+  "OPENAI_API_KEY_MODEL_2": "sk-key2..."
+}
+```
+
+This reduces the number of AWS Secrets you need to manage.
+
+
 ## Hashicorp Vault


@ -353,4 +380,7 @@ general_settings:
    
    # Hosted Keys Settings
    hosted_keys: ["litellm_master_key"] # OPTIONAL. Specify which env keys you stored on AWS
+
+    # K/V pairs in 1 AWS Secret Settings
+    primary_secret_name: "litellm_secrets" # OPTIONAL. Read multiple keys from one JSON secret on AWS Secret Manager
 ```
--- a/docs/my-website/docs/set_keys.md
+++ b/docs/my-website/docs/set_keys.md
@ -30,6 +30,7 @@ import os
 # Set OpenAI API key
 os.environ["OPENAI_API_KEY"] = "Your API Key"
 os.environ["ANTHROPIC_API_KEY"] = "Your API Key"
+os.environ["XAI_API_KEY"] = "Your API Key"
 os.environ["REPLICATE_API_KEY"] = "Your API Key"
 os.environ["TOGETHERAI_API_KEY"] = "Your API Key"
 ```
--- a/docs/my-website/docs/tutorials/litellm_proxy_aporia.md
+++ b/docs/my-website/docs/tutorials/litellm_proxy_aporia.md
@ -2,9 +2,9 @@ import Image from '@theme/IdealImage';
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-# Use LiteLLM AI Gateway with Aporia Guardrails
+# Aporia Guardrails with LiteLLM Gateway

-In this tutorial we will use LiteLLM Proxy with Aporia to detect PII in requests and profanity in responses
+In this tutorial we will use LiteLLM AI Gateway with Aporia to detect PII in requests and profanity in responses

 ## 1. Setup guardrails on Aporia

--- a/docs/my-website/docs/tutorials/openweb_ui.md
+++ b/docs/my-website/docs/tutorials/openweb_ui.md
@ -0,0 +1,103 @@
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# OpenWeb UI with LiteLLM
+
+This guide walks you through connecting OpenWeb UI to LiteLLM. Using LiteLLM with OpenWeb UI allows teams to:
+- Access 100+ LLMs on OpenWeb UI
+- Track Spend / Usage, Set Budget Limits 
+- Send Request/Response Logs to logging destinations like langfuse, s3, gcs buckets, etc.
+- Set access controls, e.g. control which models OpenWebUI can access.
+
+## Quickstart
+
+- Make sure to set up LiteLLM with the [LiteLLM Getting Started Guide](https://docs.litellm.ai/docs/proxy/docker_quick_start)
+
+
+## 1. Start LiteLLM & OpenWebUI
+
+- OpenWebUI starts running on [http://localhost:3000](http://localhost:3000)
+- LiteLLM starts running on [http://localhost:4000](http://localhost:4000)
+
+
+## 2. Create a Virtual Key on LiteLLM
+
+Virtual Keys are API Keys that allow you to authenticate to LiteLLM Proxy. We will create a Virtual Key that will allow OpenWebUI to access LiteLLM.
+
+### 2.1 LiteLLM User Management Hierarchy
+
+On LiteLLM, you can create Organizations, Teams, Users and Virtual Keys. For this tutorial, we will create a Team and a Virtual Key.
+
+- `Organization` - An Organization is a group of Teams. (US Engineering, EU Developer Tools)
+- `Team` - A Team is a group of Users. (OpenWeb UI Team, Data Science Team, etc.)
+- `User` - A User is an individual user (employee, developer, e.g. `krrish@litellm.ai`)
+- `Virtual Key` - A Virtual Key is an API Key that allows you to authenticate to LiteLLM Proxy. A Virtual Key is associated with a User or Team.
+
+Once the Team is created, you can invite Users to the Team. You can read more about LiteLLM's User Management [here](https://docs.litellm.ai/docs/proxy/user_management_heirarchy).
+
+### 2.2 Create a Team on LiteLLM
+
+Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new team.
+
+<Image img={require('../../img/litellm_create_team.gif')} />
+
+### 2.3 Create a Virtual Key on LiteLLM
+
+Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new Virtual Key.
+
+LiteLLM allows you to specify what models are available on OpenWeb UI (by specifying the models the key will have access to).
+
+<Image img={require('../../img/create_key_in_team_oweb.gif')} />
+
+## 3. Connect OpenWeb UI to LiteLLM
+
+On OpenWeb UI, navigate to Settings -> Connections and create a new connection to LiteLLM
+
+Enter the following details:
+- URL: `http://localhost:4000` (your litellm proxy base url)
+- Key: `your-virtual-key` (the key you created in the previous step)
+
+<Image img={require('../../img/litellm_setup_openweb.gif')} />
+
+### 3.1 Test Request
+
+In the top-left corner, open the model selector. You should only see the models you gave the key access to in Step 2.
+
+Once you have selected a model, enter your message content and click `Submit`.
+
+<Image img={require('../../img/basic_litellm.gif')} />
+
+### 3.2 Tracking Spend / Usage
+
+After your request is made, navigate to `Logs` on the LiteLLM UI. There you can see the Team, Key, Model, Usage and Cost for the request.
+
+<!-- <Image img={require('../../img/litellm_logs_openweb.gif')} /> -->
+
+
+
+## Render `thinking` content on OpenWeb UI
+
+OpenWebUI requires reasoning/thinking content to be rendered with `<think></think>` tags. In order to render this for specific models, you can use the `merge_reasoning_content_in_choices` litellm parameter.
+
+Example litellm config.yaml:
+
+```yaml
+model_list:
+  - model_name: thinking-anthropic-claude-3-7-sonnet
+    litellm_params:
+      model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+      thinking: {"type": "enabled", "budget_tokens": 1024}
+      max_tokens: 1080
+      merge_reasoning_content_in_choices: true
+```
+
+### Test it on OpenWeb UI
+
+On the models dropdown select `thinking-anthropic-claude-3-7-sonnet`
+
+<Image img={require('../../img/litellm_thinking_openweb.gif')} />
+
+
+
+
--- a/docs/my-website/docusaurus.config.js
+++ b/docs/my-website/docusaurus.config.js
@ -44,7 +44,7 @@ const config = {
        path: './release_notes',
        routeBasePath: 'release_notes',
        blogTitle: 'Release Notes',
-        blogSidebarTitle: 'All Releases',
+        blogSidebarTitle: 'Releases',
        blogSidebarCount: 'ALL',
        postsPerPage: 'ALL',
        showReadingTime: false,
--- a/docs/my-website/img/basic_litellm.gif
+++ b/docs/my-website/img/basic_litellm.gif
--- a/docs/my-website/img/create_key_in_team_oweb.gif
+++ b/docs/my-website/img/create_key_in_team_oweb.gif
--- a/docs/my-website/img/litellm_create_team.gif
+++ b/docs/my-website/img/litellm_create_team.gif
--- a/docs/my-website/img/litellm_setup_openweb.gif
+++ b/docs/my-website/img/litellm_setup_openweb.gif
--- a/docs/my-website/img/litellm_thinking_openweb.gif
+++ b/docs/my-website/img/litellm_thinking_openweb.gif
--- a/docs/my-website/img/litellm_user_heirarchy.png
+++ b/docs/my-website/img/litellm_user_heirarchy.png
--- a/docs/my-website/img/message_redaction_logging.png
+++ b/docs/my-website/img/message_redaction_logging.png
--- a/docs/my-website/img/message_redaction_spend_logs.png
+++ b/docs/my-website/img/message_redaction_spend_logs.png
--- a/docs/my-website/img/release_notes/anthropic_thinking.jpg
+++ b/docs/my-website/img/release_notes/anthropic_thinking.jpg
--- a/docs/my-website/img/release_notes/error_logs.jpg
+++ b/docs/my-website/img/release_notes/error_logs.jpg
--- a/docs/my-website/img/release_notes/v1632_release.jpg
+++ b/docs/my-website/img/release_notes/v1632_release.jpg
--- a/docs/my-website/release_notes/v1.57.8-stable/index.md
+++ b/docs/my-website/release_notes/v1.57.8-stable/index.md
@ -18,13 +18,6 @@ hide_table_of_contents: false
 `alerting`, `prometheus`, `secret management`, `management endpoints`, `ui`, `prompt management`, `finetuning`, `batch`


-:::note
-
-v1.57.8-stable, is currently being tested. It will be released on 2025-01-12. 
-
-:::
-
-
 ## New / Updated Models

 1. Mistral large pricing - https://github.com/BerriAI/litellm/pull/7452
--- a/docs/my-website/release_notes/v1.61.20-stable/index.md
+++ b/docs/my-website/release_notes/v1.61.20-stable/index.md
@ -0,0 +1,103 @@
+---
+title: v1.61.20-stable
+slug: v1.61.20-stable
+date: 2025-03-01T10:00:00
+authors:
+  - name: Krrish Dholakia
+    title: CEO, LiteLLM
+    url: https://www.linkedin.com/in/krish-d/
+    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
+  - name: Ishaan Jaffer
+    title: CTO, LiteLLM
+    url: https://www.linkedin.com/in/reffajnaahsi/
+    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
+tags: [llm translation, rerank, ui, thinking, reasoning_content, claude-3-7-sonnet]
+hide_table_of_contents: false
+---
+
+import Image from '@theme/IdealImage';
+
+# v1.61.20-stable
+
+
+These are the changes since `v1.61.13-stable`.
+
+This release is primarily focused on:
+- LLM Translation improvements (claude-3-7-sonnet + 'thinking'/'reasoning_content' support)
+- UI improvements (add model flow, user management, etc)
+
+## Demo Instance
+
+Here's a Demo Instance to test changes:
+- Instance: https://demo.litellm.ai/
+- Login Credentials:
+    - Username: admin
+    - Password: sk-1234
+
+## New Models / Updated Models
+
+1. Anthropic 3-7 sonnet support + cost tracking (Anthropic API + Bedrock + Vertex AI + OpenRouter) 
+    1. Anthropic API [Start here](https://docs.litellm.ai/docs/providers/anthropic#usage---thinking--reasoning_content)
+    2. Bedrock API [Start here](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
+    3. Vertex AI API [See here](../../docs/providers/vertex#usage---thinking--reasoning_content)
+    4. OpenRouter [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L5626)
+2. Gpt-4.5-preview support + cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L79)
+3. Azure AI - Phi-4 cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L1773)
+4. Claude-3.5-sonnet - vision support updated on Anthropic API [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2888)
+5. Bedrock llama vision support [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L7714)
+6. Cerebras llama3.3-70b pricing [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2697)
+
+## LLM Translation
+
+1. Infinity Rerank - support returning documents when return_documents=True [Start here](../../docs/providers/infinity#usage---returning-documents)
+2. Amazon Deepseek - `<think>` param extraction into ‘reasoning_content’ [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-imported-models-deepseek-deepseek-r1)
+3. Amazon Titan Embeddings - filter out ‘aws_’ params from request body [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-embedding)
+4. Anthropic ‘thinking’ + ‘reasoning_content’ translation support (Anthropic API, Bedrock, Vertex AI)  [Start here](https://docs.litellm.ai/docs/reasoning_content)
+5. VLLM - support ‘video_url’ [Start here](../../docs/providers/vllm#send-video-url-to-vllm)
+6. Call proxy via litellm SDK: Support `litellm_proxy/` for embedding, image_generation, transcription, speech, rerank [Start here](https://docs.litellm.ai/docs/providers/litellm_proxy)
+7. OpenAI Pass-through - allow using Assistants GET, DELETE on /openai pass through routes [Start here](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
+8. Message Translation - fix openai message for assistant msg if role is missing - openai allows this
+9. O1/O3 - support ‘drop_params’ for o3-mini and o1 parallel_tool_calls param (not supported currently) [See here](https://docs.litellm.ai/docs/completion/drop_params)
+
+## Spend Tracking Improvements
+
+1. Cost tracking for rerank via Bedrock [See PR](https://github.com/BerriAI/litellm/commit/b682dc4ec8fd07acf2f4c981d2721e36ae2a49c5)
+2. Anthropic pass-through - fix race condition causing cost to not be tracked [See PR](https://github.com/BerriAI/litellm/pull/8874)
+3. Anthropic pass-through: Ensure accurate token counting [See PR](https://github.com/BerriAI/litellm/pull/8880)
+
+## Management Endpoints / UI
+
+1. Models Page - Allow sorting models by ‘created at’
+2. Models Page - Edit Model Flow Improvements
+3. Models Page - Fix Adding Azure, Azure AI Studio models on UI 
+4. Internal Users Page - Allow Bulk Adding Internal Users on UI 
+5. Internal Users Page - Allow sorting users by ‘created at’ 
+6. Virtual Keys Page - Allow searching for UserIDs on the dropdown when assigning a user to a team [See PR](https://github.com/BerriAI/litellm/pull/8844)
+7. Virtual Keys Page - allow creating a user when assigning keys to users [See PR](https://github.com/BerriAI/litellm/pull/8844)
+8. Model Hub Page  - fix text overflow issue [See PR](https://github.com/BerriAI/litellm/pull/8749)
+9. Admin Settings Page - Allow adding MSFT SSO on UI 
+10. Backend - don't allow creating duplicate internal users in DB
+
+## Helm
+
+1. support ttlSecondsAfterFinished on the migration job - [See PR](https://github.com/BerriAI/litellm/pull/8593)
+2. enhance migrations job with additional configurable properties - [See PR](https://github.com/BerriAI/litellm/pull/8636)
+
+## Logging / Guardrail Integrations
+
+1. Arize Phoenix support 
+2. ‘No-log’ - fix ‘no-log’ param support on embedding calls 
+
+## Performance / Loadbalancing / Reliability improvements
+
+1. Single Deployment Cooldown logic - Use allowed_fails or allowed_fail_policy if set [Start here](https://docs.litellm.ai/docs/routing#advanced-custom-retries-cooldowns-based-on-error-type)
+
+## General Proxy Improvements
+
+1. Hypercorn - fix reading / parsing request body 
+2. Windows - fix running proxy in windows 
+3. DD-Trace - fix dd-trace enablement on proxy
+
+## Complete Git Diff
+
+View the complete git diff [here](https://github.com/BerriAI/litellm/compare/v1.61.13-stable...v1.61.20-stable).
--- a/docs/my-website/release_notes/v1.63.0/index.md
+++ b/docs/my-website/release_notes/v1.63.0/index.md
@ -0,0 +1,40 @@
+---
+title: v1.63.0 - Anthropic 'thinking' response update
+slug: v1.63.0
+date: 2025-03-05T10:00:00
+authors:
+  - name: Krrish Dholakia
+    title: CEO, LiteLLM
+    url: https://www.linkedin.com/in/krish-d/
+    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
+  - name: Ishaan Jaffer
+    title: CTO, LiteLLM
+    url: https://www.linkedin.com/in/reffajnaahsi/
+    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
+tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
+hide_table_of_contents: false
+---
+
+v1.63.0 fixes Anthropic 'thinking' response on streaming to return the `signature` block. [Github Issue](https://github.com/BerriAI/litellm/issues/8964)
+
+
+
+It also moves the response structure from `signature_delta` to `signature` to be the same as Anthropic. [Anthropic Docs](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#implementing-extended-thinking)
+
+
+## Diff 
+
+```bash
+"message": {
+    ...
+    "reasoning_content": "The capital of France is Paris.",
+    "thinking_blocks": [
+        {
+            "type": "thinking",
+            "thinking": "The capital of France is Paris.",
+-            "signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 OLD FORMAT
+            "signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 KEY CHANGE
+        }
+    ]
+}
+```
--- a/docs/my-website/release_notes/v1.63.2-stable/index.md
+++ b/docs/my-website/release_notes/v1.63.2-stable/index.md
@ -0,0 +1,112 @@
+---
+title: v1.63.2-stable
+slug: v1.63.2-stable
+date: 2025-03-08T10:00:00
+authors:
+  - name: Krrish Dholakia
+    title: CEO, LiteLLM
+    url: https://www.linkedin.com/in/krish-d/
+    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
+  - name: Ishaan Jaffer
+    title: CTO, LiteLLM
+    url: https://www.linkedin.com/in/reffajnaahsi/
+    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
+tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
+hide_table_of_contents: false
+---
+
+import Image from '@theme/IdealImage';
+
+
+These are the changes since `v1.61.20-stable`.
+
+This release is primarily focused on:
+- LLM Translation improvements (more `thinking` content improvements)
+- UI improvements (Error logs now shown on UI)
+
+
+:::info
+
+This release will be live on 03/09/2025
+
+::: 
+
+<Image img={require('../../img/release_notes/v1632_release.jpg')} />
+
+
+## Demo Instance
+
+Here's a Demo Instance to test changes:
+- Instance: https://demo.litellm.ai/
+- Login Credentials:
+    - Username: admin
+    - Password: sk-1234
+
+
+## New Models / Updated Models
+
+1. Add `supports_pdf_input` for specific Bedrock Claude models [PR](https://github.com/BerriAI/litellm/commit/f63cf0030679fe1a43d03fb196e815a0f28dae92)
+2. Add pricing for amazon `eu` models [PR](https://github.com/BerriAI/litellm/commits/main/model_prices_and_context_window.json)
+3. Fix Azure O1 mini pricing [PR](https://github.com/BerriAI/litellm/commit/52de1949ef2f76b8572df751f9c868a016d4832c)
+
+## LLM Translation
+
+<Image img={require('../../img/release_notes/anthropic_thinking.jpg')}/>
+
+1. Support `/openai/` passthrough for Assistant endpoints. [Get Started](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
+2. Bedrock Claude - fix tool calling transformation on invoke route. [Get Started](../../docs/providers/bedrock#usage---function-calling--tool-calling)
+3. Bedrock Claude - response_format support for claude on invoke route. [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
+4. Bedrock - pass `description` if set in response_format. [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
+5. Bedrock - Fix passing response_format: {"type": "text"}. [PR](https://github.com/BerriAI/litellm/commit/c84b489d5897755139aa7d4e9e54727ebe0fa540)
+6. OpenAI - Handle sending image_url as str to openai. [Get Started](https://docs.litellm.ai/docs/completion/vision)
+7. Deepseek - return 'reasoning_content' missing on streaming. [Get Started](https://docs.litellm.ai/docs/reasoning_content)
+8. Caching - Support caching on reasoning content. [Get Started](https://docs.litellm.ai/docs/proxy/caching)
+9. Bedrock - handle thinking blocks in assistant message. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
+10. Anthropic - Return `signature` on streaming. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
+- Note: We've also migrated from `signature_delta` to `signature`. [Read more](https://docs.litellm.ai/release_notes/v1.63.0)
+11. Support format param for specifying image type. [Get Started](../../docs/completion/vision.md#explicitly-specify-image-type)
+12. Anthropic - `/v1/messages` endpoint - `thinking` param support. [Get Started](../../docs/anthropic_unified.md)
+- Note: this refactors the [BETA] unified `/v1/messages` endpoint, to just work for the Anthropic API. 
+13. Vertex AI - handle $id in response schema when calling vertex ai. [Get Started](https://docs.litellm.ai/docs/providers/vertex#json-schema)
+
+## Spend Tracking Improvements
+
+1. Batches API - Fix cost calculation to run on retrieve_batch. [Get Started](https://docs.litellm.ai/docs/batches)
+2. Batches API - Log batch models in spend logs / standard logging payload. [Get Started](../../docs/proxy/logging_spec.md#standardlogginghiddenparams)
+
+## Management Endpoints / UI
+
+<Image img={require('../../img/release_notes/error_logs.jpg')} />
+
+1. Virtual Keys Page
+    - Allow team/org filters to be searchable on the Create Key Page
+    - Add created_by and updated_by fields to Keys table
+    - Show 'user_email' on key table
+    - Show 100 Keys Per Page, Use full height, increase width of key alias
+2. Logs Page
+    - Show Error Logs on LiteLLM UI
+    - Allow Internal Users to View their own logs
+3. Internal Users Page 
+    - Allow admin to control default model access for internal users
+4. Fix session handling with cookies
+
+## Logging / Guardrail Integrations
+
+1. Fix prometheus metrics w/ custom metrics, when keys containing team_id make requests. [PR](https://github.com/BerriAI/litellm/pull/8935)
+
+## Performance / Loadbalancing / Reliability improvements
+
+1. Cooldowns - Support cooldowns on models called with client side credentials. [Get Started](https://docs.litellm.ai/docs/proxy/clientside_auth#pass-user-llm-api-keys--api-base)
+2. Tag-based Routing - ensures tag-based routing across all endpoints (`/embeddings`, `/image_generation`, etc.). [Get Started](https://docs.litellm.ai/docs/proxy/tag_routing)
+
+## General Proxy Improvements
+
+1. Raise BadRequestError when unknown model passed in request
+2. Enforce model access restrictions on Azure OpenAI proxy route
+3. Reliability fix - Handle emojis in text - fix orjson error
+4. Model Access Patch - don't overwrite litellm.anthropic_models when running auth checks
+5. Enable setting timezone information in docker image 
+
+## Complete Git Diff
+
+[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.61.20-stable...v1.63.2-stable)
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@ -41,10 +41,12 @@ const sidebars = {
            "proxy/deploy",
            "proxy/prod",
            "proxy/cli",
+            "proxy/release_cycle",
            "proxy/model_management",
            "proxy/health",
            "proxy/debugging",
            "proxy/spending_monitoring",
+            "proxy/master_key_rotations",
          ],
        },
        "proxy/demo",
@ -65,8 +67,8 @@ const sidebars = {
          items: [
            "proxy/user_keys",
            "proxy/clientside_auth",
-            "proxy/response_headers",
            "proxy/request_headers",
+            "proxy/response_headers",
          ],
        },
        {
@ -162,7 +164,6 @@ const sidebars = {
          ]
        },
        "proxy/caching",
-
      ]
    },
    {
@ -181,6 +182,7 @@ const sidebars = {
        "providers/openai_compatible",
        "providers/azure",
        "providers/azure_ai",
+        "providers/aiml",
        "providers/vertex",
        "providers/gemini",
        "providers/anthropic",
@ -242,6 +244,7 @@ const sidebars = {
        "completion/document_understanding",
        "completion/vision",
        "completion/json_mode",
+        "reasoning_content",
        "completion/prompt_caching",
        "completion/predict_outputs",
        "completion/prefix",
@ -254,13 +257,19 @@ const sidebars = {
        "completion/batching",
        "completion/mock_requests",
        "completion/reliable_completions",
-        'tutorials/litellm_proxy_aporia',

      ]
    },
    {
      type: "category",
      label: "Supported Endpoints",
+      link: {
+        type: "generated-index",
+        title: "Supported Endpoints",
+        description:
+          "Learn how to deploy + call models from different providers on LiteLLM",
+        slug: "/supported_endpoints",
+      },
      items: [
        {
          type: "category",
@ -279,6 +288,7 @@ const sidebars = {
        },
        "text_completion",
        "embedding/supported_embedding",
+        "anthropic_unified",
        {
          type: "category",
          label: "Image",
@ -303,6 +313,7 @@ const sidebars = {
            "pass_through/vertex_ai",
            "pass_through/google_ai_studio",
            "pass_through/cohere",
+            "pass_through/openai_passthrough",
            "pass_through/anthropic_completion",
            "pass_through/bedrock",
            "pass_through/assembly_ai",
@ -347,23 +358,6 @@ const sidebars = {
          label: "LangChain, LlamaIndex, Instructor Integration",
          items: ["langchain/langchain", "tutorials/instructor"],
        },
-        {
-          type: "category",
-          label: "Tutorials",
-          items: [
-
-            'tutorials/azure_openai',
-            'tutorials/instructor',
-            "tutorials/gradio_integration",
-            "tutorials/huggingface_codellama",
-            "tutorials/huggingface_tutorial",
-            "tutorials/TogetherAI_liteLLM",
-            "tutorials/finetuned_chat_gpt",
-            "tutorials/text_completion",
-            "tutorials/first_playground",
-            "tutorials/model_fallbacks",
-          ],
-        },
      ],
    },
    {
@ -380,13 +374,6 @@ const sidebars = {
        "load_test_rpm",
      ]
    },
-    {
-      type: "category",
-      label: "Adding Providers",
-      items: [
-        "adding_provider/directory_structure",
-        "adding_provider/new_rerank_provider"],
-    },
    {
      type: "category",
      label: "Logging & Observability",
@ -421,12 +408,51 @@ const sidebars = {
        "observability/opik_integration",
      ],
    },
+    {
+      type: "category",
+      label: "Tutorials",
+      items: [
+        "tutorials/openweb_ui",
+        'tutorials/litellm_proxy_aporia',
+        {
+          type: "category",
+          label: "LiteLLM Python SDK Tutorials",
+          items: [

+            'tutorials/azure_openai',
+            'tutorials/instructor',
+            "tutorials/gradio_integration",
+            "tutorials/huggingface_codellama",
+            "tutorials/huggingface_tutorial",
+            "tutorials/TogetherAI_liteLLM",
+            "tutorials/finetuned_chat_gpt",
+            "tutorials/text_completion",
+            "tutorials/first_playground",
+            "tutorials/model_fallbacks",
+          ],
+        },
+      ]
+    },
+    {
+      type: "category",
+      label: "Contributing",
+      items: [
+        "extras/contributing_code",
+        {
+          type: "category",
+          label: "Adding Providers",
+          items: [
+            "adding_provider/directory_structure",
+            "adding_provider/new_rerank_provider"],
+        },
+        "extras/contributing",
+        "contributing",
+      ]
+    },
    {
      type: "category",
      label: "Extras",
      items: [
-        "extras/contributing",
        "data_security",
        "data_retention",
        "migration_policy",
@ -443,7 +469,9 @@ const sidebars = {
          items: [
            "projects/smolagents",
            "projects/Docq.AI",
+            "projects/PDL",
            "projects/OpenInterpreter",
+            "projects/Elroy",
            "projects/dbally",
            "projects/FastREPL",
            "projects/PROMPTMETHEUS",
@ -457,9 +485,9 @@ const sidebars = {
            "projects/YiVal",
            "projects/LiteLLM Proxy",
            "projects/llm_cord",
+            "projects/pgai",
          ],
        },
-        "contributing",
        "proxy/pii_masking",
        "extras/code_quality",
        "rules",
--- a/litellm/init.py
+++ b/litellm/init.py
@ -2,7 +2,7 @@
 import warnings

 warnings.filterwarnings("ignore", message=".*conflict with protected namespace.*")
-### INIT VARIABLES ######
+### INIT VARIABLES #########
 import threading
 import os
 from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
@ -52,6 +52,8 @@ from litellm.constants import (
    open_ai_embedding_models,
    cohere_embedding_models,
    bedrock_embedding_models,
+    known_tokenizer_config,
+    BEDROCK_INVOKE_PROVIDERS_LITERAL,
 )
 from litellm.types.guardrails import GuardrailItem
 from litellm.proxy._types import (
@ -94,6 +96,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
    "galileo",
    "braintrust",
    "arize",
+    "arize_phoenix",
    "langtrace",
    "gcs_bucket",
    "azure_storage",
@ -274,8 +277,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
 custom_prometheus_metadata_labels: List[str] = []
 #### REQUEST PRIORITIZATION ####
 priority_reservation: Optional[Dict[str, float]] = None
-
-
 force_ipv4: bool = (
    False  # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
 )
@ -359,9 +360,7 @@ BEDROCK_CONVERSE_MODELS = [
    "meta.llama3-2-11b-instruct-v1:0",
    "meta.llama3-2-90b-instruct-v1:0",
 ]
-BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
-    "cohere", "anthropic", "mistral", "amazon", "meta", "llama"
-]
+
 ####### COMPLETION MODELS ###################
 open_ai_chat_completion_models: List = []
 open_ai_text_completion_models: List = []
@ -398,6 +397,7 @@ gemini_models: List = []
 xai_models: List = []
 deepseek_models: List = []
 azure_ai_models: List = []
+jina_ai_models: List = []
 voyage_models: List = []
 databricks_models: List = []
 cloudflare_models: List = []
@ -411,6 +411,7 @@ anyscale_models: List = []
 cerebras_models: List = []
 galadriel_models: List = []
 sambanova_models: List = []
+assemblyai_models: List = []


 def is_bedrock_pricing_only_model(key: str) -> bool:
@ -560,6 +561,10 @@ def add_known_models():
            galadriel_models.append(key)
        elif value.get("litellm_provider") == "sambanova_models":
            sambanova_models.append(key)
+        elif value.get("litellm_provider") == "assemblyai":
+            assemblyai_models.append(key)
+        elif value.get("litellm_provider") == "jina_ai":
+            jina_ai_models.append(key)


 add_known_models()
@ -631,6 +636,8 @@ model_list = (
    + galadriel_models
    + sambanova_models
    + azure_text_models
+    + assemblyai_models
+    + jina_ai_models
 )

 model_list_set = set(model_list)
@ -684,6 +691,8 @@ models_by_provider: dict = {
    "cerebras": cerebras_models,
    "galadriel": galadriel_models,
    "sambanova": sambanova_models,
+    "assemblyai": assemblyai_models,
+    "jina_ai": jina_ai_models,
 }

 # mapping for those models which have larger equivalents
@ -789,9 +798,6 @@ from .llms.oobabooga.chat.transformation import OobaboogaConfig
 from .llms.maritalk import MaritalkConfig
 from .llms.openrouter.chat.transformation import OpenrouterConfig
 from .llms.anthropic.chat.transformation import AnthropicConfig
-from .llms.anthropic.experimental_pass_through.transformation import (
-    AnthropicExperimentalPassThroughConfig,
-)
 from .llms.groq.stt.transformation import GroqSTTConfig
 from .llms.anthropic.completion.transformation import AnthropicTextConfig
 from .llms.triton.completion.transformation import TritonConfig
@ -804,10 +810,15 @@ from .llms.predibase.chat.transformation import PredibaseConfig
 from .llms.replicate.chat.transformation import ReplicateConfig
 from .llms.cohere.completion.transformation import CohereTextConfig as CohereConfig
 from .llms.cohere.rerank.transformation import CohereRerankConfig
+from .llms.cohere.rerank_v2.transformation import CohereRerankV2Config
 from .llms.azure_ai.rerank.transformation import AzureAIRerankConfig
 from .llms.infinity.rerank.transformation import InfinityRerankConfig
+from .llms.jina_ai.rerank.transformation import JinaAIRerankConfig
 from .llms.clarifai.chat.transformation import ClarifaiConfig
 from .llms.ai21.chat.transformation import AI21ChatConfig, AI21ChatConfig as AI21Config
+from .llms.anthropic.experimental_pass_through.messages.transformation import (
+    AnthropicMessagesConfig,
+)
 from .llms.together_ai.chat import TogetherAIConfig
 from .llms.together_ai.completion.transformation import TogetherAITextCompletionConfig
 from .llms.cloudflare.chat.transformation import CloudflareChatConfig
@ -853,15 +864,39 @@ from .llms.bedrock.chat.invoke_handler import (
 )

 from .llms.bedrock.common_utils import (
-    AmazonTitanConfig,
-    AmazonAI21Config,
-    AmazonAnthropicConfig,
-    AmazonAnthropicClaude3Config,
-    AmazonCohereConfig,
-    AmazonLlamaConfig,
-    AmazonMistralConfig,
    AmazonBedrockGlobalConfig,
 )
+from .llms.bedrock.chat.invoke_transformations.amazon_ai21_transformation import (
+    AmazonAI21Config,
+)
+from .llms.bedrock.chat.invoke_transformations.amazon_nova_transformation import (
+    AmazonInvokeNovaConfig,
+)
+from .llms.bedrock.chat.invoke_transformations.anthropic_claude2_transformation import (
+    AmazonAnthropicConfig,
+)
+from .llms.bedrock.chat.invoke_transformations.anthropic_claude3_transformation import (
+    AmazonAnthropicClaude3Config,
+)
+from .llms.bedrock.chat.invoke_transformations.amazon_cohere_transformation import (
+    AmazonCohereConfig,
+)
+from .llms.bedrock.chat.invoke_transformations.amazon_llama_transformation import (
+    AmazonLlamaConfig,
+)
+from .llms.bedrock.chat.invoke_transformations.amazon_deepseek_transformation import (
+    AmazonDeepSeekR1Config,
+)
+from .llms.bedrock.chat.invoke_transformations.amazon_mistral_transformation import (
+    AmazonMistralConfig,
+)
+from .llms.bedrock.chat.invoke_transformations.amazon_titan_transformation import (
+    AmazonTitanConfig,
+)
+from .llms.bedrock.chat.invoke_transformations.base_invoke_transformation import (
+    AmazonInvokeConfig,
+)
+
 from .llms.bedrock.image.amazon_stability1_transformation import AmazonStabilityConfig
 from .llms.bedrock.image.amazon_stability3_transformation import AmazonStability3Config
 from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config
@ -974,6 +1009,7 @@ from .assistants.main import *
 from .batches.main import *
 from .batch_completion.main import *  # type: ignore
 from .rerank_api.main import *
+from .llms.anthropic.experimental_pass_through.messages.handler import *
 from .realtime_api.main import _arealtime
 from .fine_tuning.main import *
 from .files.main import *
--- a/litellm/_redis.py
+++ b/litellm/_redis.py
@ -183,7 +183,7 @@ def init_redis_cluster(redis_kwargs) -> redis.RedisCluster:
            )

    verbose_logger.debug(
-        "init_redis_cluster: startup nodes: ", redis_kwargs["startup_nodes"]
+        "init_redis_cluster: startup nodes are being initialized."
    )
    from redis.cluster import ClusterNode

@ -266,7 +266,9 @@ def get_redis_client(**env_overrides):
    return redis.Redis(**redis_kwargs)


-def get_redis_async_client(**env_overrides) -> async_redis.Redis:
+def get_redis_async_client(
+    **env_overrides,
+) -> async_redis.Redis:
    redis_kwargs = _get_redis_client_logic(**env_overrides)
    if "url" in redis_kwargs and redis_kwargs["url"] is not None:
        args = _get_redis_url_kwargs(client=async_redis.Redis.from_url)
--- a/litellm/adapters/anthropic_adapter.py
+++ b/litellm/adapters/anthropic_adapter.py
@ -1,186 +0,0 @@
-# What is this?
-## Translates OpenAI call to Anthropic `/v1/messages` format
-import traceback
-from typing import Any, Optional
-
-import litellm
-from litellm import ChatCompletionRequest, verbose_logger
-from litellm.integrations.custom_logger import CustomLogger
-from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
-from litellm.types.utils import AdapterCompletionStreamWrapper, ModelResponse
-
-
-class AnthropicAdapter(CustomLogger):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def translate_completion_input_params(
-        self, kwargs
-    ) -> Optional[ChatCompletionRequest]:
-        """
-        - translate params, where needed
-        - pass rest, as is
-        """
-        request_body = AnthropicMessagesRequest(**kwargs)  # type: ignore
-
-        translated_body = litellm.AnthropicExperimentalPassThroughConfig().translate_anthropic_to_openai(
-            anthropic_message_request=request_body
-        )
-
-        return translated_body
-
-    def translate_completion_output_params(
-        self, response: ModelResponse
-    ) -> Optional[AnthropicResponse]:
-
-        return litellm.AnthropicExperimentalPassThroughConfig().translate_openai_response_to_anthropic(
-            response=response
-        )
-
-    def translate_completion_output_params_streaming(
-        self, completion_stream: Any
-    ) -> AdapterCompletionStreamWrapper | None:
-        return AnthropicStreamWrapper(completion_stream=completion_stream)
-
-
-anthropic_adapter = AnthropicAdapter()
-
-
-class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
-    """
-    - first chunk return 'message_start'
-    - content block must be started and stopped
-    - finish_reason must map exactly to anthropic reason, else anthropic client won't be able to parse it.
-    """
-
-    sent_first_chunk: bool = False
-    sent_content_block_start: bool = False
-    sent_content_block_finish: bool = False
-    sent_last_message: bool = False
-    holding_chunk: Optional[Any] = None
-
-    def __next__(self):
-        try:
-            if self.sent_first_chunk is False:
-                self.sent_first_chunk = True
-                return {
-                    "type": "message_start",
-                    "message": {
-                        "id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
-                        "type": "message",
-                        "role": "assistant",
-                        "content": [],
-                        "model": "claude-3-5-sonnet-20240620",
-                        "stop_reason": None,
-                        "stop_sequence": None,
-                        "usage": {"input_tokens": 25, "output_tokens": 1},
-                    },
-                }
-            if self.sent_content_block_start is False:
-                self.sent_content_block_start = True
-                return {
-                    "type": "content_block_start",
-                    "index": 0,
-                    "content_block": {"type": "text", "text": ""},
-                }
-
-            for chunk in self.completion_stream:
-                if chunk == "None" or chunk is None:
-                    raise Exception
-
-                processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
-                    response=chunk
-                )
-                if (
-                    processed_chunk["type"] == "message_delta"
-                    and self.sent_content_block_finish is False
-                ):
-                    self.holding_chunk = processed_chunk
-                    self.sent_content_block_finish = True
-                    return {
-                        "type": "content_block_stop",
-                        "index": 0,
-                    }
-                elif self.holding_chunk is not None:
-                    return_chunk = self.holding_chunk
-                    self.holding_chunk = processed_chunk
-                    return return_chunk
-                else:
-                    return processed_chunk
-            if self.holding_chunk is not None:
-                return_chunk = self.holding_chunk
-                self.holding_chunk = None
-                return return_chunk
-            if self.sent_last_message is False:
-                self.sent_last_message = True
-                return {"type": "message_stop"}
-            raise StopIteration
-        except StopIteration:
-            if self.sent_last_message is False:
-                self.sent_last_message = True
-                return {"type": "message_stop"}
-            raise StopIteration
-        except Exception as e:
-            verbose_logger.error(
-                "Anthropic Adapter - {}\n{}".format(e, traceback.format_exc())
-            )
-
-    async def __anext__(self):
-        try:
-            if self.sent_first_chunk is False:
-                self.sent_first_chunk = True
-                return {
-                    "type": "message_start",
-                    "message": {
-                        "id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
-                        "type": "message",
-                        "role": "assistant",
-                        "content": [],
-                        "model": "claude-3-5-sonnet-20240620",
-                        "stop_reason": None,
-                        "stop_sequence": None,
-                        "usage": {"input_tokens": 25, "output_tokens": 1},
-                    },
-                }
-            if self.sent_content_block_start is False:
-                self.sent_content_block_start = True
-                return {
-                    "type": "content_block_start",
-                    "index": 0,
-                    "content_block": {"type": "text", "text": ""},
-                }
-            async for chunk in self.completion_stream:
-                if chunk == "None" or chunk is None:
-                    raise Exception
-                processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
-                    response=chunk
-                )
-                if (
-                    processed_chunk["type"] == "message_delta"
-                    and self.sent_content_block_finish is False
-                ):
-                    self.holding_chunk = processed_chunk
-                    self.sent_content_block_finish = True
-                    return {
-                        "type": "content_block_stop",
-                        "index": 0,
-                    }
-                elif self.holding_chunk is not None:
-                    return_chunk = self.holding_chunk
-                    self.holding_chunk = processed_chunk
-                    return return_chunk
-                else:
-                    return processed_chunk
-            if self.holding_chunk is not None:
-                return_chunk = self.holding_chunk
-                self.holding_chunk = None
-                return return_chunk
-            if self.sent_last_message is False:
-                self.sent_last_message = True
-                return {"type": "message_stop"}
-            raise StopIteration
-        except StopIteration:
-            if self.sent_last_message is False:
-                self.sent_last_message = True
-                return {"type": "message_stop"}
-            raise StopAsyncIteration
--- a/litellm/batches/batch_utils.py
+++ b/litellm/batches/batch_utils.py
@ -1,76 +1,16 @@
-import asyncio
-import datetime
 import json
-import threading
-from typing import Any, List, Literal, Optional
+from typing import Any, List, Literal, Tuple

 import litellm
 from litellm._logging import verbose_logger
-from litellm.constants import (
-    BATCH_STATUS_POLL_INTERVAL_SECONDS,
-    BATCH_STATUS_POLL_MAX_ATTEMPTS,
-)
-from litellm.files.main import afile_content
-from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.types.llms.openai import Batch
-from litellm.types.utils import StandardLoggingPayload, Usage
-
-
-async def batches_async_logging(
-    batch_id: str,
-    custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
-    logging_obj: Optional[LiteLLMLoggingObj] = None,
-    **kwargs,
-):
-    """
-    Async Job waits for the batch to complete and then logs the completed batch usage - cost, total tokens, prompt tokens, completion tokens
-
-
-    Polls retrieve_batch until it returns a batch with status "completed" or "failed"
-    """
-    from .main import aretrieve_batch
-
-    verbose_logger.debug(
-        ".....in _batches_async_logging... polling retrieve to get batch status"
-    )
-    if logging_obj is None:
-        raise ValueError(
-            "logging_obj is None cannot calculate cost / log batch creation event"
-        )
-    for _ in range(BATCH_STATUS_POLL_MAX_ATTEMPTS):
-        try:
-            start_time = datetime.datetime.now()
-            batch: Batch = await aretrieve_batch(batch_id, custom_llm_provider)
-            verbose_logger.debug(
-                "in _batches_async_logging... batch status= %s", batch.status
-            )
-
-            if batch.status == "completed":
-                end_time = datetime.datetime.now()
-                await _handle_completed_batch(
-                    batch=batch,
-                    custom_llm_provider=custom_llm_provider,
-                    logging_obj=logging_obj,
-                    start_time=start_time,
-                    end_time=end_time,
-                    **kwargs,
-                )
-                break
-            elif batch.status == "failed":
-                pass
-        except Exception as e:
-            verbose_logger.error("error in batches_async_logging", e)
-        await asyncio.sleep(BATCH_STATUS_POLL_INTERVAL_SECONDS)
+from litellm.types.utils import CallTypes, Usage


 async def _handle_completed_batch(
    batch: Batch,
    custom_llm_provider: Literal["openai", "azure", "vertex_ai"],
-    logging_obj: LiteLLMLoggingObj,
-    start_time: datetime.datetime,
-    end_time: datetime.datetime,
-    **kwargs,
-) -> None:
+) -> Tuple[float, Usage, List[str]]:
    """Helper function to process a completed batch and handle logging"""
    # Get batch results
    file_content_dictionary = await _get_batch_output_file_content_as_dictionary(
@ -87,52 +27,25 @@ async def _handle_completed_batch(
        custom_llm_provider=custom_llm_provider,
    )

-    # Handle logging
-    await _log_completed_batch(
-        logging_obj=logging_obj,
-        batch_usage=batch_usage,
-        batch_cost=batch_cost,
-        start_time=start_time,
-        end_time=end_time,
-        **kwargs,
-    )
+    batch_models = _get_batch_models_from_file_content(file_content_dictionary)
+
+    return batch_cost, batch_usage, batch_models


-async def _log_completed_batch(
-    logging_obj: LiteLLMLoggingObj,
-    batch_usage: Usage,
-    batch_cost: float,
-    start_time: datetime.datetime,
-    end_time: datetime.datetime,
-    **kwargs,
-) -> None:
-    """Helper function to handle all logging operations for a completed batch"""
-    logging_obj.call_type = "batch_success"
-
-    standard_logging_object = _create_standard_logging_object_for_completed_batch(
-        kwargs=kwargs,
-        start_time=start_time,
-        end_time=end_time,
-        logging_obj=logging_obj,
-        batch_usage_object=batch_usage,
-        response_cost=batch_cost,
-    )
-
-    logging_obj.model_call_details["standard_logging_object"] = standard_logging_object
-
-    # Launch async and sync logging handlers
-    asyncio.create_task(
-        logging_obj.async_success_handler(
-            result=None,
-            start_time=start_time,
-            end_time=end_time,
-            cache_hit=None,
-        )
-    )
-    threading.Thread(
-        target=logging_obj.success_handler,
-        args=(None, start_time, end_time),
-    ).start()
+def _get_batch_models_from_file_content(
+    file_content_dictionary: List[dict],
+) -> List[str]:
+    """
+    Return the "model" of every successful item in the batch output, in input order (duplicates kept)
+    """
+    batch_models = []
+    for _item in file_content_dictionary:
+        if _batch_response_was_successful(_item):  # only items whose response status_code is 200
+            _response_body = _get_response_from_batch_job_output_file(_item)
+            _model = _response_body.get("model")
+            if _model:  # skip response bodies with a missing or empty "model"
+                batch_models.append(_model)
+    return batch_models


 async def _batch_cost_calculator(
@ -159,6 +72,8 @@ async def _get_batch_output_file_content_as_dictionary(
    """
    Get the batch output file content as a list of dictionaries
    """
+    from litellm.files.main import afile_content
+
    if custom_llm_provider == "vertex_ai":
        raise ValueError("Vertex AI does not support file content retrieval")

@ -208,6 +123,7 @@ def _get_batch_job_cost_from_file_content(
                total_cost += litellm.completion_cost(
                    completion_response=_response_body,
                    custom_llm_provider=custom_llm_provider,
+                    call_type=CallTypes.aretrieve_batch.value,
                )
                verbose_logger.debug("total_cost=%s", total_cost)
        return total_cost
@ -264,30 +180,3 @@ def _batch_response_was_successful(batch_job_output_file: dict) -> bool:
    """
    _response: dict = batch_job_output_file.get("response", None) or {}
    return _response.get("status_code", None) == 200
-
-
-def _create_standard_logging_object_for_completed_batch(
-    kwargs: dict,
-    start_time: datetime.datetime,
-    end_time: datetime.datetime,
-    logging_obj: LiteLLMLoggingObj,
-    batch_usage_object: Usage,
-    response_cost: float,
-) -> StandardLoggingPayload:
-    """
-    Create a standard logging object for a completed batch
-    """
-    standard_logging_object = logging_obj.model_call_details.get(
-        "standard_logging_object", None
-    )
-
-    if standard_logging_object is None:
-        raise ValueError("unable to create standard logging object for completed batch")
-
-    # Add Completed Batch Job Usage and Response Cost
-    standard_logging_object["call_type"] = "batch_success"
-    standard_logging_object["response_cost"] = response_cost
-    standard_logging_object["total_tokens"] = batch_usage_object.total_tokens
-    standard_logging_object["prompt_tokens"] = batch_usage_object.prompt_tokens
-    standard_logging_object["completion_tokens"] = batch_usage_object.completion_tokens
-    return standard_logging_object
--- a/litellm/batches/main.py
+++ b/litellm/batches/main.py
@ -31,10 +31,9 @@ from litellm.types.llms.openai import (
    RetrieveBatchRequest,
 )
 from litellm.types.router import GenericLiteLLMParams
+from litellm.types.utils import LiteLLMBatch
 from litellm.utils import client, get_litellm_params, supports_httpx_timeout

-from .batch_utils import batches_async_logging
-
 ####### ENVIRONMENT VARIABLES ###################
 openai_batches_instance = OpenAIBatchesAPI()
 azure_batches_instance = AzureBatchesAPI()
@ -85,17 +84,6 @@ async def acreate_batch(
        else:
            response = init_response

-        # Start async logging job
-        if response is not None:
-            asyncio.create_task(
-                batches_async_logging(
-                    logging_obj=kwargs.get("litellm_logging_obj", None),
-                    batch_id=response.id,
-                    custom_llm_provider=custom_llm_provider,
-                    **kwargs,
-                )
-            )
-
        return response
    except Exception as e:
        raise e
@ -111,7 +99,7 @@ def create_batch(
    extra_headers: Optional[Dict[str, str]] = None,
    extra_body: Optional[Dict[str, str]] = None,
    **kwargs,
-) -> Union[Batch, Coroutine[Any, Any, Batch]]:
+) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
    """
    Creates and executes a batch from an uploaded file of request

@ -119,21 +107,26 @@ def create_batch(
    """
    try:
        optional_params = GenericLiteLLMParams(**kwargs)
+        litellm_call_id = kwargs.get("litellm_call_id", None)
+        proxy_server_request = kwargs.get("proxy_server_request", None)
+        model_info = kwargs.get("model_info", None)
        _is_async = kwargs.pop("acreate_batch", False) is True
        litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
        ### TIMEOUT LOGIC ###
        timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
-        litellm_params = get_litellm_params(
-            custom_llm_provider=custom_llm_provider,
-            litellm_call_id=kwargs.get("litellm_call_id", None),
-            litellm_trace_id=kwargs.get("litellm_trace_id"),
-            litellm_metadata=kwargs.get("litellm_metadata"),
-        )
        litellm_logging_obj.update_environment_variables(
            model=None,
            user=None,
            optional_params=optional_params.model_dump(),
-            litellm_params=litellm_params,
+            litellm_params={
+                "litellm_call_id": litellm_call_id,
+                "proxy_server_request": proxy_server_request,
+                "model_info": model_info,
+                "metadata": metadata,
+                "preset_cache_key": None,
+                "stream_response": {},
+                **optional_params.model_dump(exclude_unset=True),
+            },
            custom_llm_provider=custom_llm_provider,
        )

@ -261,7 +254,7 @@ def create_batch(
                response=httpx.Response(
                    status_code=400,
                    content="Unsupported provider",
-                    request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"),  # type: ignore
+                    request=httpx.Request(method="create_batch", url="https://github.com/BerriAI/litellm"),  # type: ignore
                ),
            )
        return response
@ -269,6 +262,7 @@ def create_batch(
        raise e


+@client
 async def aretrieve_batch(
    batch_id: str,
    custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
@ -276,7 +270,7 @@ async def aretrieve_batch(
    extra_headers: Optional[Dict[str, str]] = None,
    extra_body: Optional[Dict[str, str]] = None,
    **kwargs,
-) -> Batch:
+) -> LiteLLMBatch:
    """
    Async: Retrieves a batch.

@ -310,6 +304,7 @@ async def aretrieve_batch(
        raise e


+@client
 def retrieve_batch(
    batch_id: str,
    custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
@ -317,7 +312,7 @@ def retrieve_batch(
    extra_headers: Optional[Dict[str, str]] = None,
    extra_body: Optional[Dict[str, str]] = None,
    **kwargs,
-) -> Union[Batch, Coroutine[Any, Any, Batch]]:
+) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
    """
    Retrieves a batch.

@ -325,9 +320,23 @@ def retrieve_batch(
    """
    try:
        optional_params = GenericLiteLLMParams(**kwargs)
+
+        litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
        ### TIMEOUT LOGIC ###
        timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
-        # set timeout for 10 minutes by default
+        litellm_params = get_litellm_params(
+            custom_llm_provider=custom_llm_provider,
+            litellm_call_id=kwargs.get("litellm_call_id", None),
+            litellm_trace_id=kwargs.get("litellm_trace_id"),
+            litellm_metadata=kwargs.get("litellm_metadata"),
+        )
+        litellm_logging_obj.update_environment_variables(
+            model=None,
+            user=None,
+            optional_params=optional_params.model_dump(),
+            litellm_params=litellm_params,
+            custom_llm_provider=custom_llm_provider,
+        )

        if (
            timeout is not None
--- a/litellm/caching/init.py
+++ b/litellm/caching/init.py
@ -4,5 +4,6 @@ from .dual_cache import DualCache
 from .in_memory_cache import InMemoryCache
 from .qdrant_semantic_cache import QdrantSemanticCache
 from .redis_cache import RedisCache
+from .redis_cluster_cache import RedisClusterCache
 from .redis_semantic_cache import RedisSemanticCache
 from .s3_cache import S3Cache
--- a/litellm/caching/caching.py
+++ b/litellm/caching/caching.py
@ -13,26 +13,14 @@ import json
 import time
 import traceback
 from enum import Enum
-from typing import Any, Dict, List, Optional, Set, Union
+from typing import Any, Dict, List, Optional, Union

-from openai.types.audio.transcription_create_params import TranscriptionCreateParams
-from openai.types.chat.completion_create_params import (
-    CompletionCreateParamsNonStreaming,
-    CompletionCreateParamsStreaming,
-)
-from openai.types.completion_create_params import (
-    CompletionCreateParamsNonStreaming as TextCompletionCreateParamsNonStreaming,
-)
-from openai.types.completion_create_params import (
-    CompletionCreateParamsStreaming as TextCompletionCreateParamsStreaming,
-)
-from openai.types.embedding_create_params import EmbeddingCreateParams
 from pydantic import BaseModel

 import litellm
 from litellm._logging import verbose_logger
+from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
 from litellm.types.caching import *
-from litellm.types.rerank import RerankRequest
 from litellm.types.utils import all_litellm_params

 from .base_cache import BaseCache
@ -41,6 +29,7 @@ from .dual_cache import DualCache  # noqa
 from .in_memory_cache import InMemoryCache
 from .qdrant_semantic_cache import QdrantSemanticCache
 from .redis_cache import RedisCache
+from .redis_cluster_cache import RedisClusterCache
 from .redis_semantic_cache import RedisSemanticCache
 from .s3_cache import S3Cache

@ -158,7 +147,8 @@ class Cache:
            None. Cache is set as a litellm param
        """
        if type == LiteLLMCacheType.REDIS:
-            self.cache: BaseCache = RedisCache(
+            if redis_startup_nodes:
+                self.cache: BaseCache = RedisClusterCache(
                    host=host,
                    port=port,
                    password=password,
@ -166,6 +156,14 @@ class Cache:
                    startup_nodes=redis_startup_nodes,
                    **kwargs,
                )
+            else:
+                self.cache = RedisCache(
+                    host=host,
+                    port=port,
+                    password=password,
+                    redis_flush_size=redis_flush_size,
+                    **kwargs,
+                )
        elif type == LiteLLMCacheType.REDIS_SEMANTIC:
            self.cache = RedisSemanticCache(
                host=host,
@ -247,7 +245,7 @@ class Cache:
            verbose_logger.debug("\nReturning preset cache key: %s", preset_cache_key)
            return preset_cache_key

-        combined_kwargs = self._get_relevant_args_to_use_for_cache_key()
+        combined_kwargs = ModelParamHelper._get_all_llm_api_params()
        litellm_param_kwargs = all_litellm_params
        for param in kwargs:
            if param in combined_kwargs:
@ -267,9 +265,7 @@ class Cache:

        verbose_logger.debug("\nCreated cache key: %s", cache_key)
        hashed_cache_key = Cache._get_hashed_cache_key(cache_key)
-        hashed_cache_key = self._add_redis_namespace_to_cache_key(
-            hashed_cache_key, **kwargs
-        )
+        hashed_cache_key = self._add_namespace_to_cache_key(hashed_cache_key, **kwargs)
        self._set_preset_cache_key_in_kwargs(
            preset_cache_key=hashed_cache_key, **kwargs
        )
@ -356,76 +352,6 @@ class Cache:
            if "litellm_params" in kwargs:
                kwargs["litellm_params"]["preset_cache_key"] = preset_cache_key

-    def _get_relevant_args_to_use_for_cache_key(self) -> Set[str]:
-        """
-        Gets the supported kwargs for each call type and combines them
-        """
-        chat_completion_kwargs = self._get_litellm_supported_chat_completion_kwargs()
-        text_completion_kwargs = self._get_litellm_supported_text_completion_kwargs()
-        embedding_kwargs = self._get_litellm_supported_embedding_kwargs()
-        transcription_kwargs = self._get_litellm_supported_transcription_kwargs()
-        rerank_kwargs = self._get_litellm_supported_rerank_kwargs()
-        exclude_kwargs = self._get_kwargs_to_exclude_from_cache_key()
-
-        combined_kwargs = chat_completion_kwargs.union(
-            text_completion_kwargs,
-            embedding_kwargs,
-            transcription_kwargs,
-            rerank_kwargs,
-        )
-        combined_kwargs = combined_kwargs.difference(exclude_kwargs)
-        return combined_kwargs
-
-    def _get_litellm_supported_chat_completion_kwargs(self) -> Set[str]:
-        """
-        Get the litellm supported chat completion kwargs
-
-        This follows the OpenAI API Spec
-        """
-        all_chat_completion_kwargs = set(
-            CompletionCreateParamsNonStreaming.__annotations__.keys()
-        ).union(set(CompletionCreateParamsStreaming.__annotations__.keys()))
-        return all_chat_completion_kwargs
-
-    def _get_litellm_supported_text_completion_kwargs(self) -> Set[str]:
-        """
-        Get the litellm supported text completion kwargs
-
-        This follows the OpenAI API Spec
-        """
-        all_text_completion_kwargs = set(
-            TextCompletionCreateParamsNonStreaming.__annotations__.keys()
-        ).union(set(TextCompletionCreateParamsStreaming.__annotations__.keys()))
-        return all_text_completion_kwargs
-
-    def _get_litellm_supported_rerank_kwargs(self) -> Set[str]:
-        """
-        Get the litellm supported rerank kwargs
-        """
-        return set(RerankRequest.model_fields.keys())
-
-    def _get_litellm_supported_embedding_kwargs(self) -> Set[str]:
-        """
-        Get the litellm supported embedding kwargs
-
-        This follows the OpenAI API Spec
-        """
-        return set(EmbeddingCreateParams.__annotations__.keys())
-
-    def _get_litellm_supported_transcription_kwargs(self) -> Set[str]:
-        """
-        Get the litellm supported transcription kwargs
-
-        This follows the OpenAI API Spec
-        """
-        return set(TranscriptionCreateParams.__annotations__.keys())
-
-    def _get_kwargs_to_exclude_from_cache_key(self) -> Set[str]:
-        """
-        Get the kwargs to exclude from the cache key
-        """
-        return set(["metadata"])
-
    @staticmethod
    def _get_hashed_cache_key(cache_key: str) -> str:
        """
@ -445,7 +371,7 @@ class Cache:
        verbose_logger.debug("Hashed cache key (SHA-256): %s", hash_hex)
        return hash_hex

-    def _add_redis_namespace_to_cache_key(self, hash_hex: str, **kwargs) -> str:
+    def _add_namespace_to_cache_key(self, hash_hex: str, **kwargs) -> str:
        """
        If a redis namespace is provided, add it to the cache key

@ -456,7 +382,12 @@ class Cache:
        Returns:
            str: The final hashed cache key with the redis namespace.
        """
-        namespace = kwargs.get("metadata", {}).get("redis_namespace") or self.namespace
+        dynamic_cache_control: DynamicCacheControl = kwargs.get("cache", {})
+        namespace = (
+            dynamic_cache_control.get("namespace")
+            or kwargs.get("metadata", {}).get("redis_namespace")
+            or self.namespace
+        )
        if namespace:
            hash_hex = f"{namespace}:{hash_hex}"
        verbose_logger.debug("Final hashed key: %s", hash_hex)
@ -536,11 +467,14 @@ class Cache:
            else:
                cache_key = self.get_cache_key(**kwargs)
            if cache_key is not None:
-                cache_control_args = kwargs.get("cache", {})
-                max_age = cache_control_args.get(
-                    "s-max-age", cache_control_args.get("s-maxage", float("inf"))
+                cache_control_args: DynamicCacheControl = kwargs.get("cache", {})
+                max_age = (
+                    cache_control_args.get("s-maxage")
+                    or cache_control_args.get("s-max-age")
+                    or float("inf")
                )
                cached_result = self.cache.get_cache(cache_key, messages=messages)
+                cached_result = self.cache.get_cache(cache_key, messages=messages)
                return self._get_cache_logic(
                    cached_result=cached_result, max_age=max_age
                )
--- a/litellm/caching/caching_handler.py
+++ b/litellm/caching/caching_handler.py
@ -247,7 +247,6 @@ class LLMCachingHandler:
                    pass
                else:
                    call_type = original_function.__name__
-
                    cached_result = self._convert_cached_result_to_model_response(
                        cached_result=cached_result,
                        call_type=call_type,
@ -725,6 +724,7 @@ class LLMCachingHandler:
        """
        Sync internal method to add the result to the cache
        """
+
        new_kwargs = kwargs.copy()
        new_kwargs.update(
            convert_args_to_kwargs(
@ -738,6 +738,7 @@ class LLMCachingHandler:
        if self._should_store_result_in_cache(
            original_function=self.original_function, kwargs=new_kwargs
        ):
+
            litellm.cache.add_cache(result, **new_kwargs)

        return
--- a/litellm/caching/redis_cache.py
+++ b/litellm/caching/redis_cache.py
@ -14,7 +14,7 @@ import inspect
 import json
 import time
 from datetime import timedelta
-from typing import TYPE_CHECKING, Any, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union

 import litellm
 from litellm._logging import print_verbose, verbose_logger
@ -26,15 +26,20 @@ from .base_cache import BaseCache

 if TYPE_CHECKING:
    from opentelemetry.trace import Span as _Span
-    from redis.asyncio import Redis
+    from redis.asyncio import Redis, RedisCluster
    from redis.asyncio.client import Pipeline
+    from redis.asyncio.cluster import ClusterPipeline

    pipeline = Pipeline
+    cluster_pipeline = ClusterPipeline
    async_redis_client = Redis
+    async_redis_cluster_client = RedisCluster
    Span = _Span
 else:
    pipeline = Any
+    cluster_pipeline = Any
    async_redis_client = Any
+    async_redis_cluster_client = Any
    Span = Any


@ -75,6 +80,7 @@ class RedisCache(BaseCache):

        redis_kwargs.update(kwargs)
        self.redis_client = get_redis_client(**redis_kwargs)
+        self.redis_async_client: Optional[async_redis_client] = None
        self.redis_kwargs = redis_kwargs
        self.async_redis_conn_pool = get_redis_connection_pool(**redis_kwargs)

@ -122,12 +128,16 @@ class RedisCache(BaseCache):
        else:
            super().__init__()  # defaults to 60s

-    def init_async_client(self):
+    def init_async_client(
+        self,
+    ) -> Union[async_redis_client, async_redis_cluster_client]:
        from .._redis import get_redis_async_client

-        return get_redis_async_client(
+        if self.redis_async_client is None:
+            self.redis_async_client = get_redis_async_client(
                connection_pool=self.async_redis_conn_pool, **self.redis_kwargs
            )
+        return self.redis_async_client

    def check_and_fix_namespace(self, key: str) -> str:
        """
@ -227,10 +237,7 @@ class RedisCache(BaseCache):
            keys = []
            _redis_client: Redis = self.init_async_client()  # type: ignore

-            async with _redis_client as redis_client:
-                async for key in redis_client.scan_iter(
-                    match=pattern + "*", count=count
-                ):
+            async for key in _redis_client.scan_iter(match=pattern + "*", count=count):
                keys.append(key)
                if len(keys) >= count:
                    break
@ -285,7 +292,6 @@ class RedisCache(BaseCache):
                    call_type="async_set_cache",
                )
            )
-            # NON blocking - notify users Redis is throwing an exception
            verbose_logger.error(
                "LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
                str(e),
@ -294,18 +300,13 @@ class RedisCache(BaseCache):
            raise e

        key = self.check_and_fix_namespace(key=key)
-        async with _redis_client as redis_client:
        ttl = self.get_ttl(**kwargs)
-            print_verbose(
-                f"Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
-            )
+        print_verbose(f"Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}")

        try:
-                if not hasattr(redis_client, "set"):
-                    raise Exception(
-                        "Redis client cannot set cache. Attribute not found."
-                    )
-                await redis_client.set(name=key, value=json.dumps(value), ex=ttl)
+            if not hasattr(_redis_client, "set"):
+                raise Exception("Redis client cannot set cache. Attribute not found.")
+            await _redis_client.set(name=key, value=json.dumps(value), ex=ttl)
            print_verbose(
                f"Successfully Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
            )
@ -337,7 +338,6 @@ class RedisCache(BaseCache):
                    event_metadata={"key": key},
                )
            )
-                # NON blocking - notify users Redis is throwing an exception
            verbose_logger.error(
                "LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
                str(e),
@ -345,8 +345,14 @@ class RedisCache(BaseCache):
            )

    async def _pipeline_helper(
-        self, pipe: pipeline, cache_list: List[Tuple[Any, Any]], ttl: Optional[float]
+        self,
+        pipe: Union[pipeline, cluster_pipeline],
+        cache_list: List[Tuple[Any, Any]],
+        ttl: Optional[float],
    ) -> List:
+        """
+        Helper function for executing a pipeline of set operations on Redis
+        """
        ttl = self.get_ttl(ttl=ttl)
        # Iterate through each key-value pair in the cache_list and set them in the pipeline.
        for cache_key, cache_value in cache_list:
@ -359,7 +365,11 @@ class RedisCache(BaseCache):
            _td: Optional[timedelta] = None
            if ttl is not None:
                _td = timedelta(seconds=ttl)
-            pipe.set(cache_key, json_cache_value, ex=_td)
+            pipe.set(  # type: ignore
+                name=cache_key,
+                value=json_cache_value,
+                ex=_td,
+            )
        # Execute the pipeline and return the results.
        results = await pipe.execute()
        return results
@ -373,9 +383,8 @@ class RedisCache(BaseCache):
        # don't waste a network request if there's nothing to set
        if len(cache_list) == 0:
            return
-        from redis.asyncio import Redis

-        _redis_client: Redis = self.init_async_client()  # type: ignore
+        _redis_client = self.init_async_client()
        start_time = time.time()

        print_verbose(
@ -383,8 +392,7 @@ class RedisCache(BaseCache):
        )
        cache_value: Any = None
        try:
-            async with _redis_client as redis_client:
-                async with redis_client.pipeline(transaction=True) as pipe:
+            async with _redis_client.pipeline(transaction=False) as pipe:
                results = await self._pipeline_helper(pipe, cache_list, ttl)

            print_verbose(f"pipeline results: {results}")
@ -473,13 +481,10 @@ class RedisCache(BaseCache):
            raise e

        key = self.check_and_fix_namespace(key=key)
-        async with _redis_client as redis_client:
-            print_verbose(
-                f"Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
-            )
+        print_verbose(f"Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}")
        try:
            await self._set_cache_sadd_helper(
-                    redis_client=redis_client, key=key, value=value, ttl=ttl
+                redis_client=_redis_client, key=key, value=value, ttl=ttl
            )
            print_verbose(
                f"Successfully Set ASYNC Redis Cache SADD: key: {key}\nValue {value}\nttl={ttl}"
@ -538,16 +543,15 @@ class RedisCache(BaseCache):
        _redis_client: Redis = self.init_async_client()  # type: ignore
        start_time = time.time()
        _used_ttl = self.get_ttl(ttl=ttl)
+        key = self.check_and_fix_namespace(key=key)
        try:
-            async with _redis_client as redis_client:
-                result = await redis_client.incrbyfloat(name=key, amount=value)
-
+            result = await _redis_client.incrbyfloat(name=key, amount=value)
            if _used_ttl is not None:
                # check if key already has ttl, if not -> set ttl
-                    current_ttl = await redis_client.ttl(key)
+                current_ttl = await _redis_client.ttl(key)
                if current_ttl == -1:
                    # Key has no expiration
-                        await redis_client.expire(key, _used_ttl)
+                    await _redis_client.expire(key, _used_ttl)

            ## LOGGING ##
            end_time = time.time()
@ -634,19 +638,48 @@ class RedisCache(BaseCache):
                "litellm.caching.caching: get() - Got exception from REDIS: ", e
            )

-    def batch_get_cache(self, key_list, parent_otel_span: Optional[Span]) -> dict:
+    def _run_redis_mget_operation(self, keys: List[str]) -> List[Any]:
+        """
+        Wrapper to call `mget` on the redis client
+
+        We use a wrapper so RedisCluster can override this method
+        """
+        return self.redis_client.mget(keys=keys)  # type: ignore
+
+    async def _async_run_redis_mget_operation(self, keys: List[str]) -> List[Any]:
+        """
+        Wrapper to call `mget` on the redis client
+
+        We use a wrapper so RedisCluster can override this method
+        """
+        async_redis_client = self.init_async_client()
+        return await async_redis_client.mget(keys=keys)  # type: ignore
+
+    def batch_get_cache(
+        self,
+        key_list: Union[List[str], List[Optional[str]]],
+        parent_otel_span: Optional[Span] = None,
+    ) -> dict:
        """
        Use Redis for bulk read operations
+
+        Args:
+            key_list: List of keys to get from Redis
+            parent_otel_span: Optional parent OpenTelemetry span
+
+        Returns:
+            dict: A dictionary mapping keys to their cached values
        """
        key_value_dict = {}
+        _key_list = [key for key in key_list if key is not None]

        try:
            _keys = []
-            for cache_key in key_list:
-                cache_key = self.check_and_fix_namespace(key=cache_key)
+            for cache_key in _key_list:
+                cache_key = self.check_and_fix_namespace(key=cache_key or "")
                _keys.append(cache_key)
            start_time = time.time()
-            results: List = self.redis_client.mget(keys=_keys)  # type: ignore
+            results: List = self._run_redis_mget_operation(keys=_keys)
            end_time = time.time()
            _duration = end_time - start_time
            self.service_logger_obj.service_success_hook(
@ -659,17 +692,19 @@ class RedisCache(BaseCache):
            )

            # Associate the results back with their keys.
-            # 'results' is a list of values corresponding to the order of keys in 'key_list'.
-            key_value_dict = dict(zip(key_list, results))
+            # 'results' is a list of values corresponding to the order of keys in '_key_list'.
+            key_value_dict = dict(zip(_key_list, results))

-            decoded_results = {
-                k.decode("utf-8"): self._get_cache_logic(v)
-                for k, v in key_value_dict.items()
-            }
+            decoded_results = {}
+            for k, v in key_value_dict.items():
+                if isinstance(k, bytes):
+                    k = k.decode("utf-8")
+                v = self._get_cache_logic(v)
+                decoded_results[k] = v

            return decoded_results
        except Exception as e:
-            print_verbose(f"Error occurred in pipeline read - {str(e)}")
+            verbose_logger.error(f"Error occurred in batch get cache - {str(e)}")
            return key_value_dict

    async def async_get_cache(
@ -680,15 +715,15 @@ class RedisCache(BaseCache):
        _redis_client: Redis = self.init_async_client()  # type: ignore
        key = self.check_and_fix_namespace(key=key)
        start_time = time.time()
-        async with _redis_client as redis_client:
+
        try:
            print_verbose(f"Get Async Redis Cache: key: {key}")
-                cached_response = await redis_client.get(key)
+            cached_response = await _redis_client.get(key)
            print_verbose(
                f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
            )
            response = self._get_cache_logic(cached_response=cached_response)
-                ## LOGGING ##
+
            end_time = time.time()
            _duration = end_time - start_time
            asyncio.create_task(
@ -704,7 +739,6 @@ class RedisCache(BaseCache):
            )
            return response
        except Exception as e:
-                ## LOGGING ##
            end_time = time.time()
            _duration = end_time - start_time
            asyncio.create_task(
@ -719,28 +753,37 @@ class RedisCache(BaseCache):
                    event_metadata={"key": key},
                )
            )
-                # NON blocking - notify users Redis is throwing an exception
            print_verbose(
                f"litellm.caching.caching: async get() - Got exception from REDIS: {str(e)}"
            )

    async def async_batch_get_cache(
-        self, key_list: List[str], parent_otel_span: Optional[Span] = None
+        self,
+        key_list: Union[List[str], List[Optional[str]]],
+        parent_otel_span: Optional[Span] = None,
    ) -> dict:
        """
        Use Redis for bulk read operations
+
+        Args:
+            key_list: List of keys to get from Redis
+            parent_otel_span: Optional parent OpenTelemetry span
+
+        Returns:
+            dict: A dictionary mapping keys to their cached values
+
+        `.mget` does not support None keys. This will filter out None keys.
        """
-        _redis_client = await self.init_async_client()
+        # `mget` is issued via `_async_run_redis_mget_operation`; redis python lib has incomplete type stubs for RedisCluster and does not include `mget`
        key_value_dict = {}
        start_time = time.time()
+        _key_list = [key for key in key_list if key is not None]
        try:
-            async with _redis_client as redis_client:
            _keys = []
-                for cache_key in key_list:
+            for cache_key in _key_list:
                cache_key = self.check_and_fix_namespace(key=cache_key)
                _keys.append(cache_key)
-                results = await redis_client.mget(keys=_keys)
-
+            results = await self._async_run_redis_mget_operation(keys=_keys)
            ## LOGGING ##
            end_time = time.time()
            _duration = end_time - start_time
@ -757,7 +800,7 @@ class RedisCache(BaseCache):

            # Associate the results back with their keys.
            # 'results' is a list of values corresponding to the order of keys in '_key_list'.
-            key_value_dict = dict(zip(key_list, results))
+            key_value_dict = dict(zip(_key_list, results))

            decoded_results = {}
            for k, v in key_value_dict.items():
@ -782,7 +825,7 @@ class RedisCache(BaseCache):
                    parent_otel_span=parent_otel_span,
                )
            )
-            print_verbose(f"Error occurred in pipeline read - {str(e)}")
+            verbose_logger.error(f"Error occurred in async batch get cache - {str(e)}")
            return key_value_dict

    def sync_ping(self) -> bool:
@ -822,12 +865,12 @@ class RedisCache(BaseCache):
            raise e

    async def ping(self) -> bool:
-        _redis_client = self.init_async_client()
+        # typed as Any, redis python lib has incomplete type stubs for RedisCluster and does not include `ping`
+        _redis_client: Any = self.init_async_client()
        start_time = time.time()
-        async with _redis_client as redis_client:
        print_verbose("Pinging Async Redis Cache")
        try:
-                response = await redis_client.ping()
+            response = await _redis_client.ping()
            ## LOGGING ##
            end_time = time.time()
            _duration = end_time - start_time
@ -858,10 +901,10 @@ class RedisCache(BaseCache):
            raise e

    async def delete_cache_keys(self, keys):
-        _redis_client = self.init_async_client()
+        # typed as Any, redis python lib has incomplete type stubs for RedisCluster and does not include `delete`
+        _redis_client: Any = self.init_async_client()
        # keys is a list, unpack it so it gets passed as individual elements to delete
-        async with _redis_client as redis_client:
-            await redis_client.delete(*keys)
+        await _redis_client.delete(*keys)

    def client_list(self) -> List:
        client_list: List = self.redis_client.client_list()  # type: ignore
@ -881,10 +924,10 @@ class RedisCache(BaseCache):
        await self.async_redis_conn_pool.disconnect(inuse_connections=True)

    async def async_delete_cache(self, key: str):
-        _redis_client = self.init_async_client()
+        # typed as Any, redis python lib has incomplete type stubs for RedisCluster and does not include `delete`
+        _redis_client: Any = self.init_async_client()
        # keys is str
-        async with _redis_client as redis_client:
-            await redis_client.delete(key)
+        await _redis_client.delete(key)

    def delete_cache(self, key):
        self.redis_client.delete(key)
@ -935,11 +978,8 @@ class RedisCache(BaseCache):
        )

        try:
-            async with _redis_client as redis_client:
-                async with redis_client.pipeline(transaction=True) as pipe:
-                    results = await self._pipeline_increment_helper(
-                        pipe, increment_list
-                    )
+            async with _redis_client.pipeline(transaction=False) as pipe:
+                results = await self._pipeline_increment_helper(pipe, increment_list)

            print_verbose(f"pipeline increment results: {results}")

@ -991,9 +1031,9 @@ class RedisCache(BaseCache):
        Redis ref: https://redis.io/docs/latest/commands/ttl/
        """
        try:
-            _redis_client = await self.init_async_client()
-            async with _redis_client as redis_client:
-                ttl = await redis_client.ttl(key)
+            # typed as Any, redis python lib has incomplete type stubs for RedisCluster and does not include `ttl`
+            _redis_client: Any = self.init_async_client()
+            ttl = await _redis_client.ttl(key)
            if ttl <= -1:  # -1 means the key has no expiry, -2 means the key does not exist
                return None
            return ttl
--- a/litellm/caching/redis_cluster_cache.py
+++ b/litellm/caching/redis_cluster_cache.py
@ -0,0 +1,83 @
+"""
+Redis Cluster Cache implementation
+
+Key differences:
+- RedisClient NEEDs to be re-used across requests, adds 3000ms latency if it's re-created
+"""
+
+from typing import TYPE_CHECKING, Any, List, Optional
+
+from litellm.caching.redis_cache import RedisCache
+
+if TYPE_CHECKING:
+    from opentelemetry.trace import Span as _Span
+    from redis.asyncio import Redis, RedisCluster
+    from redis.asyncio.client import Pipeline
+
+    pipeline = Pipeline
+    async_redis_client = Redis
+    Span = _Span
+else:
+    pipeline = Any
+    async_redis_client = Any
+    Span = Any
+
+
+class RedisClusterCache(RedisCache):
+    """
+    RedisCache variant that talks to a Redis Cluster.
+
+    Keeps a single async client alive across calls and performs bulk reads
+    with `mget_nonatomic` instead of `mget`.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Filled in by `init_async_client` the first time a cluster client is built.
+        self.redis_async_redis_cluster_client: Optional[RedisCluster] = None
+        # NOTE(review): never read or assigned again in this file - confirm it is needed.
+        self.redis_sync_redis_cluster_client: Optional[RedisCluster] = None
+
+    def init_async_client(self):
+        """
+        Return the shared async Redis client, creating it on first use.
+
+        A `RedisCluster` client is cached on `redis_async_redis_cluster_client`;
+        any other client is cached on `redis_async_client` (the attribute the
+        base class uses), so no call after the first builds a new client.
+        """
+        from redis.asyncio import RedisCluster
+
+        from .._redis import get_redis_async_client
+
+        if self.redis_async_redis_cluster_client is not None:
+            return self.redis_async_redis_cluster_client
+        if self.redis_async_client is not None:
+            return self.redis_async_client
+
+        _redis_client = get_redis_async_client(
+            connection_pool=self.async_redis_conn_pool, **self.redis_kwargs
+        )
+        if isinstance(_redis_client, RedisCluster):
+            self.redis_async_redis_cluster_client = _redis_client
+        else:
+            self.redis_async_client = _redis_client
+
+        return _redis_client
+
+    def _run_redis_mget_operation(self, keys: List[str]) -> List[Any]:
+        """
+        Overrides `_run_redis_mget_operation` in redis_cache.py
+
+        Calls `mget_nonatomic` on the sync client instead of `mget`.
+        """
+        return self.redis_client.mget_nonatomic(keys=keys)  # type: ignore
+
+    async def _async_run_redis_mget_operation(self, keys: List[str]) -> List[Any]:
+        """
+        Overrides `_async_run_redis_mget_operation` in redis_cache.py
+
+        Calls `mget_nonatomic` on the async client instead of `mget`.
+        """
+        async_redis_cluster_client = self.init_async_client()
+        return await async_redis_cluster_client.mget_nonatomic(keys=keys)  # type: ignore
--- a/litellm/constants.py
+++ b/litellm/constants.py
@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Literal

 ROUTER_MAX_FALLBACKS = 5
 DEFAULT_BATCH_SIZE = 512
@ -120,6 +120,7 @@ OPENAI_CHAT_COMPLETION_PARAMS = [
    "top_logprobs",
    "reasoning_effort",
    "extra_headers",
+    "thinking",
 ]

 openai_compatible_endpoints: List = [
@ -319,6 +320,17 @@ baseten_models: List = [
    "31dxrj3",
 ]  # FALCON 7B  # WizardLM  # Mosaic ML

+BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
+    "cohere",
+    "anthropic",
+    "mistral",
+    "amazon",
+    "meta",
+    "llama",
+    "ai21",
+    "nova",
+    "deepseek_r1",
+]

 open_ai_embedding_models: List = ["text-embedding-ada-002"]
 cohere_embedding_models: List = [
@ -335,6 +347,63 @@ bedrock_embedding_models: List = [
    "cohere.embed-multilingual-v3",
 ]

+known_tokenizer_config = {
+    "mistralai/Mistral-7B-Instruct-v0.1": {
+        "tokenizer": {
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+        },
+        "status": "success",
+    },
+    "meta-llama/Meta-Llama-3-8B-Instruct": {
+        "tokenizer": {
+            "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "",
+        },
+        "status": "success",
+    },
+    "deepseek-r1/deepseek-r1-7b-instruct": {
+        "tokenizer": {
+            "add_bos_token": True,
+            "add_eos_token": False,
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<｜begin▁of▁sentence｜>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False,
+            },
+            "clean_up_tokenization_spaces": False,
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "<｜end▁of▁sentence｜>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False,
+            },
+            "legacy": True,
+            "model_max_length": 16384,
+            "pad_token": {
+                "__type": "AddedToken",
+                "content": "<｜end▁of▁sentence｜>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False,
+            },
+            "sp_model_kwargs": {},
+            "unk_token": None,
+            "tokenizer_class": "LlamaTokenizerFast",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if 
ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\\n'}}{% endif %}",
+        },
+        "status": "success",
+    },
+}
+

 OPENAI_FINISH_REASONS = ["stop", "length", "function_call", "content_filter", "null"]
 HUMANLOOP_PROMPT_CACHE_TTL_SECONDS = 60  # 1 minute
@ -368,3 +437,4 @@ BATCH_STATUS_POLL_MAX_ATTEMPTS = 24  # for 24 hours
 HEALTH_CHECK_TIMEOUT_SECONDS = 60  # 60 seconds

 UI_SESSION_TOKEN_TEAM_ID = "litellm-dashboard"
+LITELLM_PROXY_ADMIN_NAME = "default_user_id"
--- a/litellm/cost_calculator.py
+++ b/litellm/cost_calculator.py
@ -16,15 +16,9 @@ from litellm.llms.anthropic.cost_calculation import (
 from litellm.llms.azure.cost_calculation import (
    cost_per_token as azure_openai_cost_per_token,
 )
-from litellm.llms.azure_ai.cost_calculator import (
-    cost_per_query as azure_ai_rerank_cost_per_query,
-)
 from litellm.llms.bedrock.image.cost_calculator import (
    cost_calculator as bedrock_image_cost_calculator,
 )
-from litellm.llms.cohere.cost_calculator import (
-    cost_per_query as cohere_rerank_cost_per_query,
-)
 from litellm.llms.databricks.cost_calculator import (
    cost_per_token as databricks_cost_per_token,
 )
@ -51,10 +45,12 @@ from litellm.llms.vertex_ai.image_generation.cost_calculator import (
    cost_calculator as vertex_ai_image_cost_calculator,
 )
 from litellm.types.llms.openai import HttpxBinaryResponseContent
-from litellm.types.rerank import RerankResponse
+from litellm.types.rerank import RerankBilledUnits, RerankResponse
 from litellm.types.utils import (
    CallTypesLiteral,
+    LlmProviders,
    LlmProvidersSet,
+    ModelInfo,
    PassthroughCallTypes,
    Usage,
 )
@ -64,6 +60,7 @@ from litellm.utils import (
    EmbeddingResponse,
    ImageResponse,
    ModelResponse,
+    ProviderConfigManager,
    TextCompletionResponse,
    TranscriptionResponse,
    _cached_get_model_info_helper,
@ -114,6 +111,8 @@ def cost_per_token(  # noqa: PLR0915
    number_of_queries: Optional[int] = None,
    ### USAGE OBJECT ###
    usage_object: Optional[Usage] = None,  # just read the usage object if provided
+    ### BILLED UNITS ###
+    rerank_billed_units: Optional[RerankBilledUnits] = None,
    ### CALL TYPE ###
    call_type: CallTypesLiteral = "completion",
    audio_transcription_file_duration: float = 0.0,  # for audio transcription calls - the file time in seconds
@ -238,6 +237,16 @@ def cost_per_token(  # noqa: PLR0915
        return rerank_cost(
            model=model,
            custom_llm_provider=custom_llm_provider,
+            billed_units=rerank_billed_units,
+        )
+    elif (
+        call_type == "aretrieve_batch"
+        or call_type == "retrieve_batch"
+        or call_type == CallTypes.aretrieve_batch
+        or call_type == CallTypes.retrieve_batch
+    ):
+        return batch_cost_calculator(
+            usage=usage_block, model=model, custom_llm_provider=custom_llm_provider
        )
    elif call_type == "atranscription" or call_type == "transcription":
        return openai_cost_per_second(
@ -399,9 +408,12 @@ def _select_model_name_for_cost_calc(
    if base_model is not None:
        return_model = base_model

-    completion_response_model: Optional[str] = getattr(
-        completion_response, "model", None
-    )
+    completion_response_model: Optional[str] = None
+    if completion_response is not None:
+        if isinstance(completion_response, BaseModel):
+            completion_response_model = getattr(completion_response, "model", None)
+        elif isinstance(completion_response, dict):
+            completion_response_model = completion_response.get("model", None)
    hidden_params: Optional[dict] = getattr(completion_response, "_hidden_params", None)
    if completion_response_model is None and hidden_params is not None:
        if (
@ -552,6 +564,7 @@ def completion_cost(  # noqa: PLR0915
        cost_per_token_usage_object: Optional[Usage] = _get_usage_object(
            completion_response=completion_response
        )
+        rerank_billed_units: Optional[RerankBilledUnits] = None
        model = _select_model_name_for_cost_calc(
            model=model,
            completion_response=completion_response,
@ -698,6 +711,11 @@ def completion_cost(  # noqa: PLR0915
                else:
                    billed_units = {}

+                rerank_billed_units = RerankBilledUnits(
+                    search_units=billed_units.get("search_units"),
+                    total_tokens=billed_units.get("total_tokens"),
+                )
+
                search_units = (
                    billed_units.get("search_units") or 1
                )  # cohere charges per request by default.
@ -763,6 +781,7 @@ def completion_cost(  # noqa: PLR0915
            usage_object=cost_per_token_usage_object,
            call_type=call_type,
            audio_transcription_file_duration=audio_transcription_file_duration,
+            rerank_billed_units=rerank_billed_units,
        )
        _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar

@ -836,27 +855,36 @@ def response_cost_calculator(
 def rerank_cost(
    model: str,
    custom_llm_provider: Optional[str],
+    billed_units: Optional[RerankBilledUnits] = None,
 ) -> Tuple[float, float]:
    """
    Returns
    - Tuple[float, float]: cost pair computed by the provider's rerank config; errors are re-raised.
    """
-    default_num_queries = 1
    _, custom_llm_provider, _, _ = litellm.get_llm_provider(
        model=model, custom_llm_provider=custom_llm_provider
    )

    try:
-        if custom_llm_provider == "cohere":
-            return cohere_rerank_cost_per_query(
-                model=model, num_queries=default_num_queries
+        config = ProviderConfigManager.get_provider_rerank_config(
+            model=model,
+            api_base=None,
+            present_version_params=[],
+            provider=LlmProviders(custom_llm_provider),
        )
-        elif custom_llm_provider == "azure_ai":
-            return azure_ai_rerank_cost_per_query(
-                model=model, num_queries=default_num_queries
+
+        try:
+            model_info: Optional[ModelInfo] = litellm.get_model_info(
+                model=model, custom_llm_provider=custom_llm_provider
            )
-        raise ValueError(
-            f"invalid custom_llm_provider for rerank model: {model}, custom_llm_provider: {custom_llm_provider}"
+        except Exception:
+            model_info = None
+
+        return config.calculate_rerank_cost(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            billed_units=billed_units,
+            model_info=model_info,
        )
    except Exception as e:
        raise e
@ -941,3 +969,55 @ def default_image_cost_calculator(
            )

    return cost_info["input_cost_per_pixel"] * height * width * n
+
+
+def batch_cost_calculator(
+    usage: Usage,
+    model: str,
+    custom_llm_provider: Optional[str] = None,
+) -> Tuple[float, float]:
+    """
+    Work out what a batch job costs from its token usage.
+
+    Returns a (prompt cost, completion cost) pair. Each side is priced with the
+    model's `*_cost_per_token_batches` rate when one is set, otherwise with half
+    of its regular per-token rate. Both are 0.0 when no model info is found.
+    """
+    _, custom_llm_provider, _, _ = litellm.get_llm_provider(
+        model=model, custom_llm_provider=custom_llm_provider
+    )
+    verbose_logger.info(
+        "Calculating batch cost per token. model=%s, custom_llm_provider=%s",
+        model,
+        custom_llm_provider,
+    )
+
+    pricing: Optional[ModelInfo]
+    try:
+        pricing = litellm.get_model_info(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
+    except Exception:
+        pricing = None
+    if not pricing:
+        return 0.0, 0.0
+
+    prompt_cost = 0.0
+    batch_input_rate = pricing.get("input_cost_per_token_batches")
+    regular_input_rate = pricing.get("input_cost_per_token")
+    if batch_input_rate:
+        prompt_cost = usage.prompt_tokens * batch_input_rate
+    elif regular_input_rate:
+        # batch cost is usually half of the regular token cost
+        prompt_cost = usage.prompt_tokens * regular_input_rate / 2
+
+    completion_cost = 0.0
+    batch_output_rate = pricing.get("output_cost_per_token_batches")
+    regular_output_rate = pricing.get("output_cost_per_token")
+    if batch_output_rate:
+        completion_cost = usage.completion_tokens * batch_output_rate
+    elif regular_output_rate:
+        # batch cost is usually half of the regular token cost
+        completion_cost = usage.completion_tokens * regular_output_rate / 2
+
+    return prompt_cost, completion_cost
--- a/Show more
+++ b/Show more