Merge branch 'main' into litellm_dev_03_05_2025_contributor_prs

commit 95515b49cb (GPG key ID: B5690EEEBB952194)
Author: Krish Dholakia (committed via GitHub)
Date:   2025-04-10 12:37:56 -07:00

994 changed files with 66381 additions and 17305 deletions


@ -3,6 +3,18 @@ orbs:
codecov: codecov/codecov@4.0.1
node: circleci/node@5.1.0 # Add this line to declare the node orb
commands:
setup_google_dns:
steps:
- run:
name: "Configure Google DNS"
command: |
# Backup original resolv.conf
sudo cp /etc/resolv.conf /etc/resolv.conf.backup
# Add both local and Google DNS servers
echo "nameserver 127.0.0.11" | sudo tee /etc/resolv.conf
echo "nameserver 8.8.8.8" | sudo tee -a /etc/resolv.conf
echo "nameserver 8.8.4.4" | sudo tee -a /etc/resolv.conf
jobs:
local_testing:
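
The hunk above declares a reusable CircleCI command, `setup_google_dns`, which the hunks below then invoke from individual jobs. A minimal sketch of the declare/invoke pattern, using an illustrative job name (`example_job` is not part of this config):

```
version: 2.1            # reusable commands require config version 2.1+
commands:
  setup_google_dns:
    steps:
      - run:
          name: "Configure Google DNS"
          command: echo "nameserver 8.8.8.8" | sudo tee -a /etc/resolv.conf
jobs:
  example_job:            # illustrative job name, not taken from this config
    docker:
      - image: cimg/python:3.11
    steps:
      - checkout
      - setup_google_dns  # invoke the reusable command by name
      - run: python --version
```
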
@ -15,7 +27,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Show git commit hash
command: |
@ -49,7 +61,7 @@ jobs:
pip install opentelemetry-api==1.25.0
pip install opentelemetry-sdk==1.25.0
pip install opentelemetry-exporter-otlp==1.25.0
pip install openai==1.54.0
pip install openai==1.68.2
pip install prisma==0.11.0
pip install "detect_secrets==1.5.0"
pip install "httpx==0.24.1"
@ -66,12 +78,12 @@ jobs:
pip install python-multipart
pip install google-cloud-aiplatform
pip install prometheus-client==0.20.0
pip install "pydantic==2.7.1"
pip install "pydantic==2.10.2"
pip install "diskcache==5.6.1"
pip install "Pillow==10.3.0"
pip install "jsonschema==4.22.0"
pip install "pytest-xdist==3.6.1"
pip install "websockets==10.4"
pip install "websockets==13.1.0"
pip uninstall posthog -y
- save_cache:
paths:
@ -134,7 +146,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Show git commit hash
command: |
@ -168,7 +180,7 @@ jobs:
pip install opentelemetry-api==1.25.0
pip install opentelemetry-sdk==1.25.0
pip install opentelemetry-exporter-otlp==1.25.0
pip install openai==1.54.0
pip install openai==1.68.2
pip install prisma==0.11.0
pip install "detect_secrets==1.5.0"
pip install "httpx==0.24.1"
@ -185,10 +197,11 @@ jobs:
pip install python-multipart
pip install google-cloud-aiplatform
pip install prometheus-client==0.20.0
pip install "pydantic==2.7.1"
pip install "pydantic==2.10.2"
pip install "diskcache==5.6.1"
pip install "Pillow==10.3.0"
pip install "jsonschema==4.22.0"
pip install "websockets==13.1.0"
- save_cache:
paths:
- ./venv
@ -233,7 +246,13 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: DNS lookup for Redis host
command: |
sudo apt-get update
sudo apt-get install -y dnsutils
dig redis-19899.c239.us-east-1-2.ec2.redns.redis-cloud.com +short
- run:
name: Show git commit hash
command: |
@ -267,7 +286,7 @@ jobs:
pip install opentelemetry-api==1.25.0
pip install opentelemetry-sdk==1.25.0
pip install opentelemetry-exporter-otlp==1.25.0
pip install openai==1.54.0
pip install openai==1.68.2
pip install prisma==0.11.0
pip install "detect_secrets==1.5.0"
pip install "httpx==0.24.1"
@ -284,10 +303,11 @@ jobs:
pip install python-multipart
pip install google-cloud-aiplatform
pip install prometheus-client==0.20.0
pip install "pydantic==2.7.1"
pip install "pydantic==2.10.2"
pip install "diskcache==5.6.1"
pip install "Pillow==10.3.0"
pip install "jsonschema==4.22.0"
pip install "websockets==13.1.0"
- save_cache:
paths:
- ./venv
@ -332,6 +352,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -386,6 +407,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -402,7 +424,7 @@ jobs:
command: |
pwd
ls
python -m pytest tests/local_testing tests/router_unit_tests --cov=litellm --cov-report=xml -vv -k "router" -x -s -v --junitxml=test-results/junit.xml --durations=5
python -m pytest tests/local_testing tests/router_unit_tests --cov=litellm --cov-report=xml -vv -k "router" -x -v --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
- run:
name: Rename the coverage files
@ -427,6 +449,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Show git commit hash
command: |
@ -477,7 +500,13 @@ jobs:
working_directory: ~/project
steps:
- checkout
- run:
name: Install PostgreSQL
command: |
sudo apt-get update
sudo apt-get install postgresql postgresql-contrib
echo 'export PATH=/usr/lib/postgresql/*/bin:$PATH' >> $BASH_ENV
- setup_google_dns
- run:
name: Show git commit hash
command: |
@ -511,7 +540,7 @@ jobs:
pip install opentelemetry-api==1.25.0
pip install opentelemetry-sdk==1.25.0
pip install opentelemetry-exporter-otlp==1.25.0
pip install openai==1.54.0
pip install openai==1.68.2
pip install prisma==0.11.0
pip install "detect_secrets==1.5.0"
pip install "httpx==0.24.1"
@ -528,10 +557,11 @@ jobs:
pip install python-multipart
pip install google-cloud-aiplatform
pip install prometheus-client==0.20.0
pip install "pydantic==2.7.1"
pip install "pydantic==2.10.2"
pip install "diskcache==5.6.1"
pip install "Pillow==10.3.0"
pip install "jsonschema==4.22.0"
pip install "pytest-postgresql==7.0.1"
- save_cache:
paths:
- ./venv
@ -567,7 +597,7 @@ jobs:
- litellm_proxy_unit_tests_coverage
litellm_assistants_api_testing: # Runs all tests with the "assistants" keyword
docker:
- image: cimg/python:3.11
- image: cimg/python:3.13.1
auth:
username: ${DOCKERHUB_USERNAME}
password: ${DOCKERHUB_PASSWORD}
@ -575,10 +605,13 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
python -m pip install --upgrade pip
pip install wheel
pip install --upgrade pip wheel setuptools
python -m pip install -r requirements.txt
pip install "pytest==7.3.1"
pip install "respx==0.21.1"
@ -616,6 +649,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -623,7 +657,13 @@ jobs:
python -m pip install -r requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
pip install "pytest-cov==5.0.0"
pip install "pytest-asyncio==0.21.1"
pip install "respx==0.21.1"
- run:
name: Show current pydantic version
command: |
python -m pip show pydantic
# Run pytest and generate JUnit XML report
- run:
name: Run tests
@ -646,6 +686,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -662,7 +703,7 @@ jobs:
command: |
pwd
ls
python -m pytest -vv tests/llm_translation --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
python -m pytest -vv tests/llm_translation --cov=litellm --cov-report=xml -x -v --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
- run:
name: Rename the coverage files
@ -678,6 +719,94 @@ jobs:
paths:
- llm_translation_coverage.xml
- llm_translation_coverage
mcp_testing:
docker:
- image: cimg/python:3.11
auth:
username: ${DOCKERHUB_USERNAME}
password: ${DOCKERHUB_PASSWORD}
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
pip install "pytest-cov==5.0.0"
pip install "pytest-asyncio==0.21.1"
pip install "respx==0.21.1"
pip install "pydantic==2.10.2"
pip install "mcp==1.5.0"
# Run pytest and generate JUnit XML report
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/mcp_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
- run:
name: Rename the coverage files
command: |
mv coverage.xml mcp_coverage.xml
mv .coverage mcp_coverage
# Store test results
- store_test_results:
path: test-results
- persist_to_workspace:
root: .
paths:
- mcp_coverage.xml
- mcp_coverage
llm_responses_api_testing:
docker:
- image: cimg/python:3.11
auth:
username: ${DOCKERHUB_USERNAME}
password: ${DOCKERHUB_PASSWORD}
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
pip install "pytest-cov==5.0.0"
pip install "pytest-asyncio==0.21.1"
pip install "respx==0.21.1"
# Run pytest and generate JUnit XML report
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/llm_responses_api_testing --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
- run:
name: Rename the coverage files
command: |
mv coverage.xml llm_responses_api_coverage.xml
mv .coverage llm_responses_api_coverage
# Store test results
- store_test_results:
path: test-results
- persist_to_workspace:
root: .
paths:
- llm_responses_api_coverage.xml
- llm_responses_api_coverage
litellm_mapped_tests:
docker:
- image: cimg/python:3.11
@ -688,6 +817,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -700,6 +830,8 @@ jobs:
pip install "pytest-asyncio==0.21.1"
pip install "respx==0.21.1"
pip install "hypercorn==0.17.3"
pip install "pydantic==2.10.2"
pip install "mcp==1.5.0"
# Run pytest and generate JUnit XML report
- run:
name: Run tests
@ -732,6 +864,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -776,10 +909,12 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
python -m pip install --upgrade pip
pip install numpydoc
python -m pip install -r requirements.txt
pip install "respx==0.21.1"
pip install "pytest==7.3.1"
@ -788,7 +923,6 @@ jobs:
pip install "pytest-cov==5.0.0"
pip install "google-generativeai==0.3.2"
pip install "google-cloud-aiplatform==1.43.0"
pip install numpydoc
# Run pytest and generate JUnit XML report
- run:
name: Run tests
@ -822,6 +956,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -864,6 +999,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -906,6 +1042,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -952,6 +1089,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -964,8 +1102,8 @@ jobs:
pip install click
pip install "boto3==1.34.34"
pip install jinja2
pip install tokenizers=="0.20.0"
pip install uvloop==0.21.0
pip install "tokenizers==0.20.0"
pip install "uvloop==0.21.0"
pip install jsonschema
- run:
name: Run tests
@ -984,10 +1122,12 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
python -m pip install --upgrade pip
python -m pip install wheel setuptools
python -m pip install -r requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
@ -1008,6 +1148,7 @@ jobs:
steps:
- checkout
- setup_google_dns
# Install Helm
- run:
name: Install Helm
@ -1077,6 +1218,7 @@ jobs:
steps:
- checkout
- setup_google_dns
- run:
name: Install Dependencies
command: |
@ -1113,6 +1255,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Python 3.9
command: |
@ -1187,6 +1330,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1234,7 +1378,7 @@ jobs:
pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
pip install "openai==1.54.0 "
pip install "openai==1.68.2"
- run:
name: Install Grype
command: |
@ -1309,19 +1453,20 @@ jobs:
command: |
pwd
ls
python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/spend_tracking_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/mcp_tests --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
no_output_timeout: 120m
# Store test results
- store_test_results:
path: test-results
e2e_openai_misc_endpoints:
e2e_openai_endpoints:
machine:
image: ubuntu-2204:2023.10.1
resource_class: xlarge
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1358,6 +1503,7 @@ jobs:
pip install "boto3==1.34.34"
pip install "aioboto3==12.3.0"
pip install langchain
pip install "langchain_mcp_adapters==0.0.5"
pip install "langfuse>=2.0.0"
pip install "logfire==0.29.0"
pip install numpydoc
@ -1370,7 +1516,7 @@ jobs:
pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
pip install "openai==1.54.0 "
pip install "openai==1.68.2"
# Run pytest and generate JUnit XML report
- run:
name: Build Docker image
@ -1432,7 +1578,7 @@ jobs:
command: |
pwd
ls
python -m pytest -s -vv tests/openai_misc_endpoints_tests --junitxml=test-results/junit.xml --durations=5
python -m pytest -s -vv tests/openai_endpoints_tests --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
# Store test results
@ -1445,6 +1591,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1492,7 +1639,7 @@ jobs:
pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
pip install "openai==1.54.0 "
pip install "openai==1.68.2"
- run:
name: Build Docker image
command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
@ -1599,6 +1746,96 @@ jobs:
# Store test results
- store_test_results:
path: test-results
proxy_spend_accuracy_tests:
machine:
image: ubuntu-2204:2023.10.1
resource_class: xlarge
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
- run:
name: Install Python 3.9
command: |
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda init bash
source ~/.bashrc
conda create -n myenv python=3.9 -y
conda activate myenv
python --version
- run:
name: Install Dependencies
command: |
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
- run:
name: Build Docker image
command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
- run:
name: Run Docker container
# intentionally give bad redis credentials here
# the OTEL test - should get this as a trace
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e REDIS_HOST=$REDIS_HOST \
-e REDIS_PASSWORD=$REDIS_PASSWORD \
-e REDIS_PORT=$REDIS_PORT \
-e LITELLM_MASTER_KEY="sk-1234" \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
-e LITELLM_LICENSE=$LITELLM_LICENSE \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e USE_DDTRACE=True \
-e DD_API_KEY=$DD_API_KEY \
-e DD_SITE=$DD_SITE \
-e AWS_REGION_NAME=$AWS_REGION_NAME \
--name my-app \
-v $(pwd)/litellm/proxy/example_config_yaml/spend_tracking_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--detailed_debug \
- run:
name: Install curl and dockerize
command: |
sudo apt-get update
sudo apt-get install -y curl
sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
- run:
name: Start outputting logs
command: docker logs -f my-app
background: true
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 5m
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/spend_tracking_tests -x --junitxml=test-results/junit.xml --durations=5
no_output_timeout:
120m
# Clean up first container
- run:
name: Stop and remove first container
command: |
docker stop my-app
docker rm my-app
proxy_multi_instance_tests:
machine:
@ -1607,6 +1844,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1718,6 +1956,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1757,7 +1996,7 @@ jobs:
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e DATABASE_URL=$CLEAN_STORE_MODEL_IN_DB_DATABASE_URL \
-e STORE_MODEL_IN_DB="True" \
-e LITELLM_MASTER_KEY="sk-1234" \
-e LITELLM_LICENSE=$LITELLM_LICENSE \
@ -1800,6 +2039,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
# Remove Docker CLI installation since it's already available in machine executor
- run:
name: Install Python 3.13
@ -1897,6 +2137,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1921,10 +2162,10 @@ jobs:
pip install "pytest-asyncio==0.21.1"
pip install "google-cloud-aiplatform==1.43.0"
pip install aiohttp
pip install "openai==1.54.0 "
pip install "openai==1.68.2"
pip install "assemblyai==0.37.0"
python -m pip install --upgrade pip
pip install "pydantic==2.7.1"
pip install "pydantic==2.10.2"
pip install "pytest==7.3.1"
pip install "pytest-mock==3.12.0"
pip install "pytest-asyncio==0.21.1"
@ -1935,12 +2176,15 @@ jobs:
pip install prisma
pip install fastapi
pip install jsonschema
pip install "httpx==0.24.1"
pip install "httpx==0.27.0"
pip install "anyio==3.7.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
pip install "google-cloud-aiplatform==1.59.0"
pip install "anthropic==0.21.3"
pip install "anthropic==0.49.0"
pip install "langchain_mcp_adapters==0.0.5"
pip install "langchain_openai==0.2.1"
pip install "langgraph==0.3.18"
# Run pytest and generate JUnit XML report
- run:
name: Build Docker image
@ -2068,7 +2312,7 @@ jobs:
python -m venv venv
. venv/bin/activate
pip install coverage
coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
coverage combine llm_translation_coverage llm_responses_api_coverage mcp_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
coverage xml
- codecov/upload:
file: ./coverage.xml
@ -2153,6 +2397,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Build UI
command: |
@ -2197,9 +2442,9 @@ jobs:
pip install "pytest-retry==1.6.3"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
pip install "openai==1.54.0 "
pip install "openai==1.68.2"
python -m pip install --upgrade pip
pip install "pydantic==2.7.1"
pip install "pydantic==2.10.2"
pip install "pytest==7.3.1"
pip install "pytest-mock==3.12.0"
pip install "pytest-asyncio==0.21.1"
@ -2267,6 +2512,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Build Docker image
command: |
@ -2289,6 +2535,7 @@ jobs:
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Build Docker image
command: |
@ -2387,7 +2634,7 @@ workflows:
only:
- main
- /litellm_.*/
- e2e_openai_misc_endpoints:
- e2e_openai_endpoints:
filters:
branches:
only:
@ -2399,6 +2646,12 @@ workflows:
only:
- main
- /litellm_.*/
- proxy_spend_accuracy_tests:
filters:
branches:
only:
- main
- /litellm_.*/
- proxy_multi_instance_tests:
filters:
branches:
@ -2429,6 +2682,18 @@ workflows:
only:
- main
- /litellm_.*/
- mcp_testing:
filters:
branches:
only:
- main
- /litellm_.*/
- llm_responses_api_testing:
filters:
branches:
only:
- main
- /litellm_.*/
- litellm_mapped_tests:
filters:
branches:
@ -2468,6 +2733,8 @@ workflows:
- upload-coverage:
requires:
- llm_translation_testing
- mcp_testing
- llm_responses_api_testing
- litellm_mapped_tests
- batches_testing
- litellm_utils_testing
@ -2522,10 +2789,12 @@ workflows:
requires:
- local_testing
- build_and_test
- e2e_openai_misc_endpoints
- e2e_openai_endpoints
- load_testing
- test_bad_database_url
- llm_translation_testing
- mcp_testing
- llm_responses_api_testing
- litellm_mapped_tests
- batches_testing
- litellm_utils_testing
@ -2544,6 +2813,7 @@ workflows:
- installing_litellm_on_python
- installing_litellm_on_python_3_13
- proxy_logging_guardrails_model_info_tests
- proxy_spend_accuracy_tests
- proxy_multi_instance_tests
- proxy_store_model_in_db_tests
- proxy_build_from_pip_tests


@ -1,13 +1,15 @@
# used by CI/CD testing
openai==1.54.0
openai==1.68.2
python-dotenv
tiktoken
importlib_metadata
cohere
redis
redis==5.2.1
redisvl==0.4.1
anthropic
orjson==3.9.15
pydantic==2.7.1
pydantic==2.10.2
google-cloud-aiplatform==1.43.0
fastapi-sso==0.10.0
fastapi-sso==0.16.0
uvloop==0.21.0
mcp==1.5.0 # for MCP server


@ -6,6 +6,16 @@
<!-- e.g. "Fixes #000" -->
## Pre-Submission checklist
**Please complete all items before asking a LiteLLM maintainer to review your PR**
- [ ] I have added testing in the [`tests/litellm/`](https://github.com/BerriAI/litellm/tree/main/tests/litellm) directory, **Adding at least 1 test is a hard requirement** - [see details](https://docs.litellm.ai/docs/extras/contributing_code)
- [ ] I have added a screenshot of my new test passing locally
- [ ] My PR passes all unit tests on [`make test-unit`](https://docs.litellm.ai/docs/extras/contributing_code)
- [ ] My PR's scope is as isolated as possible, it only solves 1 specific problem
## Type
<!-- Select the type of Pull Request -->
@ -20,10 +30,4 @@
## Changes
<!-- List of changes -->
## [REQUIRED] Testing - Attach a screenshot of any new tests passing locally
If UI changes, send a screenshot/GIF of working UI fixes
<!-- Test procedure -->


@ -80,7 +80,6 @@ jobs:
permissions:
contents: read
packages: write
#
steps:
- name: Checkout repository
uses: actions/checkout@v4
@ -112,7 +111,11 @@ jobs:
with:
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }} # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
tags: |
${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }}
${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm:main-stable', env.REGISTRY) || '' }}
labels: ${{ steps.meta.outputs.labels }}
platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
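
As a worked illustration of the new conditional `tags:` expression above — using assumed example inputs (`tag: v1.65.0`, `release_type: stable`) and assuming `env.REGISTRY` is `ghcr.io`, neither of which is shown in this hunk — the block would expand to roughly:

```
# Hypothetical expansion only; <meta-tags> stands for whatever
# steps.meta.outputs.tags resolves to at runtime.
tags: |
  <meta-tags>-v1.65.0,
  <meta-tags>-stable
  ghcr.io/berriai/litellm:main-v1.65.0,
  ghcr.io/berriai/litellm:main-stable
```
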
@ -151,8 +154,12 @@ jobs:
context: .
file: ./docker/Dockerfile.database
push: true
tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
labels: ${{ steps.meta-database.outputs.labels }}
tags: |
${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-database:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-database:main-stable', env.REGISTRY) || '' }}
labels: ${{ steps.meta-database.outputs.labels }}
platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-image-non_root:
@ -190,7 +197,11 @@ jobs:
context: .
file: ./docker/Dockerfile.non_root
push: true
tags: ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.release_type }}
tags: |
${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.release_type }}
${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-non_root:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-non_root:main-stable', env.REGISTRY) || '' }}
labels: ${{ steps.meta-non_root.outputs.labels }}
platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
@ -229,7 +240,11 @@ jobs:
context: .
file: ./litellm-js/spend-logs/Dockerfile
push: true
tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
tags: |
${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-spend_logs:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-spend_logs:main-stable', env.REGISTRY) || '' }}
platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-helm-chart:

.github/workflows/helm_unit_test.yml (new file, 27 lines)

@ -0,0 +1,27 @@
name: Helm unit test
on:
pull_request:
push:
branches:
- main
jobs:
unit-test:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Set up Helm 3.11.1
uses: azure/setup-helm@v1
with:
version: '3.11.1'
- name: Install Helm Unit Test Plugin
run: |
helm plugin install https://github.com/helm-unittest/helm-unittest --version v0.4.4
- name: Run unit tests
run:
helm unittest -f 'tests/*.yaml' deploy/charts/litellm-helm
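
This workflow (and the matching `test-unit-helm` Makefile target added later in this commit) runs `helm unittest` against `deploy/charts/litellm-helm`. A minimal sketch of a suite file that would live under `deploy/charts/litellm-helm/tests/` — the template path and expected values below are assumptions for illustration, not taken from the chart:

```
# deploy/charts/litellm-helm/tests/deployment_tests.yaml (hypothetical example)
suite: deployment tests
templates:
  - templates/deployment.yaml     # assumed template path
tests:
  - it: renders a Deployment
    asserts:
      - isKind:
          of: Deployment
      - equal:
          path: spec.template.spec.containers[0].name
          value: litellm          # assumed container name
```
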

.github/workflows/publish-migrations.yml (new file, 206 lines)

@ -0,0 +1,206 @@
name: Publish Prisma Migrations
permissions:
contents: write
pull-requests: write
on:
push:
paths:
- 'schema.prisma' # Check root schema.prisma
branches:
- main
jobs:
publish-migrations:
runs-on: ubuntu-latest
services:
postgres:
image: postgres:14
env:
POSTGRES_DB: temp_db
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
ports:
- 5432:5432
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
# Add shadow database service
postgres_shadow:
image: postgres:14
env:
POSTGRES_DB: shadow_db
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
ports:
- 5433:5432
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.x'
- name: Install Dependencies
run: |
pip install prisma
pip install python-dotenv
- name: Generate Initial Migration if None Exists
env:
DATABASE_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
DIRECT_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
SHADOW_DATABASE_URL: "postgresql://postgres:postgres@localhost:5433/shadow_db"
run: |
mkdir -p deploy/migrations
echo 'provider = "postgresql"' > deploy/migrations/migration_lock.toml
if [ -z "$(ls -A deploy/migrations/2* 2>/dev/null)" ]; then
echo "No existing migrations found, creating baseline..."
VERSION=$(date +%Y%m%d%H%M%S)
mkdir -p deploy/migrations/${VERSION}_initial
echo "Generating initial migration..."
# Save raw output for debugging
prisma migrate diff \
--from-empty \
--to-schema-datamodel schema.prisma \
--shadow-database-url "${SHADOW_DATABASE_URL}" \
--script > deploy/migrations/${VERSION}_initial/raw_migration.sql
echo "Raw migration file content:"
cat deploy/migrations/${VERSION}_initial/raw_migration.sql
echo "Cleaning migration file..."
# Clean the file
sed '/^Installing/d' deploy/migrations/${VERSION}_initial/raw_migration.sql > deploy/migrations/${VERSION}_initial/migration.sql
# Verify the migration file
if [ ! -s deploy/migrations/${VERSION}_initial/migration.sql ]; then
echo "ERROR: Migration file is empty after cleaning"
echo "Original content was:"
cat deploy/migrations/${VERSION}_initial/raw_migration.sql
exit 1
fi
echo "Final migration file content:"
cat deploy/migrations/${VERSION}_initial/migration.sql
# Verify it starts with SQL
if ! head -n 1 deploy/migrations/${VERSION}_initial/migration.sql | grep -q "^--\|^CREATE\|^ALTER"; then
echo "ERROR: Migration file does not start with SQL command or comment"
echo "First line is:"
head -n 1 deploy/migrations/${VERSION}_initial/migration.sql
echo "Full content is:"
cat deploy/migrations/${VERSION}_initial/migration.sql
exit 1
fi
echo "Initial migration generated at $(date -u)" > deploy/migrations/${VERSION}_initial/README.md
fi
- name: Compare and Generate Migration
if: success()
env:
DATABASE_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
DIRECT_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
SHADOW_DATABASE_URL: "postgresql://postgres:postgres@localhost:5433/shadow_db"
run: |
# Create temporary migration workspace
mkdir -p temp_migrations
# Copy existing migrations (will not fail if directory is empty)
cp -r deploy/migrations/* temp_migrations/ 2>/dev/null || true
VERSION=$(date +%Y%m%d%H%M%S)
# Generate diff against existing migrations or empty state
prisma migrate diff \
--from-migrations temp_migrations \
--to-schema-datamodel schema.prisma \
--shadow-database-url "${SHADOW_DATABASE_URL}" \
--script > temp_migrations/migration_${VERSION}.sql
# Check if there are actual changes
if [ -s temp_migrations/migration_${VERSION}.sql ]; then
echo "Changes detected, creating new migration"
mkdir -p deploy/migrations/${VERSION}_schema_update
mv temp_migrations/migration_${VERSION}.sql deploy/migrations/${VERSION}_schema_update/migration.sql
echo "Migration generated at $(date -u)" > deploy/migrations/${VERSION}_schema_update/README.md
else
echo "No schema changes detected"
exit 0
fi
- name: Verify Migration
if: success()
env:
DATABASE_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
DIRECT_URL: "postgresql://postgres:postgres@localhost:5432/temp_db"
SHADOW_DATABASE_URL: "postgresql://postgres:postgres@localhost:5433/shadow_db"
run: |
# Create test database
psql "${SHADOW_DATABASE_URL}" -c 'CREATE DATABASE migration_test;'
# Apply all migrations in order to verify
for migration in deploy/migrations/*/migration.sql; do
echo "Applying migration: $migration"
psql "${SHADOW_DATABASE_URL}" -f $migration
done
# Add this step before create-pull-request to debug permissions
- name: Check Token Permissions
run: |
echo "Checking token permissions..."
curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
-H "Accept: application/vnd.github.v3+json" \
https://api.github.com/repos/BerriAI/litellm/collaborators
echo "\nChecking if token can create PRs..."
curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
-H "Accept: application/vnd.github.v3+json" \
https://api.github.com/repos/BerriAI/litellm
# Add this debug step before git push
- name: Debug Changed Files
run: |
echo "Files staged for commit:"
git diff --name-status --staged
echo "\nAll changed files:"
git status
- name: Create Pull Request
if: success()
uses: peter-evans/create-pull-request@v5
with:
token: ${{ secrets.GITHUB_TOKEN }}
commit-message: "chore: update prisma migrations"
title: "Update Prisma Migrations"
body: |
Auto-generated migration based on schema.prisma changes.
Generated files:
- deploy/migrations/${VERSION}_schema_update/migration.sql
- deploy/migrations/${VERSION}_schema_update/README.md
branch: feat/prisma-migration-${{ env.VERSION }}
base: main
delete-branch: true
- name: Generate and Save Migrations
run: |
# Only add migration files
git add deploy/migrations/
git status # Debug what's being committed
git commit -m "chore: update prisma migrations"

.github/workflows/test-linting.yml (new file, 53 lines)

@ -0,0 +1,53 @@
name: LiteLLM Linting
on:
pull_request:
branches: [ main ]
jobs:
lint:
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
- name: Install Poetry
uses: snok/install-poetry@v1
- name: Install dependencies
run: |
poetry install --with dev
- name: Run Black formatting
run: |
cd litellm
poetry run black .
cd ..
- name: Run Ruff linting
run: |
cd litellm
poetry run ruff check .
cd ..
- name: Run MyPy type checking
run: |
cd litellm
poetry run mypy . --ignore-missing-imports
cd ..
- name: Check for circular imports
run: |
cd litellm
poetry run python ../tests/documentation_tests/test_circular_imports.py
cd ..
- name: Check import safety
run: |
poetry run python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)

.github/workflows/test-litellm.yml (new file, 35 lines)

@ -0,0 +1,35 @@
name: LiteLLM Mock Tests (folder - tests/litellm)
on:
pull_request:
branches: [ main ]
jobs:
test:
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: actions/checkout@v4
- name: Thank You Message
run: |
echo "### 🙏 Thank you for contributing to LiteLLM!" >> $GITHUB_STEP_SUMMARY
echo "Your PR is being tested now. We appreciate your help in making LiteLLM better!" >> $GITHUB_STEP_SUMMARY
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
- name: Install Poetry
uses: snok/install-poetry@v1
- name: Install dependencies
run: |
poetry install --with dev,proxy-dev --extras proxy
poetry run pip install pytest-xdist
- name: Run tests
run: |
poetry run pytest tests/litellm -x -vv -n 4

.gitignore (10 lines changed)

@ -1,3 +1,4 @@
.python-version
.venv
.env
.newenv
@ -76,4 +77,11 @@ litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/model_hub.html
.mypy_cache/*
litellm/proxy/application.log
litellm/proxy/application.log
tests/llm_translation/vertex_test_account.json
tests/llm_translation/test_vertex_key.json
litellm/proxy/migrations/0_init/migration.sql
litellm/proxy/db/migrations/0_init/migration.sql
litellm/proxy/db/migrations/*
litellm/proxy/migrations/*config.yaml
litellm/proxy/migrations/*


@ -6,44 +6,35 @@ repos:
entry: pyright
language: system
types: [python]
files: ^litellm/
files: ^(litellm/|litellm_proxy_extras/)
- id: isort
name: isort
entry: isort
language: system
types: [python]
files: litellm/.*\.py
files: (litellm/|litellm_proxy_extras/).*\.py
exclude: ^litellm/__init__.py$
- repo: https://github.com/psf/black
rev: 24.2.0
hooks:
- id: black
- id: black
name: black
entry: poetry run black
language: system
types: [python]
files: (litellm/|litellm_proxy_extras/).*\.py
- repo: https://github.com/pycqa/flake8
rev: 7.0.0 # The version of flake8 to use
hooks:
- id: flake8
exclude: ^litellm/tests/|^litellm/proxy/tests/|^litellm/tests/litellm/|^tests/litellm/
additional_dependencies: [flake8-print]
files: litellm/.*\.py
# - id: flake8
# name: flake8 (router.py function length)
# files: ^litellm/router\.py$
# args: [--max-function-length=40]
# # additional_dependencies: [flake8-functions]
files: (litellm/|litellm_proxy_extras/).*\.py
- repo: https://github.com/python-poetry/poetry
rev: 1.8.0
hooks:
- id: poetry-check
files: ^(pyproject.toml|litellm-proxy-extras/pyproject.toml)$
- repo: local
hooks:
- id: check-files-match
name: Check if files match
entry: python3 ci_cd/check_files_match.py
language: system
# - id: check-file-length
# name: Check file length
# entry: python check_file_length.py
# args: ["10000"] # set your desired maximum number of lines
# language: python
# files: litellm/.*\.py
# exclude: ^litellm/tests/
language: system


@ -12,8 +12,7 @@ WORKDIR /app
USER root
# Install build dependencies
RUN apk update && \
apk add --no-cache gcc python3-dev openssl openssl-dev
RUN apk add --no-cache gcc python3-dev openssl openssl-dev
RUN pip install --upgrade pip && \
@ -37,9 +36,6 @@ RUN pip install dist/*.whl
# install dependencies as wheels
RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps
# ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y
@ -55,8 +51,7 @@ FROM $LITELLM_RUNTIME_IMAGE AS runtime
USER root
# Install runtime dependencies
RUN apk update && \
apk add --no-cache openssl
RUN apk add --no-cache openssl
WORKDIR /app
# Copy the current directory contents into the container at /app

Makefile (new file, 35 lines)

@ -0,0 +1,35 @@
# LiteLLM Makefile
# Simple Makefile for running tests and basic development tasks
.PHONY: help test test-unit test-integration lint format
# Default target
help:
@echo "Available commands:"
@echo " make test - Run all tests"
@echo " make test-unit - Run unit tests"
@echo " make test-integration - Run integration tests"
@echo " make test-unit-helm - Run helm unit tests"
install-dev:
poetry install --with dev
install-proxy-dev:
poetry install --with dev,proxy-dev
lint: install-dev
poetry run pip install types-requests types-setuptools types-redis types-PyYAML
cd litellm && poetry run mypy . --ignore-missing-imports
# Testing
test:
poetry run pytest tests/
test-unit:
poetry run pytest tests/litellm/
test-integration:
poetry run pytest tests/ -k "not litellm"
test-unit-helm:
helm unittest -f 'tests/*.yaml' deploy/charts/litellm-helm


@ -16,9 +16,6 @@
<a href="https://pypi.org/project/litellm/" target="_blank">
<img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">
</a>
<a href="https://dl.circleci.com/status-badge/redirect/gh/BerriAI/litellm/tree/main" target="_blank">
<img src="https://dl.circleci.com/status-badge/img/gh/BerriAI/litellm/tree/main.svg?style=svg" alt="CircleCI">
</a>
<a href="https://www.ycombinator.com/companies/berriai">
<img src="https://img.shields.io/badge/Y%20Combinator-W23-orange?style=flat-square" alt="Y Combinator W23">
</a>
@ -340,71 +337,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
## Contributing
To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change.
Here's how to modify the repo locally:
Step 1: Clone the repo
```
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Install dependencies:
```
pip install -r requirements.txt
```
Step 3: Test your change:
a. Add a pytest test within `tests/litellm/`
This folder follows the same directory structure as `litellm/`.
If a corresponding test file does not exist, create one.
b. Run the test
```
cd tests/litellm # pwd: Documents/litellm/litellm/tests/litellm
pytest /path/to/test_file.py
```
Step 4: Submit a PR with your changes! 🚀
- push your fork to your GitHub repo
- submit a PR from there
### Building LiteLLM Docker Image
Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
Step 1: Clone the repo
```
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Build the Docker Image
Build using Dockerfile.non_root
```
docker build -f docker/Dockerfile.non_root -t litellm_test_image .
```
Step 3: Run the Docker Image
Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
```
docker run \
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
-e DATABASE_URL="postgresql://xxxxxxxx" \
-e LITELLM_MASTER_KEY="sk-1234" \
-p 4000:4000 \
litellm_test_image \
--config /app/config.yaml --detailed_debug
```
Interested in contributing? Contributions to the LiteLLM Python SDK, Proxy Server, and LLM integrations are all accepted and highly encouraged! [See our Contribution Guide for more details](https://docs.litellm.ai/docs/extras/contributing_code)
# Enterprise
For companies that need better security, user management and professional support

ci_cd/baseline_db.py (new file, 60 lines)

@ -0,0 +1,60 @@
import subprocess
from pathlib import Path
from datetime import datetime
def create_baseline():
"""Create baseline migration in deploy/migrations"""
try:
# Get paths
root_dir = Path(__file__).parent.parent
deploy_dir = root_dir / "deploy"
migrations_dir = deploy_dir / "migrations"
schema_path = root_dir / "schema.prisma"
# Create migrations directory
migrations_dir.mkdir(parents=True, exist_ok=True)
# Create migration_lock.toml if it doesn't exist
lock_file = migrations_dir / "migration_lock.toml"
if not lock_file.exists():
lock_file.write_text('provider = "postgresql"\n')
# Create timestamp-based migration directory
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
migration_dir = migrations_dir / f"{timestamp}_baseline"
migration_dir.mkdir(parents=True, exist_ok=True)
# Generate migration SQL
result = subprocess.run(
[
"prisma",
"migrate",
"diff",
"--from-empty",
"--to-schema-datamodel",
str(schema_path),
"--script",
],
capture_output=True,
text=True,
check=True,
)
# Write the SQL to migration.sql
migration_file = migration_dir / "migration.sql"
migration_file.write_text(result.stdout)
print(f"Created baseline migration in {migration_dir}")
return True
except subprocess.CalledProcessError as e:
print(f"Error running prisma command: {e.stderr}")
return False
except Exception as e:
print(f"Error creating baseline migration: {str(e)}")
return False
if __name__ == "__main__":
create_baseline()


@ -0,0 +1,19 @@
#!/bin/bash
# Exit on error
set -e
echo "🚀 Building and publishing litellm-proxy-extras"
# Navigate to litellm-proxy-extras directory
cd "$(dirname "$0")/../litellm-proxy-extras"
# Build the package
echo "📦 Building package..."
poetry build
# Publish to PyPI
echo "🌎 Publishing to PyPI..."
poetry publish
echo "✅ Done! Package published successfully"

ci_cd/run_migration.py (new file, 95 lines)

@ -0,0 +1,95 @@
import os
import subprocess
from pathlib import Path
from datetime import datetime
import testing.postgresql
import shutil
def create_migration(migration_name: str = None):
"""
Create a new migration SQL file in the migrations directory by comparing
current database state with schema
Args:
migration_name (str): Name for the migration
"""
try:
# Get paths
root_dir = Path(__file__).parent.parent
migrations_dir = root_dir / "litellm-proxy-extras" / "litellm_proxy_extras" / "migrations"
schema_path = root_dir / "schema.prisma"
# Create temporary PostgreSQL database
with testing.postgresql.Postgresql() as postgresql:
db_url = postgresql.url()
# Create temporary migrations directory next to schema.prisma
temp_migrations_dir = schema_path.parent / "migrations"
try:
# Copy existing migrations to temp directory
if temp_migrations_dir.exists():
shutil.rmtree(temp_migrations_dir)
shutil.copytree(migrations_dir, temp_migrations_dir)
# Apply existing migrations to temp database
os.environ["DATABASE_URL"] = db_url
subprocess.run(
["prisma", "migrate", "deploy", "--schema", str(schema_path)],
check=True,
)
# Generate diff between current database and schema
result = subprocess.run(
[
"prisma",
"migrate",
"diff",
"--from-url",
db_url,
"--to-schema-datamodel",
str(schema_path),
"--script",
],
capture_output=True,
text=True,
check=True,
)
if result.stdout.strip():
# Generate timestamp and create migration directory
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
migration_name = migration_name or "unnamed_migration"
migration_dir = migrations_dir / f"{timestamp}_{migration_name}"
migration_dir.mkdir(parents=True, exist_ok=True)
# Write the SQL to migration.sql
migration_file = migration_dir / "migration.sql"
migration_file.write_text(result.stdout)
print(f"Created migration in {migration_dir}")
return True
else:
print("No schema changes detected. Migration not needed.")
return False
finally:
# Clean up: remove temporary migrations directory
if temp_migrations_dir.exists():
shutil.rmtree(temp_migrations_dir)
except subprocess.CalledProcessError as e:
print(f"Error generating migration: {e.stderr}")
return False
except Exception as e:
print(f"Error creating migration: {str(e)}")
return False
if __name__ == "__main__":
# If running directly, can optionally pass migration name as argument
import sys
migration_name = sys.argv[1] if len(sys.argv) > 1 else None
create_migration(migration_name)


@ -6,8 +6,9 @@
"id": "9dKM5k8qsMIj"
},
"source": [
"## LiteLLM HuggingFace\n",
"Docs for huggingface: https://docs.litellm.ai/docs/providers/huggingface"
"## LiteLLM Hugging Face\n",
"\n",
"Docs for huggingface: https://docs.litellm.ai/docs/providers/huggingface\n"
]
},
{
@ -27,23 +28,18 @@
"id": "yp5UXRqtpu9f"
},
"source": [
"## Hugging Face Free Serverless Inference API\n",
"Read more about the Free Serverless Inference API here: https://huggingface.co/docs/api-inference.\n",
"## Serverless Inference Providers\n",
"\n",
"In order to use litellm to call Serverless Inference API:\n",
"Read more about Inference Providers here: https://huggingface.co/blog/inference-providers.\n",
"\n",
"* Browse Serverless Inference compatible models here: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation.\n",
"* Copy the model name from hugging face\n",
"* Set `model = \"huggingface/<model-name>\"`\n",
"In order to use litellm with Hugging Face Inference Providers, you need to set `model=huggingface/<provider>/<model-id>`.\n",
"\n",
"Example set `model=huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct` to call `meta-llama/Meta-Llama-3.1-8B-Instruct`\n",
"\n",
"https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct"
"Example: `huggingface/together/deepseek-ai/DeepSeek-R1` to run DeepSeek-R1 (https://huggingface.co/deepseek-ai/DeepSeek-R1) through Together AI.\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@ -51,107 +47,18 @@
"id": "Pi5Oww8gpCUm",
"outputId": "659a67c7-f90d-4c06-b94e-2c4aa92d897a"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ModelResponse(id='chatcmpl-c54dfb68-1491-4d68-a4dc-35e603ea718a', choices=[Choices(finish_reason='eos_token', index=0, message=Message(content=\"I'm just a computer program, so I don't have feelings, but thank you for asking! How can I assist you today?\", role='assistant', tool_calls=None, function_call=None))], created=1724858285, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', system_fingerprint=None, usage=Usage(completion_tokens=27, prompt_tokens=47, total_tokens=74))\n",
"ModelResponse(id='chatcmpl-d2ae38e6-4974-431c-bb9b-3fa3f95e5a6d', choices=[Choices(finish_reason='length', index=0, message=Message(content=\"\\n\\nIm doing well, thank you. Ive been keeping busy with work and some personal projects. How about you?\\n\\nI'm doing well, thank you. I've been enjoying some time off and catching up on some reading. How can I assist you today?\\n\\nI'm looking for a good book to read. Do you have any recommendations?\\n\\nOf course! Here are a few book recommendations across different genres:\\n\\n1.\", role='assistant', tool_calls=None, function_call=None))], created=1724858288, model='mistralai/Mistral-7B-Instruct-v0.3', object='chat.completion', system_fingerprint=None, usage=Usage(completion_tokens=85, prompt_tokens=6, total_tokens=91))\n"
]
}
],
"outputs": [],
"source": [
"import os\n",
"import litellm\n",
"from litellm import completion\n",
"\n",
"# Make sure to create an API_KEY with inference permissions at https://huggingface.co/settings/tokens/new?globalPermissions=inference.serverless.write&tokenType=fineGrained\n",
"os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n",
"# You can create a HF token here: https://huggingface.co/settings/tokens\n",
"os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
"\n",
"# Call https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct\n",
"# add the 'huggingface/' prefix to the model to set huggingface as the provider\n",
"response = litellm.completion(\n",
" model=\"huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
")\n",
"print(response)\n",
"\n",
"\n",
"# Call https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3\n",
"response = litellm.completion(\n",
" model=\"huggingface/mistralai/Mistral-7B-Instruct-v0.3\",\n",
" messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
")\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-klhAhjLtclv"
},
"source": [
"## Hugging Face Dedicated Inference Endpoints\n",
"\n",
"Steps to use\n",
"* Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/\n",
"* Set `api_base` to your deployed api base\n",
"* Add the `huggingface/` prefix to your model so litellm knows it's a huggingface Deployed Inference Endpoint"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Lbmw8Gl_pHns",
"outputId": "ea8408bf-1cc3-4670-ecea-f12666d204a8"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"object\": \"chat.completion\",\n",
" \"choices\": [\n",
" {\n",
" \"finish_reason\": \"length\",\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"content\": \"\\n\\nI am doing well, thank you for asking. How about you?\\nI am doing\",\n",
" \"role\": \"assistant\",\n",
" \"logprobs\": -8.9481967812\n",
" }\n",
" }\n",
" ],\n",
" \"id\": \"chatcmpl-74dc9d89-3916-47ce-9bea-b80e66660f77\",\n",
" \"created\": 1695871068.8413374,\n",
" \"model\": \"glaiveai/glaive-coder-7b\",\n",
" \"usage\": {\n",
" \"prompt_tokens\": 6,\n",
" \"completion_tokens\": 18,\n",
" \"total_tokens\": 24\n",
" }\n",
"}\n"
]
}
],
"source": [
"import os\n",
"import litellm\n",
"\n",
"os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n",
"\n",
"# TGI model: Call https://huggingface.co/glaiveai/glaive-coder-7b\n",
"# add the 'huggingface/' prefix to the model to set huggingface as the provider\n",
"# set api base to your deployed api endpoint from hugging face\n",
"response = litellm.completion(\n",
" model=\"huggingface/glaiveai/glaive-coder-7b\",\n",
" messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n",
" api_base=\"https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud\"\n",
"# Call DeepSeek-R1 model through Together AI\n",
"response = completion(\n",
" model=\"huggingface/together/deepseek-ai/DeepSeek-R1\",\n",
" messages=[{\"content\": \"How many r's are in the word `strawberry`?\", \"role\": \"user\"}],\n",
")\n",
"print(response)"
]
@ -162,13 +69,12 @@
"id": "EU0UubrKzTFe"
},
"source": [
"## HuggingFace - Streaming (Serveless or Dedicated)\n",
"Set stream = True"
"## Streaming\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@ -176,74 +82,147 @@
"id": "y-QfIvA-uJKX",
"outputId": "b007bb98-00d0-44a4-8264-c8a2caed6768"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<litellm.utils.CustomStreamWrapper object at 0x1278471d0>\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='I', role='assistant', function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=\"'m\", role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' just', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' a', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' computer', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' program', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=',', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' so', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' I', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' don', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=\"'t\", role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' have', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' feelings', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=',', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' but', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' thank', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' you', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' for', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' asking', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='!', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' How', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' can', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' I', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' assist', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' you', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' today', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='?', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='<|eot_id|>', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason='stop', index=0, delta=Delta(content=None, role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n"
]
}
],
"outputs": [],
"source": [
"import os\n",
"import litellm\n",
"from litellm import completion\n",
"\n",
"# Make sure to create an API_KEY with inference permissions at https://huggingface.co/settings/tokens/new?globalPermissions=inference.serverless.write&tokenType=fineGrained\n",
"os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n",
"os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
"\n",
"# Call https://huggingface.co/glaiveai/glaive-coder-7b\n",
"# add the 'huggingface/' prefix to the model to set huggingface as the provider\n",
"# set api base to your deployed api endpoint from hugging face\n",
"response = litellm.completion(\n",
" model=\"huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n",
" stream=True\n",
"response = completion(\n",
" model=\"huggingface/together/deepseek-ai/DeepSeek-R1\",\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"How many r's are in the word `strawberry`?\",\n",
" \n",
" }\n",
" ],\n",
" stream=True,\n",
")\n",
"\n",
"print(response)\n",
"\n",
"for chunk in response:\n",
" print(chunk)"
" print(chunk)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## With images as input\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CKXAnK55zQRl"
},
"metadata": {},
"outputs": [],
"source": []
"source": [
"from litellm import completion\n",
"\n",
"# Set your Hugging Face Token\n",
"os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
"\n",
"messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\"type\": \"text\", \"text\": \"What's in this image?\"},\n",
" {\n",
" \"type\": \"image_url\",\n",
" \"image_url\": {\n",
" \"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n",
" },\n",
" },\n",
" ],\n",
" }\n",
"]\n",
"\n",
"response = completion(\n",
" model=\"huggingface/sambanova/meta-llama/Llama-3.3-70B-Instruct\",\n",
" messages=messages,\n",
")\n",
"print(response.choices[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tools - Function Calling\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from litellm import completion\n",
"\n",
"\n",
"# Set your Hugging Face Token\n",
"os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
"\n",
"tools = [\n",
" {\n",
" \"type\": \"function\",\n",
" \"function\": {\n",
" \"name\": \"get_current_weather\",\n",
" \"description\": \"Get the current weather in a given location\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"location\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"The city and state, e.g. San Francisco, CA\",\n",
" },\n",
" \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n",
" },\n",
" \"required\": [\"location\"],\n",
" },\n",
" },\n",
" }\n",
"]\n",
"messages = [{\"role\": \"user\", \"content\": \"What's the weather like in Boston today?\"}]\n",
"\n",
"response = completion(\n",
" model=\"huggingface/sambanova/meta-llama/Llama-3.1-8B-Instruct\", messages=messages, tools=tools, tool_choice=\"auto\"\n",
")\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hugging Face Dedicated Inference Endpoints\n",
"\n",
"Steps to use\n",
"\n",
"- Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/\n",
"- Set `api_base` to your deployed api base\n",
"- set the model to `huggingface/tgi` so that litellm knows it's a huggingface Deployed Inference Endpoint.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import litellm\n",
"\n",
"\n",
"response = litellm.completion(\n",
" model=\"huggingface/tgi\",\n",
" messages=[{\"content\": \"Hello, how are you?\", \"role\": \"user\"}],\n",
" api_base=\"https://my-endpoint.endpoints.huggingface.cloud/v1/\",\n",
")\n",
"print(response)"
]
}
],
"metadata": {
@ -251,7 +230,8 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
@ -264,7 +244,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
"version": "3.12.0"
}
},
"nbformat": 4,

View file

@ -1 +1 @@
litellm==1.55.3
litellm==1.61.15

View file

@ -1,2 +1,11 @@
python3 -m build
twine upload --verbose dist/litellm-1.18.13.dev4.tar.gz -u __token__ -
Note: You might need to create a MANIFEST.ini file at the repo root in case the build process fails.
Place this in MANIFEST.ini:
recursive-exclude venv *
recursive-exclude myenv *
recursive-exclude py313_env *
recursive-exclude **/.venv *

View file

@ -18,7 +18,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.4.1
version: 0.4.2
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to

View file

@ -22,6 +22,8 @@ If `db.useStackgresOperator` is used (not yet implemented):
| Name | Description | Value |
| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
| `replicaCount` | The number of LiteLLM Proxy pods to be deployed | `1` |
| `masterkeySecretName` | The name of the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use the generated secret name. | N/A |
| `masterkeySecretKey` | The key within the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use `masterkey` as the key. | N/A |
| `masterkey` | The Master API Key for LiteLLM. If not specified, a random key is generated. | N/A |
| `environmentSecrets` | An optional array of Secret object names. The keys and values in these secrets will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` |
| `environmentConfigMaps` | An optional array of ConfigMap object names. The keys and values in these configmaps will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` |
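
To use a master key you manage yourself, create the Secret ahead of time and point the chart at it via `masterkeySecretName` / `masterkeySecretKey`. A minimal sketch (the secret and key names below are placeholders):

```bash
# hypothetical names - adjust to your environment
kubectl create secret generic litellm-master-key \
  --from-literal=masterkey='sk-my-master-key'
```

```yaml
# values.yaml
masterkeySecretName: litellm-master-key
masterkeySecretKey: masterkey
```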

View file

@ -78,8 +78,8 @@ spec:
- name: PROXY_MASTER_KEY
valueFrom:
secretKeyRef:
name: {{ include "litellm.fullname" . }}-masterkey
key: masterkey
name: {{ .Values.masterkeySecretName | default (printf "%s-masterkey" (include "litellm.fullname" .)) }}
key: {{ .Values.masterkeySecretKey | default "masterkey" }}
{{- if .Values.redis.enabled }}
- name: REDIS_HOST
value: {{ include "litellm.redis.serviceName" . }}

View file

@ -65,6 +65,6 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
ttlSecondsAfterFinished: {{ .Values.migrationJob.ttlSecondsAfterFinished }}
backoffLimit: {{ .Values.migrationJob.backoffLimit }}
{{- end }}

View file

@ -1,3 +1,4 @@
{{- if not .Values.masterkeySecretName }}
{{ $masterkey := (.Values.masterkey | default (randAlphaNum 17)) }}
apiVersion: v1
kind: Secret
@ -5,4 +6,5 @@ metadata:
name: {{ include "litellm.fullname" . }}-masterkey
data:
masterkey: {{ $masterkey | b64enc }}
type: Opaque
{{- end }}

View file

@ -2,6 +2,10 @@ apiVersion: v1
kind: Service
metadata:
name: {{ include "litellm.fullname" . }}
{{- with .Values.service.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
labels:
{{- include "litellm.labels" . | nindent 4 }}
spec:

View file

@ -0,0 +1,82 @@
suite: test deployment
templates:
- deployment.yaml
- configmap-litellm.yaml
tests:
- it: should work
template: deployment.yaml
set:
image.tag: test
asserts:
- isKind:
of: Deployment
- matchRegex:
path: metadata.name
pattern: -litellm$
- equal:
path: spec.template.spec.containers[0].image
value: ghcr.io/berriai/litellm-database:test
- it: should work with tolerations
template: deployment.yaml
set:
tolerations:
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
asserts:
- equal:
path: spec.template.spec.tolerations[0].key
value: node-role.kubernetes.io/master
- equal:
path: spec.template.spec.tolerations[0].operator
value: Exists
- it: should work with affinity
template: deployment.yaml
set:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: topology.kubernetes.io/zone
operator: In
values:
- antarctica-east1
asserts:
- equal:
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key
value: topology.kubernetes.io/zone
- equal:
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator
value: In
- equal:
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0]
value: antarctica-east1
- it: should work without masterkeySecretName or masterkeySecretKey
template: deployment.yaml
set:
masterkeySecretName: ""
masterkeySecretKey: ""
asserts:
- contains:
path: spec.template.spec.containers[0].env
content:
name: PROXY_MASTER_KEY
valueFrom:
secretKeyRef:
name: RELEASE-NAME-litellm-masterkey
key: masterkey
- it: should work with masterkeySecretName and masterkeySecretKey
template: deployment.yaml
set:
masterkeySecretName: my-secret
masterkeySecretKey: my-key
asserts:
- contains:
path: spec.template.spec.containers[0].env
content:
name: PROXY_MASTER_KEY
valueFrom:
secretKeyRef:
name: my-secret
key: my-key

View file

@ -0,0 +1,18 @@
suite: test masterkey secret
templates:
- secret-masterkey.yaml
tests:
- it: should create a secret if masterkeySecretName is not set
template: secret-masterkey.yaml
set:
masterkeySecretName: ""
asserts:
- isKind:
of: Secret
- it: should not create a secret if masterkeySecretName is set
template: secret-masterkey.yaml
set:
masterkeySecretName: my-secret
asserts:
- hasDocuments:
count: 0

View file

@ -75,6 +75,12 @@ ingress:
# masterkey: changeit
# if set, use this secret for the master key; otherwise, autogenerate a new one
masterkeySecretName: ""
# if set, use this secret key for the master key; otherwise, use the default key
masterkeySecretKey: ""
# The elements within proxy_config are rendered as config.yaml for the proxy
# Examples: https://github.com/BerriAI/litellm/tree/main/litellm/proxy/example_config_yaml
# Reference: https://docs.litellm.ai/docs/proxy/configs

View file

@ -20,10 +20,18 @@ services:
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
env_file:
- .env # Load local .env file
depends_on:
- db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
healthcheck: # Defines the health check configuration for the container
test: [ "CMD", "curl", "-f", "http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
interval: 30s # Perform health check every 30 seconds
timeout: 10s # Health check command times out after 10 seconds
retries: 3 # Retry up to 3 times if health check fails
start_period: 40s # Wait 40 seconds after container start before beginning health checks
db:
image: postgres
image: postgres:16
restart: always
environment:
POSTGRES_DB: litellm
@ -31,6 +39,8 @@ services:
POSTGRES_PASSWORD: dbpassword9090
ports:
- "5432:5432"
volumes:
- postgres_data:/var/lib/postgresql/data # Persists Postgres data across container restarts
healthcheck:
test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
interval: 1s
@ -53,6 +63,6 @@ services:
volumes:
prometheus_data:
driver: local
postgres_data:
name: litellm_postgres_data # Named volume for Postgres data persistence
# ...rest of your docker-compose config if any

View file

@ -35,7 +35,7 @@ RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
FROM $LITELLM_RUNTIME_IMAGE AS runtime
# Update dependencies and clean up
RUN apk update && apk upgrade && rm -rf /var/cache/apk/*
RUN apk upgrade --no-cache
WORKDIR /app

View file

@ -12,8 +12,7 @@ WORKDIR /app
USER root
# Install build dependencies
RUN apk update && \
apk add --no-cache gcc python3-dev openssl openssl-dev
RUN apk add --no-cache gcc python3-dev openssl openssl-dev
RUN pip install --upgrade pip && \
@ -44,8 +43,7 @@ FROM $LITELLM_RUNTIME_IMAGE AS runtime
USER root
# Install runtime dependencies
RUN apk update && \
apk add --no-cache openssl
RUN apk add --no-cache openssl
WORKDIR /app
# Copy the current directory contents into the container at /app
@ -59,9 +57,6 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps
# ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y

View file

@ -14,7 +14,7 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Install build dependencies
RUN apt-get clean && apt-get update && \
apt-get install -y gcc python3-dev && \
apt-get install -y gcc g++ python3-dev && \
rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir --upgrade pip && \
@ -56,10 +56,8 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
# ensure pyjwt is used, not jwt
RUN pip install redisvl==0.0.7 --no-deps --no-cache-dir && \
pip uninstall jwt -y && \
RUN pip uninstall jwt -y && \
pip uninstall PyJWT -y && \
pip install PyJWT==2.9.0 --no-cache-dir

View file

@ -0,0 +1,301 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# /v1/messages [BETA]
Use LiteLLM to call all your LLM APIs in the Anthropic `v1/messages` format.
## Overview
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ✅ | |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ✅ | |
| Streaming | ✅ | |
| Fallbacks | ✅ | between anthropic models |
| Loadbalancing | ✅ | between anthropic models |
Planned improvements:
- Vertex AI Anthropic support
- Bedrock Anthropic support
## Usage
---
### LiteLLM Python SDK
#### Non-streaming example
```python showLineNumbers title="Example using LiteLLM Python SDK"
import litellm
response = await litellm.anthropic.messages.acreate(
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
api_key=api_key,
model="anthropic/claude-3-haiku-20240307",
max_tokens=100,
)
```
Example response:
```json
{
"content": [
{
"text": "Hi! this is a very short joke",
"type": "text"
}
],
"id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
"model": "claude-3-7-sonnet-20250219",
"role": "assistant",
"stop_reason": "end_turn",
"stop_sequence": null,
"type": "message",
"usage": {
"input_tokens": 2095,
"output_tokens": 503,
"cache_creation_input_tokens": 2095,
"cache_read_input_tokens": 0
}
}
```
#### Streaming example
```python showLineNumbers title="Example using LiteLLM Python SDK"
import litellm
response = await litellm.anthropic.messages.acreate(
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
api_key=api_key,
model="anthropic/claude-3-haiku-20240307",
max_tokens=100,
stream=True,
)
async for chunk in response:
print(chunk)
```
### LiteLLM Proxy Server
1. Setup config.yaml
```yaml
model_list:
- model_name: anthropic-claude
litellm_params:
model: claude-3-7-sonnet-latest
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
<Tabs>
<TabItem label="Anthropic Python SDK" value="python">
```python showLineNumbers title="Example using LiteLLM Proxy Server"
import anthropic
# point anthropic sdk to litellm proxy
client = anthropic.Anthropic(
base_url="http://0.0.0.0:4000",
api_key="sk-1234",
)
response = client.messages.create(
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
model="anthropic-claude",
max_tokens=100,
)
```
</TabItem>
<TabItem label="curl" value="curl">
```bash showLineNumbers title="Example using LiteLLM Proxy Server"
curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
-H 'content-type: application/json' \
-H 'x-api-key: $LITELLM_API_KEY' \
-H 'anthropic-version: 2023-06-01' \
-d '{
"model": "anthropic-claude",
"messages": [
{
"role": "user",
"content": "Hello, can you tell me a short joke?"
}
],
"max_tokens": 100
}'
```
</TabItem>
</Tabs>
## Request Format
---
Request body will be in the Anthropic messages API format. **litellm follows the Anthropic messages specification for this endpoint.**
#### Example request body
```json
{
"model": "claude-3-7-sonnet-20250219",
"max_tokens": 1024,
"messages": [
{
"role": "user",
"content": "Hello, world"
}
]
}
```
#### Required Fields
- **model** (string):
The model identifier (e.g., `"claude-3-7-sonnet-20250219"`).
- **max_tokens** (integer):
The maximum number of tokens to generate before stopping.
_Note: The model may stop before reaching this limit; value must be greater than 1._
- **messages** (array of objects):
An ordered list of conversational turns.
Each message object must include:
- **role** (enum: `"user"` or `"assistant"`):
Specifies the speaker of the message.
- **content** (string or array of content blocks):
The text or content blocks (e.g., an array containing objects with a `type` such as `"text"`) that form the message.
_Example equivalence:_
```json
{"role": "user", "content": "Hello, Claude"}
```
is equivalent to:
```json
{"role": "user", "content": [{"type": "text", "text": "Hello, Claude"}]}
```
#### Optional Fields
- **metadata** (object):
Contains additional metadata about the request (e.g., `user_id` as an opaque identifier).
- **stop_sequences** (array of strings):
Custom sequences that, when encountered in the generated text, cause the model to stop.
- **stream** (boolean):
Indicates whether to stream the response using server-sent events.
- **system** (string or array):
A system prompt providing context or specific instructions to the model.
- **temperature** (number):
Controls randomness in the model's responses. Valid range: `0 < temperature < 1`.
- **thinking** (object):
Configuration for enabling extended thinking. If enabled, it includes:
- **budget_tokens** (integer):
Minimum of 1024 tokens (and less than `max_tokens`).
- **type** (enum):
E.g., `"enabled"`.
- **tool_choice** (object):
Instructs how the model should utilize any provided tools.
- **tools** (array of objects):
Definitions for tools available to the model. Each tool includes:
- **name** (string):
The tool's name.
- **description** (string):
A detailed description of the tool.
- **input_schema** (object):
A JSON schema describing the expected input format for the tool.
- **top_k** (integer):
Limits sampling to the top K options.
- **top_p** (number):
Enables nucleus sampling with a cumulative probability cutoff. Valid range: `0 < top_p < 1`.
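
For example, a request that supplies a tool definition and lets the model decide when to call it can look roughly like this (a sketch following the field descriptions above; the tool name and schema are illustrative):

```json
{
  "model": "claude-3-7-sonnet-20250219",
  "max_tokens": 1024,
  "messages": [
    {"role": "user", "content": "What's the weather in Boston?"}
  ],
  "tools": [
    {
      "name": "get_current_weather",
      "description": "Get the current weather in a given location",
      "input_schema": {
        "type": "object",
        "properties": {
          "location": {"type": "string", "description": "City and state, e.g. Boston, MA"}
        },
        "required": ["location"]
      }
    }
  ],
  "tool_choice": {"type": "auto"}
}
```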
## Response Format
---
Responses will be in the Anthropic messages API format.
#### Example Response
```json
{
"content": [
{
"text": "Hi! My name is Claude.",
"type": "text"
}
],
"id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
"model": "claude-3-7-sonnet-20250219",
"role": "assistant",
"stop_reason": "end_turn",
"stop_sequence": null,
"type": "message",
"usage": {
"input_tokens": 2095,
"output_tokens": 503,
"cache_creation_input_tokens": 2095,
"cache_read_input_tokens": 0
}
}
```
#### Response fields
- **content** (array of objects):
Contains the generated content blocks from the model. Each block includes:
- **type** (string):
Indicates the type of content (e.g., `"text"`, `"tool_use"`, `"thinking"`, or `"redacted_thinking"`).
- **text** (string):
The generated text from the model.
_Note: Maximum length is 5,000,000 characters._
- **citations** (array of objects or `null`):
Optional field providing citation details. Each citation includes:
- **cited_text** (string):
The excerpt being cited.
- **document_index** (integer):
An index referencing the cited document.
- **document_title** (string or `null`):
The title of the cited document.
- **start_char_index** (integer):
The starting character index for the citation.
- **end_char_index** (integer):
The ending character index for the citation.
- **type** (string):
Typically `"char_location"`.
- **id** (string):
A unique identifier for the response message.
_Note: The format and length of IDs may change over time._
- **model** (string):
Specifies the model that generated the response.
- **role** (string):
Indicates the role of the generated message. For responses, this is always `"assistant"`.
- **stop_reason** (string):
Explains why the model stopped generating text. Possible values include:
- `"end_turn"`: The model reached a natural stopping point.
- `"max_tokens"`: The generation stopped because the maximum token limit was reached.
- `"stop_sequence"`: A custom stop sequence was encountered.
- `"tool_use"`: The model invoked one or more tools.
- **stop_sequence** (string or `null`):
Contains the specific stop sequence that caused the generation to halt, if applicable; otherwise, it is `null`.
- **type** (string):
Denotes the type of response object, which is always `"message"`.
- **usage** (object):
Provides details on token usage for billing and rate limiting. This includes:
- **input_tokens** (integer):
Total number of input tokens processed.
- **output_tokens** (integer):
Total number of output tokens generated.
- **cache_creation_input_tokens** (integer or `null`):
Number of tokens used to create a cache entry.
- **cache_read_input_tokens** (integer or `null`):
Number of tokens read from the cache.
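
As a sketch of how these fields combine when the model invokes a tool (all values below are illustrative), a tool-use response looks roughly like:

```json
{
  "id": "msg_01AbCdEfGh",
  "type": "message",
  "role": "assistant",
  "model": "claude-3-7-sonnet-20250219",
  "content": [
    {
      "type": "tool_use",
      "id": "toolu_01XyZ",
      "name": "get_current_weather",
      "input": {"location": "Boston, MA"}
    }
  ],
  "stop_reason": "tool_use",
  "stop_sequence": null,
  "usage": {
    "input_tokens": 410,
    "output_tokens": 52,
    "cache_creation_input_tokens": null,
    "cache_read_input_tokens": null
  }
}
```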

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Assistants API
# /assistants
Covers Threads, Messages, Assistants.

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [BETA] Batches API
# /batches
Covers Batches, Files

View file

@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem';
# Caching - In-Memory, Redis, s3, Redis Semantic Cache, Disk
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm.caching.caching.py)
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching/caching.py)
:::info
@ -26,7 +26,7 @@ Install redis
pip install redis
```
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
For the hosted version you can setup your own Redis DB here: https://redis.io/try-free/
```python
import litellm
@ -37,11 +37,11 @@ litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password
# Make completion calls
response1 = completion(
model="gpt-3.5-turbo",
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Tell me a joke."}]
)
response2 = completion(
model="gpt-3.5-turbo",
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Tell me a joke."}]
)
@ -91,12 +91,12 @@ response2 = completion(
<TabItem value="redis-sem" label="redis-semantic cache">
Install redis
Install redisvl client
```shell
pip install redisvl==0.0.7
pip install redisvl==0.4.1
```
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
For the hosted version you can setup your own Redis DB here: https://redis.io/try-free/
```python
import litellm
@ -114,6 +114,7 @@ litellm.cache = Cache(
port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"],
similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity
ttl=120,
redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
)
response1 = completion(
@ -471,11 +472,13 @@ def __init__(
password: Optional[str] = None,
namespace: Optional[str] = None,
default_in_redis_ttl: Optional[float] = None,
similarity_threshold: Optional[float] = None,
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
redis_flush_size=None,
# redis semantic cache params
similarity_threshold: Optional[float] = None,
redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
redis_semantic_cache_index_name: Optional[str] = None,
# s3 Bucket, boto3 configuration
s3_bucket_name: Optional[str] = None,
s3_region_name: Optional[str] = None,

View file

@ -27,16 +27,18 @@ os.environ["AWS_REGION_NAME"] = ""
# pdf url
image_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
file_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
# model
model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"
image_content = [
file_content = [
{"type": "text", "text": "What's this file about?"},
{
"type": "image_url",
"image_url": image_url, # OR {"url": image_url}
"type": "file",
"file": {
"file_id": file_url,
}
},
]
@ -46,7 +48,7 @@ if not supports_pdf_input(model, None):
response = completion(
model=model,
messages=[{"role": "user", "content": image_content}],
messages=[{"role": "user", "content": file_content}],
)
assert response is not None
```
@ -80,11 +82,15 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-d '{
"model": "bedrock-model",
"messages": [
{"role": "user", "content": {"type": "text", "text": "What's this file about?"}},
{
"type": "image_url",
"image_url": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
}
{"role": "user", "content": [
{"type": "text", "text": "What's this file about?"},
{
"type": "file",
"file": {
"file_id": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
}
}
]},
]
}'
```
@ -116,11 +122,13 @@ base64_url = f"data:application/pdf;base64,{encoded_file}"
# model
model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"
image_content = [
file_content = [
{"type": "text", "text": "What's this file about?"},
{
"type": "image_url",
"image_url": base64_url, # OR {"url": base64_url}
"type": "file",
"file": {
"file_data": base64_url,
}
},
]
@ -130,11 +138,53 @@ if not supports_pdf_input(model, None):
response = completion(
model=model,
messages=[{"role": "user", "content": image_content}],
messages=[{"role": "user", "content": file_content}],
)
assert response is not None
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: bedrock-model
litellm_params:
model: bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0
aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
aws_region_name: os.environ/AWS_REGION_NAME
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "bedrock-model",
"messages": [
{"role": "user", "content": [
{"type": "text", "text": "What's this file about?"},
{
"type": "file",
"file": {
"file_data": "data:application/pdf;base64...",
}
}
]},
]
}'
```
</TabItem>
</Tabs>
## Checking if a model supports pdf input

View file

@ -107,4 +107,76 @@ response = litellm.completion(
</TabItem>
</Tabs>
**additional_drop_params**: List or null - A list of openai params you want to drop when making a call to the model.
## Specify allowed openai params in a request
Tell litellm to allow specific openai params in a request. Use this if you get a `litellm.UnsupportedParamsError` and want to allow a param. LiteLLM will pass the param as is to the model.
<Tabs>
<TabItem value="sdk" label="LiteLLM Python SDK">
In this example we pass `allowed_openai_params=["tools"]` to allow the `tools` param.
```python showLineNumbers title="Pass allowed_openai_params to LiteLLM Python SDK"
await litellm.acompletion(
model="azure/o_series/<my-deployment-name>",
api_key="xxxxx",
api_base=api_base,
messages=[{"role": "user", "content": "Hello! return a json object"}],
tools=[{"type": "function", "function": {"name": "get_current_time", "description": "Get the current time in a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name, e.g. San Francisco"}}, "required": ["location"]}}}]
allowed_openai_params=["tools"],
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
When using litellm proxy you can pass `allowed_openai_params` in two ways:
1. Dynamically pass `allowed_openai_params` in a request
2. Set `allowed_openai_params` on the config.yaml file for a specific model
#### Dynamically pass allowed_openai_params in a request
In this example we pass `allowed_openai_params=["tools"]` to allow the `tools` param for a request sent to the model set on the proxy.
```python showLineNumbers title="Dynamically pass allowed_openai_params in a request"
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"allowed_openai_params": ["tools"]
}
)
```
#### Set allowed_openai_params on config.yaml
You can also set `allowed_openai_params` on the config.yaml file for a specific model. This means that all requests to this deployment are allowed to pass in the `tools` param.
```yaml showLineNumbers title="Set allowed_openai_params on config.yaml"
model_list:
- model_name: azure-o1-preview
litellm_params:
model: azure/o_series/<my-deployment-name>
api_key: xxxxx
api_base: https://openai-prod-test.openai.azure.com/openai/deployments/o1/chat/completions?api-version=2025-01-01-preview
allowed_openai_params: ["tools"]
```
</TabItem>
</Tabs>

View file

@ -3,7 +3,13 @@ import TabItem from '@theme/TabItem';
# Prompt Caching
For OpenAI + Anthropic + Deepseek, LiteLLM follows the OpenAI prompt caching usage object format:
Supported Providers:
- OpenAI (`openai/`)
- Anthropic API (`anthropic/`)
- Bedrock (`bedrock/`, `bedrock/invoke/`, `bedrock/converse`) ([All models bedrock supports prompt caching on](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html))
- Deepseek API (`deepseek/`)
For the supported providers, LiteLLM follows the OpenAI prompt caching usage object format:
```bash
"usage": {
@ -499,4 +505,4 @@ curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \
</TabItem>
</Tabs>
This checks our maintained [model info/cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)

View file

@ -189,4 +189,138 @@ Expected Response
```
</TabItem>
</Tabs>
## Explicitly specify image type
If you have images without a mime-type, or if litellm is incorrectly inferring the mime type of your image (e.g. calling `gs://` URLs with vertex ai), you can set this explicitly via the `format` param.
```python
"image_url": {
"url": "gs://my-gs-image",
"format": "image/jpeg"
}
```
LiteLLM will use this for any API endpoint that supports specifying the mime-type (e.g. anthropic/bedrock/vertex ai).
For others (e.g. openai), it will be ignored.
<Tabs>
<TabItem label="SDK" value="sdk">
```python
import os
from litellm import completion
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
# openai call
response = completion(
model = "claude-3-7-sonnet-latest",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"format": "image/jpeg"
}
}
]
}
],
)
```
</TabItem>
<TabItem label="PROXY" value="proxy">
1. Define vision models on config.yaml
```yaml
model_list:
- model_name: gpt-4-vision-preview # OpenAI gpt-4-vision-preview
litellm_params:
model: openai/gpt-4-vision-preview
api_key: os.environ/OPENAI_API_KEY
- model_name: llava-hf # Custom OpenAI compatible model
litellm_params:
model: openai/llava-hf/llava-v1.6-vicuna-7b-hf
api_base: http://localhost:8000
api_key: fake-key
model_info:
supports_vision: True # set supports_vision to True so /model/info returns this attribute as True
```
2. Run proxy server
```bash
litellm --config config.yaml
```
3. Test it using the OpenAI Python SDK
```python
import os
from openai import OpenAI
client = OpenAI(
api_key="sk-1234", # your litellm proxy api key
)
response = client.chat.completions.create(
model = "gpt-4-vision-preview", # use model="llava-hf" to test your custom OpenAI endpoint
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"format": "image/jpeg"
}
}
]
}
],
)
```
</TabItem>
</Tabs>
## Spec
```
"image_url": str
OR
"image_url": {
"url": "url OR base64 encoded str",
"detail": "openai-only param",
"format": "specify mime-type of image"
}
```

View file

@ -0,0 +1,308 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Using Web Search
Use web search with litellm
| Feature | Details |
|---------|---------|
| Supported Endpoints | - `/chat/completions` <br/> - `/responses` |
| Supported Providers | `openai` |
| LiteLLM Cost Tracking | ✅ Supported |
| LiteLLM Version | `v1.63.15-nightly` or higher |
## `/chat/completions` (litellm.completion)
### Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import completion
response = completion(
model="openai/gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?",
}
],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o-search-preview
litellm_params:
model: openai/gpt-4o-search-preview
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
]
)
```
</TabItem>
</Tabs>
### Search context size
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import completion
# Customize search context size
response = completion(
model="openai/gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?",
}
],
web_search_options={
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
# Customize search context size
response = client.chat.completions.create(
model="gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
web_search_options={
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}
)
```
</TabItem>
</Tabs>
## `/responses` (litellm.responses)
### Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import responses
response = responses(
model="openai/gpt-4o",
input=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
tools=[{
"type": "web_search_preview" # enables web search with default medium context size
}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
response = client.responses.create(
model="gpt-4o",
tools=[{
"type": "web_search_preview"
}],
input="What was a positive news story from today?",
)
print(response.output_text)
```
</TabItem>
</Tabs>
### Search context size
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import responses
# Customize search context size
response = responses(
model="openai/gpt-4o",
input=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
tools=[{
"type": "web_search_preview",
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
# Customize search context size
response = client.responses.create(
model="gpt-4o",
tools=[{
"type": "web_search_preview",
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}],
input="What was a positive news story from today?",
)
print(response.output_text)
```
</TabItem>
</Tabs>
## Checking if a model supports web search
<Tabs>
<TabItem label="SDK" value="sdk">
Use `litellm.supports_web_search(model="openai/gpt-4o-search-preview")` -> returns `True` if model can perform web searches
```python showLineNumbers
assert litellm.supports_web_search(model="openai/gpt-4o-search-preview") == True
```
</TabItem>
<TabItem label="PROXY" value="proxy">
1. Define OpenAI models in config.yaml
```yaml
model_list:
- model_name: gpt-4o-search-preview
litellm_params:
model: openai/gpt-4o-search-preview
api_key: os.environ/OPENAI_API_KEY
model_info:
supports_web_search: True
```
2. Run proxy server
```bash
litellm --config config.yaml
```
3. Call `/model_group/info` to check if a model supports web search
```shell
curl -X 'GET' \
'http://localhost:4000/model_group/info' \
-H 'accept: application/json' \
-H 'x-api-key: sk-1234'
```
Expected Response
```json showLineNumbers
{
"data": [
{
"model_group": "gpt-4o-search-preview",
"providers": ["openai"],
"max_tokens": 128000,
"supports_web_search": true, # 👈 supports_web_search is true
}
]
}
```
</TabItem>
</Tabs>

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Embeddings
# /embeddings
## Quick Start
```python

View file

@ -1,3 +1,5 @@
import Image from '@theme/IdealImage';
# Enterprise
For companies that need SSO, user management and professional support for LiteLLM Proxy
@ -7,6 +9,8 @@ Get free 7-day trial key [here](https://www.litellm.ai/#trial)
Includes all enterprise features.
<Image img={require('../img/enterprise_vs_oss.png')} />
[**Procurement available via AWS / Azure Marketplace**](./data_security.md#legalcompliance-faqs)
@ -34,9 +38,9 @@ You can use our cloud product where we setup a dedicated instance for you.
Professional Support can assist with LLM/Provider integrations, deployment, upgrade management, and LLM Provider troubleshooting. We can't solve your own infrastructure-related issues, but we will guide you to fix them.
- 1 hour for Sev0 issues
- 6 hours for Sev1
- 24h for Sev2-Sev3 between 7am 7pm PT (Monday through Saturday)
- 1 hour for Sev0 issues - 100% production traffic is failing
- 6 hours for Sev1 - <100% production traffic is failing
- 24h for Sev2-Sev3 between 7am - 7pm PT (Monday through Saturday) - setup issues e.g. Redis working on our end, but not on your infrastructure.
- 72h SLA for patching vulnerabilities in the software.
**We can offer custom SLAs** based on your needs and the severity of the issue

View file

@ -0,0 +1,106 @@
# Contributing Code
## **Checklist before submitting a PR**
Here are the core requirements for any PR submitted to LiteLLM
- [ ] Add testing, **Adding at least 1 test is a hard requirement** - [see details](#2-adding-testing-to-your-pr)
- [ ] Ensure your PR passes the following tests:
- [ ] [Unit Tests](#3-running-unit-tests)
- [ ] [Formatting / Linting Tests](#35-running-linting-tests)
- [ ] Keep scope as isolated as possible. As a general rule, your changes should address 1 specific problem at a time
## Quick start
## 1. Setup your local dev environment
Here's how to modify the repo locally:
Step 1: Clone the repo
```shell
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Install dev dependencies:
```shell
poetry install --with dev --extras proxy
```
That's it, your local dev environment is ready!
## 2. Adding Testing to your PR
- Add your test to the [`tests/litellm/` directory](https://github.com/BerriAI/litellm/tree/main/tests/litellm)
- This directory 1:1 maps to the `litellm/` directory, and can only contain mocked tests.
- Do not add real llm api calls to this directory.
### 2.1 File Naming Convention for `tests/litellm/`
The `tests/litellm/` directory follows the same directory structure as `litellm/`.
- `tests/litellm/proxy/test_caching_routes.py` maps to `litellm/proxy/caching_routes.py`
- `test_{filename}.py` maps to `litellm/{filename}.py`
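
For example, a minimal mocked test might look like the sketch below (the file name is hypothetical and would map to a matching `litellm/` module); it patches `litellm.completion` so no real LLM API call is made:

```python
# tests/litellm/test_example.py  (hypothetical; maps to litellm/example.py)
from unittest.mock import patch

import litellm


def test_completion_is_mocked():
    # Patch the network-bound call so the test stays fully offline
    with patch.object(litellm, "completion", return_value={"mocked": True}) as mock_completion:
        result = litellm.completion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "hi"}],
        )
    assert result == {"mocked": True}
    mock_completion.assert_called_once()
```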
## 3. Running Unit Tests
run the following command on the root of the litellm directory
```shell
make test-unit
```
## 3.5 Running Linting Tests
run the following command on the root of the litellm directory
```shell
make lint
```
LiteLLM uses mypy for linting. On ci/cd we also run `black` for formatting.
## 4. Submit a PR with your changes!
- push your fork to your GitHub repo
- submit a PR from there
## Advanced
### Building LiteLLM Docker Image
Some people might want to build the LiteLLM docker image themselves. Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
Step 1: Clone the repo
```shell
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Build the Docker Image
Build using Dockerfile.non_root
```shell
docker build -f docker/Dockerfile.non_root -t litellm_test_image .
```
Step 3: Run the Docker Image
Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
```shell
docker run \
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
-e DATABASE_URL="postgresql://xxxxxxxx" \
-e LITELLM_MASTER_KEY="sk-1234" \
-p 4000:4000 \
litellm_test_image \
--config /app/config.yaml --detailed_debug
```

View file

@ -2,7 +2,7 @@
import TabItem from '@theme/TabItem';
import Tabs from '@theme/Tabs';
# Files API
# /files
Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
@ -14,48 +14,105 @@ Files are used to upload documents that can be used with features like Assistant
- Delete File
- Get File Content
<Tabs>
<TabItem value="proxy" label="LiteLLM PROXY Server">
```bash
$ export OPENAI_API_KEY="sk-..."
### 1. Setup config.yaml
$ litellm
# RUNNING on http://0.0.0.0:4000
```
# for /files endpoints
files_settings:
- custom_llm_provider: azure
api_base: https://exampleopenaiendpoint-production.up.railway.app
api_key: fake-key
api_version: "2023-03-15-preview"
- custom_llm_provider: openai
api_key: os.environ/OPENAI_API_KEY
```
**Upload a File**
### 2. Start LiteLLM PROXY Server
```bash
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="fine-tune" \
-F file="@mydata.jsonl"
litellm --config /path/to/config.yaml
## RUNNING on http://0.0.0.0:4000
```
**List Files**
```bash
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234"
### 3. Use OpenAI's /files endpoints
Upload a File
```python
from openai import OpenAI
client = OpenAI(
api_key="sk-...",
base_url="http://0.0.0.0:4000/v1"
)
client.files.create(
file=wav_data,
purpose="user_data",
extra_body={"custom_llm_provider": "openai"}
)
```
**Retrieve File Information**
```bash
curl http://localhost:4000/v1/files/file-abc123 \
-H "Authorization: Bearer sk-1234"
List Files
```python
from openai import OpenAI
client = OpenAI(
api_key="sk-...",
base_url="http://0.0.0.0:4000/v1"
)
files = client.files.list(extra_body={"custom_llm_provider": "openai"})
print("files=", files)
```
**Delete File**
```bash
curl http://localhost:4000/v1/files/file-abc123 \
-X DELETE \
-H "Authorization: Bearer sk-1234"
Retrieve File Information
```python
from openai import OpenAI
client = OpenAI(
api_key="sk-...",
base_url="http://0.0.0.0:4000/v1"
)
file = client.files.retrieve(file_id="file-abc123", extra_body={"custom_llm_provider": "openai"})
print("file=", file)
```
**Get File Content**
```bash
curl http://localhost:4000/v1/files/file-abc123/content \
-H "Authorization: Bearer sk-1234"
Delete File
```python
from openai import OpenAI
client = OpenAI(
api_key="sk-...",
base_url="http://0.0.0.0:4000/v1"
)
response = client.files.delete(file_id="file-abc123", extra_body={"custom_llm_provider": "openai"})
print("delete response=", response)
```
Get File Content
```python
from openai import OpenAI
client = OpenAI(
api_key="sk-...",
base_url="http://0.0.0.0:4000/v1"
)
content = client.files.content(file_id="file-abc123", extra_body={"custom_llm_provider": "openai"})
print("content=", content)
```
</TabItem>
@ -120,7 +177,7 @@ print("file content=", content)
### [OpenAI](#quick-start)
## [Azure OpenAI](./providers/azure#azure-batches-api)
### [Azure OpenAI](./providers/azure#azure-batches-api)
### [Vertex AI](./providers/vertex#batch-apis)

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [Beta] Fine-tuning API
# /fine_tuning
:::info

View file

@ -0,0 +1,66 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# SSL Security Settings
If you're in an environment using an older TLS bundle, with older encryption, follow this guide.
LiteLLM uses HTTPX for network requests, unless otherwise specified.
1. Disable SSL verification
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.ssl_verify = False
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
ssl_verify: false
```
</TabItem>
<TabItem value="env_var" label="Environment Variables">
```bash
export SSL_VERIFY="False"
```
</TabItem>
</Tabs>
2. Lower security settings
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.ssl_security_level = 1
litellm.ssl_certificate = "/path/to/certificate.pem"
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
ssl_security_level: 1
ssl_certificate: "/path/to/certificate.pem"
```
</TabItem>
<TabItem value="env_var" label="Environment Variables">
```bash
export SSL_SECURITY_LEVEL="1"
export SSL_CERTIFICATE="/path/to/certificate.pem"
```
</TabItem>
</Tabs>

View file

@ -111,8 +111,8 @@ from litellm import completion
import os
# auth: run 'gcloud auth application-default'
os.environ["VERTEX_PROJECT"] = "hardy-device-386718"
os.environ["VERTEX_LOCATION"] = "us-central1"
os.environ["VERTEXAI_PROJECT"] = "hardy-device-386718"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
response = completion(
model="vertex_ai/gemini-1.5-pro",

427
docs/my-website/docs/mcp.md Normal file
View file

@ -0,0 +1,427 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import Image from '@theme/IdealImage';
# /mcp [BETA] - Model Context Protocol
## Expose MCP tools on LiteLLM Proxy Server
This allows you to define tools that can be called by any MCP compatible client. Define your `mcp_servers` with LiteLLM and all your clients can list and call available tools.
<Image
img={require('../img/mcp_2.png')}
style={{width: '100%', display: 'block', margin: '2rem auto'}}
/>
<p style={{textAlign: 'left', color: '#666'}}>
LiteLLM MCP Architecture: Use MCP tools with all LiteLLM supported models
</p>
#### How it works
LiteLLM exposes the following MCP endpoints:
- `/mcp/tools/list` - List all available tools
- `/mcp/tools/call` - Call a specific tool with the provided arguments
When MCP clients connect to LiteLLM they can follow this workflow:
1. Connect to the LiteLLM MCP server
2. List all available tools on LiteLLM
3. Client makes LLM API request with tool call(s)
4. LLM API returns which tools to call and with what arguments
5. MCP client makes MCP tool calls to LiteLLM
6. LiteLLM makes the tool calls to the appropriate MCP server
7. LiteLLM returns the tool call results to the MCP client
#### Usage
#### 1. Define your tools under `mcp_servers` in your config.yaml file.
LiteLLM allows you to define your tools on the `mcp_servers` section in your config.yaml file. All tools listed here will be available to MCP clients (when they connect to LiteLLM and call `list_tools`).
```yaml title="config.yaml" showLineNumbers
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/gpt-4o
api_key: sk-xxxxxxx
mcp_servers:
{
"zapier_mcp": {
"url": "https://actions.zapier.com/mcp/sk-akxxxxx/sse"
},
"fetch": {
"url": "http://localhost:8000/sse"
}
}
```
#### 2. Start LiteLLM Gateway
<Tabs>
<TabItem value="docker" label="Docker Run">
```shell title="Docker Run" showLineNumbers
docker run -d \
-p 4000:4000 \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
--name my-app \
-v $(pwd)/my_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--detailed_debug \
```
</TabItem>
<TabItem value="py" label="litellm pip">
```shell title="litellm pip" showLineNumbers
litellm --config config.yaml --detailed_debug
```
</TabItem>
</Tabs>
#### 3. Make an LLM API request
In this example we will do the following:
1. Use MCP client to list MCP tools on LiteLLM Proxy
2. Use `transform_mcp_tool_to_openai_tool` to convert MCP tools to OpenAI tools
3. Provide the MCP tools to `gpt-4o`
4. Handle tool call from `gpt-4o`
5. Convert OpenAI tool call to MCP tool call
6. Execute tool call on MCP server
```python title="MCP Client List Tools" showLineNumbers
import asyncio
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionUserMessageParam
from mcp import ClientSession
from mcp.client.sse import sse_client
from litellm.experimental_mcp_client.tools import (
transform_mcp_tool_to_openai_tool,
transform_openai_tool_call_request_to_mcp_tool_call_request,
)
async def main():
# Initialize clients
# point OpenAI client to LiteLLM Proxy
client = AsyncOpenAI(api_key="sk-1234", base_url="http://localhost:4000")
# Point MCP client to LiteLLM Proxy
async with sse_client("http://localhost:4000/mcp/") as (read, write):
async with ClientSession(read, write) as session:
await session.initialize()
# 1. List MCP tools on LiteLLM Proxy
mcp_tools = await session.list_tools()
print("List of MCP tools for MCP server:", mcp_tools.tools)
# Create message
messages = [
ChatCompletionUserMessageParam(
content="Send an email about LiteLLM supporting MCP", role="user"
)
]
# 2. Use `transform_mcp_tool_to_openai_tool` to convert MCP tools to OpenAI tools
# Since OpenAI only supports tools in the OpenAI format, we need to convert the MCP tools to the OpenAI format.
openai_tools = [
transform_mcp_tool_to_openai_tool(tool) for tool in mcp_tools.tools
]
# 3. Provide the MCP tools to `gpt-4o`
response = await client.chat.completions.create(
model="gpt-4o",
messages=messages,
tools=openai_tools,
tool_choice="auto",
)
# 4. Handle tool call from `gpt-4o`
if response.choices[0].message.tool_calls:
tool_call = response.choices[0].message.tool_calls[0]
if tool_call:
# 5. Convert OpenAI tool call to MCP tool call
# Since MCP servers expect tools in the MCP format, we need to convert the OpenAI tool call to the MCP format.
# This is done using litellm.experimental_mcp_client.tools.transform_openai_tool_call_request_to_mcp_tool_call_request
mcp_call = (
transform_openai_tool_call_request_to_mcp_tool_call_request(
openai_tool=tool_call.model_dump()
)
)
# 6. Execute tool call on MCP server
result = await session.call_tool(
name=mcp_call.name, arguments=mcp_call.arguments
)
print("Result:", result)
# Run it
asyncio.run(main())
```
## LiteLLM Python SDK MCP Bridge
LiteLLM Python SDK acts as an MCP bridge to utilize MCP tools with all LiteLLM supported models. LiteLLM offers the following features for using MCP:
- **List** Available MCP Tools: OpenAI clients can view all available MCP tools
- `litellm.experimental_mcp_client.load_mcp_tools` to list all available MCP tools
- **Call** MCP Tools: OpenAI clients can call MCP tools
- `litellm.experimental_mcp_client.call_openai_tool` to call an OpenAI tool on an MCP server
### 1. List Available MCP Tools
In this example we'll use `litellm.experimental_mcp_client.load_mcp_tools` to list all available MCP tools on any MCP server. This method can be used in two ways:
- `format="mcp"` - (default) Return MCP tools
- Returns: `mcp.types.Tool`
- `format="openai"` - Return MCP tools converted to OpenAI API compatible tools. Allows using with OpenAI endpoints.
- Returns: `openai.types.chat.ChatCompletionToolParam`
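For contrast, here is what the default `format="mcp"` looks like on its own. This is a minimal sketch, assuming the same local `./mcp_server.py` stdio server used in the examples below; it only prints the native `mcp.types.Tool` names.
```python title="List tools in native MCP format" showLineNumbers
# Minimal sketch (assumes a local ./mcp_server.py stdio server, as in the examples below).
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
from litellm import experimental_mcp_client


async def list_native_tools():
    server_params = StdioServerParameters(command="python3", args=["./mcp_server.py"])
    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # format="mcp" is the default - returns mcp.types.Tool objects
            tools = await experimental_mcp_client.load_mcp_tools(session=session, format="mcp")
            print([tool.name for tool in tools])


asyncio.run(list_native_tools())
```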
<Tabs>
<TabItem value="sdk" label="LiteLLM Python SDK">
```python title="MCP Client List Tools" showLineNumbers
# Create server parameters for stdio connection
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
import os
import json
import litellm
from litellm import experimental_mcp_client
server_params = StdioServerParameters(
command="python3",
# Make sure to update to the full absolute path to your mcp_server.py file
args=["./mcp_server.py"],
)
async with stdio_client(server_params) as (read, write):
async with ClientSession(read, write) as session:
# Initialize the connection
await session.initialize()
# Get tools
tools = await experimental_mcp_client.load_mcp_tools(session=session, format="openai")
print("MCP TOOLS: ", tools)
messages = [{"role": "user", "content": "what's (3 + 5)"}]
llm_response = await litellm.acompletion(
model="gpt-4o",
api_key=os.getenv("OPENAI_API_KEY"),
messages=messages,
tools=tools,
)
print("LLM RESPONSE: ", json.dumps(llm_response, indent=4, default=str))
```
</TabItem>
<TabItem value="openai" label="OpenAI SDK + LiteLLM Proxy">
In this example we'll walk through how you can use the OpenAI SDK pointed to the LiteLLM proxy to call MCP tools. The key difference here is that we use the OpenAI SDK to make the LLM API request.
```python title="MCP Client List Tools" showLineNumbers
# Create server parameters for stdio connection
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
import os
from openai import OpenAI
from litellm import experimental_mcp_client
server_params = StdioServerParameters(
command="python3",
# Make sure to update to the full absolute path to your mcp_server.py file
args=["./mcp_server.py"],
)
async with stdio_client(server_params) as (read, write):
async with ClientSession(read, write) as session:
# Initialize the connection
await session.initialize()
# Get tools using litellm mcp client
tools = await experimental_mcp_client.load_mcp_tools(session=session, format="openai")
print("MCP TOOLS: ", tools)
# Use OpenAI SDK pointed to LiteLLM proxy
client = OpenAI(
api_key="your-api-key", # Your LiteLLM proxy API key
base_url="http://localhost:4000" # Your LiteLLM proxy URL
)
messages = [{"role": "user", "content": "what's (3 + 5)"}]
llm_response = client.chat.completions.create(
model="gpt-4",
messages=messages,
tools=tools
)
print("LLM RESPONSE: ", llm_response)
```
</TabItem>
</Tabs>
### 2. List and Call MCP Tools
In this example we'll use
- `litellm.experimental_mcp_client.load_mcp_tools` to list all available MCP tools on any MCP server
- `litellm.experimental_mcp_client.call_openai_tool` to call an OpenAI tool on an MCP server
The first LLM response returns a list of OpenAI tool calls. We take the first tool call from the LLM response and pass it to `litellm.experimental_mcp_client.call_openai_tool` to call the tool on the MCP server.
#### How `litellm.experimental_mcp_client.call_openai_tool` works
- Accepts an OpenAI Tool Call from the LLM response
- Converts the OpenAI Tool Call to an MCP Tool
- Calls the MCP Tool on the MCP server
- Returns the result of the MCP Tool call
<Tabs>
<TabItem value="sdk" label="LiteLLM Python SDK">
```python title="MCP Client List and Call Tools" showLineNumbers
# Create server parameters for stdio connection
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
import os
import json
import litellm
from litellm import experimental_mcp_client
server_params = StdioServerParameters(
command="python3",
# Make sure to update to the full absolute path to your mcp_server.py file
args=["./mcp_server.py"],
)
async with stdio_client(server_params) as (read, write):
async with ClientSession(read, write) as session:
# Initialize the connection
await session.initialize()
# Get tools
tools = await experimental_mcp_client.load_mcp_tools(session=session, format="openai")
print("MCP TOOLS: ", tools)
messages = [{"role": "user", "content": "what's (3 + 5)"}]
llm_response = await litellm.acompletion(
model="gpt-4o",
api_key=os.getenv("OPENAI_API_KEY"),
messages=messages,
tools=tools,
)
print("LLM RESPONSE: ", json.dumps(llm_response, indent=4, default=str))
openai_tool = llm_response["choices"][0]["message"]["tool_calls"][0]
# Call the tool using MCP client
call_result = await experimental_mcp_client.call_openai_tool(
session=session,
openai_tool=openai_tool,
)
print("MCP TOOL CALL RESULT: ", call_result)
# send the tool result to the LLM
messages.append(llm_response["choices"][0]["message"])
messages.append(
{
"role": "tool",
"content": str(call_result.content[0].text),
"tool_call_id": openai_tool["id"],
}
)
print("final messages with tool result: ", messages)
llm_response = await litellm.acompletion(
model="gpt-4o",
api_key=os.getenv("OPENAI_API_KEY"),
messages=messages,
tools=tools,
)
print(
"FINAL LLM RESPONSE: ", json.dumps(llm_response, indent=4, default=str)
)
```
</TabItem>
<TabItem value="proxy" label="OpenAI SDK + LiteLLM Proxy">
In this example we'll walk through how you can use the OpenAI SDK pointed to the LiteLLM proxy to call MCP tools. The key difference here is that we use the OpenAI SDK to make the LLM API request.
```python title="MCP Client with OpenAI SDK" showLineNumbers
# Create server parameters for stdio connection
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
import os
from openai import OpenAI
from litellm import experimental_mcp_client
server_params = StdioServerParameters(
command="python3",
# Make sure to update to the full absolute path to your mcp_server.py file
args=["./mcp_server.py"],
)
async with stdio_client(server_params) as (read, write):
async with ClientSession(read, write) as session:
# Initialize the connection
await session.initialize()
# Get tools using litellm mcp client
tools = await experimental_mcp_client.load_mcp_tools(session=session, format="openai")
print("MCP TOOLS: ", tools)
# Use OpenAI SDK pointed to LiteLLM proxy
client = OpenAI(
api_key="your-api-key", # Your LiteLLM proxy API key
base_url="http://localhost:8000" # Your LiteLLM proxy URL
)
messages = [{"role": "user", "content": "what's (3 + 5)"}]
llm_response = client.chat.completions.create(
model="gpt-4",
messages=messages,
tools=tools
)
print("LLM RESPONSE: ", llm_response)
# Get the first tool call
tool_call = llm_response.choices[0].message.tool_calls[0]
# Call the tool using MCP client
call_result = await experimental_mcp_client.call_openai_tool(
session=session,
openai_tool=tool_call.model_dump(),
)
print("MCP TOOL CALL RESULT: ", call_result)
# Send the tool result back to the LLM
messages.append(llm_response.choices[0].message.model_dump())
messages.append({
"role": "tool",
"content": str(call_result.content[0].text),
"tool_call_id": tool_call.id,
})
final_response = client.chat.completions.create(
model="gpt-4",
messages=messages,
tools=tools
)
print("FINAL RESPONSE: ", final_response)
```
</TabItem>
</Tabs>

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Moderation
# /moderations
### Usage
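A minimal SDK sketch for this endpoint (assumptions: `OPENAI_API_KEY` is set, and the model name shown is just one example moderation model):
```python
import os
import litellm

os.environ["OPENAI_API_KEY"] = "sk-..."  # assumption: an OpenAI key is configured

response = litellm.moderation(
    input="this is a test input",       # hypothetical sample input
    model="omni-moderation-latest",     # assumption: any OpenAI moderation model name
)
print(response)
```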

View file

@ -1,4 +1,7 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Arize AI
@ -11,6 +14,8 @@ https://github.com/BerriAI/litellm
:::
<Image img={require('../../img/arize.png')} />
## Pre-Requisites
@ -24,7 +29,9 @@ You can also use the instrumentor option instead of the callback, which you can
```python
litellm.callbacks = ["arize"]
```
```python
import litellm
import os
@ -48,7 +55,7 @@ response = litellm.completion(
### Using with LiteLLM Proxy
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4
@ -60,13 +67,134 @@ model_list:
litellm_settings:
callbacks: ["arize"]
general_settings:
master_key: "sk-1234" # can also be set as an environment variable
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT, or neither (defaults to https://otlp.arize.com/v1 over gRPC)
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{ "model": "gpt-4", "messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}]}'
```
## Pass Arize Space/Key per-request
Supported parameters:
- `arize_api_key`
- `arize_space_key`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set arize as a callback, litellm will send the data to arize
litellm.callbacks = ["arize"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
arize_api_key=os.getenv("ARIZE_SPACE_2_API_KEY"),
arize_space_key=os.getenv("ARIZE_SPACE_2_KEY"),
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
general_settings:
master_key: "sk-1234" # can also be set as an environment variable
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
<Tabs>
<TabItem value="curl" label="CURL">
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}],
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
"arize_space_key": "ARIZE_SPACE_2_KEY"
}'
```
</TabItem>
<TabItem value="openai_python" label="OpenAI Python">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
"arize_space_key": "ARIZE_SPACE_2_KEY"
}
)
print(response)
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)

View file

@ -78,6 +78,9 @@ Following are the allowed fields in metadata, their types, and their description
* `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
* `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
* `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
* `tags: Optional[list]` - This is a list of tags. This is useful for segmenting inference calls by tags.
* `user_feedback: Optional[str]` - The end user's feedback.
* `model_options: Optional[dict]` - This is a dictionary of model options. This is useful for getting insights into how model behavior affects your end users.
* `custom_attributes: Optional[dict]` - This is a dictionary of custom attributes. This is useful for additional information about the inference.
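For example, these fields can be attached through the `metadata` parameter on a completion call. A minimal sketch, assuming the Athina callback is configured as shown earlier on this page and the values below are placeholders:
```python
import litellm

litellm.success_callback = ["athina"]  # assumes ATHINA_API_KEY is set in the environment

response = litellm.completion(
    model="gpt-4o",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    metadata={
        "tags": ["production", "faq-bot"],           # segment inference calls by tag
        "user_feedback": "thumbs_up",                # end user's feedback
        "model_options": {"temperature": 0.2},       # model options used for this call
        "custom_attributes": {"tenant_id": "acme"},  # any additional information
    },
)
```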
## Using a self hosted deployment of Athina

View file

@ -15,6 +15,91 @@ Pass-through endpoints for Vertex AI - call provider-specific endpoint, in nativ
Just replace `https://REGION-aiplatform.googleapis.com` with `LITELLM_PROXY_BASE_URL/vertex_ai`
LiteLLM supports 3 flows for calling Vertex AI endpoints via pass-through:
1. **Specific Credentials**: Admin sets passthrough credentials for a specific project/region.
2. **Default Credentials**: Admin sets default credentials.
3. **Client-Side Credentials**: User can send client-side credentials through to Vertex AI (default behavior - if no default or mapped credentials are found, the request is passed through directly).
## Example Usage
<Tabs>
<TabItem value="specific_credentials" label="Specific Project/Region">
```yaml
model_list:
- model_name: gemini-1.0-pro
litellm_params:
model: vertex_ai/gemini-1.0-pro
vertex_project: adroit-crow-413218
vertex_region: us-central1
vertex_credentials: /path/to/credentials.json
use_in_pass_through: true # 👈 KEY CHANGE
```
</TabItem>
<TabItem value="default_credentials" label="Default Credentials">
<Tabs>
<TabItem value="yaml" label="Set in config.yaml">
```yaml
default_vertex_config:
vertex_project: adroit-crow-413218
vertex_region: us-central1
vertex_credentials: /path/to/credentials.json
```
</TabItem>
<TabItem value="env_var" label="Set in environment variables">
```bash
export DEFAULT_VERTEXAI_PROJECT="adroit-crow-413218"
export DEFAULT_VERTEXAI_LOCATION="us-central1"
export DEFAULT_GOOGLE_APPLICATION_CREDENTIALS="/path/to/credentials.json"
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="client_credentials" label="Client Credentials">
Try Gemini 2.0 Flash (curl)
```bash
MODEL_ID="gemini-2.0-flash-001"
PROJECT_ID="YOUR_PROJECT_ID"
```
```bash
curl \
-X POST \
-H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
-H "Content-Type: application/json" \
"${LITELLM_PROXY_BASE_URL}/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/${MODEL_ID}:streamGenerateContent" -d \
$'{
"contents": {
"role": "user",
"parts": [
{
"fileData": {
"mimeType": "image/png",
"fileUri": "gs://generativeai-downloads/images/scones.jpg"
}
},
{
"text": "Describe this picture."
}
]
}
}'
```
</TabItem>
</Tabs>
#### **Example Usage**
@ -22,7 +107,7 @@ Just replace `https://REGION-aiplatform.googleapis.com` with `LITELLM_PROXY_BASE
<TabItem value="curl" label="curl">
```bash
curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.0-pro:generateContent \
curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/${MODEL_ID}:generateContent \
-H "Content-Type: application/json" \
-H "x-litellm-api-key: Bearer sk-1234" \
-d '{
@ -101,7 +186,7 @@ litellm
Let's call the Google AI Studio token counting endpoint
```bash
curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.0-pro:generateContent \
curl http://localhost:4000/vertex-ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/gemini-1.0-pro:generateContent \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
@ -140,7 +225,7 @@ LiteLLM Proxy Server supports two methods of authentication to Vertex AI:
```shell
curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.5-flash-001:generateContent \
curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/gemini-1.5-flash-001:generateContent \
-H "Content-Type: application/json" \
-H "x-litellm-api-key: Bearer sk-1234" \
-d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}'
@ -152,7 +237,7 @@ curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.5-flash-0
```shell
curl http://localhost:4000/vertex_ai/publishers/google/models/textembedding-gecko@001:predict \
curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/textembedding-gecko@001:predict \
-H "Content-Type: application/json" \
-H "x-litellm-api-key: Bearer sk-1234" \
-d '{"instances":[{"content": "gm"}]}'
@ -162,7 +247,7 @@ curl http://localhost:4000/vertex_ai/publishers/google/models/textembedding-geck
### Imagen API
```shell
curl http://localhost:4000/vertex_ai/publishers/google/models/imagen-3.0-generate-001:predict \
curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/imagen-3.0-generate-001:predict \
-H "Content-Type: application/json" \
-H "x-litellm-api-key: Bearer sk-1234" \
-d '{"instances":[{"prompt": "make an otter"}], "parameters": {"sampleCount": 1}}'
@ -172,7 +257,7 @@ curl http://localhost:4000/vertex_ai/publishers/google/models/imagen-3.0-generat
### Count Tokens API
```shell
curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.5-flash-001:countTokens \
curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/gemini-1.5-flash-001:countTokens \
-H "Content-Type: application/json" \
-H "x-litellm-api-key: Bearer sk-1234" \
-d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}'
@ -183,7 +268,7 @@ Create Fine Tuning Job
```shell
curl http://localhost:4000/vertex_ai/tuningJobs \
curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/tuningJobs \
-H "Content-Type: application/json" \
-H "x-litellm-api-key: Bearer sk-1234" \
-d '{
@ -243,7 +328,7 @@ Expected Response
```bash
curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.0-pro:generateContent \
curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/gemini-1.0-pro:generateContent \
-H "Content-Type: application/json" \
-H "x-litellm-api-key: Bearer sk-1234" \
-d '{
@ -268,7 +353,7 @@ tags: ["vertex-js-sdk", "pass-through-endpoint"]
<TabItem value="curl" label="curl">
```bash
curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.0-pro:generateContent \
curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/gemini-1.0-pro:generateContent \
-H "Content-Type: application/json" \
-H "x-litellm-api-key: Bearer sk-1234" \
-H "tags: vertex-js-sdk,pass-through-endpoint" \

View file

@ -0,0 +1,5 @@
PDL - A YAML-based approach to prompt programming
Github: https://github.com/IBM/prompt-declaration-language
PDL is a declarative approach to prompt programming, helping users to accumulate messages implicitly, with support for model chaining and tool use.

View file

@ -0,0 +1,9 @@
# pgai
[pgai](https://github.com/timescale/pgai) is a suite of tools to develop RAG, semantic search, and other AI applications more easily with PostgreSQL.
If you don't know what pgai is yet, check out the [README](https://github.com/timescale/pgai)!
If you're already familiar with pgai, you can find the LiteLLM-specific docs here:
- Litellm for [model calling](https://github.com/timescale/pgai/blob/main/docs/model_calling/litellm.md) in pgai
- Use the [litellm provider](https://github.com/timescale/pgai/blob/main/docs/vectorizer/api-reference.md#aiembedding_litellm) to automatically create embeddings for your data via the pgai vectorizer.

View file

@ -821,6 +821,14 @@ print(f"\nResponse: {resp}")
## Usage - Thinking / `reasoning_content`
LiteLLM translates OpenAI's `reasoning_effort` to Anthropic's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/23051d89dd3611a81617d84277059cd88b2df511/litellm/llms/anthropic/chat/transformation.py#L298)
| reasoning_effort | thinking |
| ---------------- | -------- |
| "low" | "budget_tokens": 1024 |
| "medium" | "budget_tokens": 2048 |
| "high" | "budget_tokens": 4096 |
<Tabs>
<TabItem value="sdk" label="SDK">
@ -830,7 +838,7 @@ from litellm import completion
resp = completion(
model="anthropic/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
reasoning_effort="low",
)
```
@ -863,7 +871,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
-d '{
"model": "claude-3-7-sonnet-20250219",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
"reasoning_effort": "low"
}'
```
@ -927,6 +935,44 @@ ModelResponse(
)
```
### Pass `thinking` to Anthropic models
You can also pass the `thinking` parameter to Anthropic models.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = litellm.completion(
model="anthropic/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "anthropic/claude-3-7-sonnet-20250219",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
## **Passing Extra Headers to Anthropic API**
Pass `extra_headers: dict` to `litellm.completion`
@ -1035,8 +1081,10 @@ response = completion(
"content": [
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
{
"type": "image_url",
"image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
"type": "file",
"file": {
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
}
},
],
}
@ -1081,8 +1129,10 @@ curl http://0.0.0.0:4000/v1/chat/completions \
"text": "You are a very professional document summarization specialist. Please summarize the given document"
},
{
"type": "image_url",
"image_url": "data:application/pdf;base64,{encoded_file}" # 👈 PDF
"type": "file",
"file": {
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
}
}
}
]

View file

@ -291,14 +291,15 @@ response = completion(
)
```
## Azure O1 Models
## O-Series Models
| Model Name | Function Call |
|---------------------|----------------------------------------------------|
| o1-mini | `response = completion(model="azure/<your deployment name>", messages=messages)` |
| o1-preview | `response = completion(model="azure/<your deployment name>", messages=messages)` |
Azure OpenAI O-Series models are supported on LiteLLM.
Set `litellm.enable_preview_features = True` to use Azure O1 Models with streaming support.
LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
**Automatic Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
@ -306,60 +307,112 @@ Set `litellm.enable_preview_features = True` to use Azure O1 Models with streami
```python
import litellm
litellm.enable_preview_features = True # 👈 KEY CHANGE
response = litellm.completion(
model="azure/<your deployment name>",
messages=[{"role": "user", "content": "What is the weather like in Boston?"}],
stream=True
)
for chunk in response:
print(chunk)
litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="Proxy">
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: o1-mini
- model_name: o3-mini
litellm_params:
model: azure/o1-mini
api_base: "os.environ/AZURE_API_BASE"
api_key: "os.environ/AZURE_API_KEY"
api_version: "os.environ/AZURE_API_VERSION"
litellm_settings:
enable_preview_features: true # 👈 KEY CHANGE
model: azure/o3-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
2. Start proxy
</TabItem>
</Tabs>
**Explicit Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o_series/my-random-deployment-name
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
## Azure Audio Model
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
response = completion(
model="azure/azure-openai-4o-audio",
messages=[
{
"role": "user",
"content": "I want to try out speech to speech"
}
],
modalities=["text","audio"],
audio={"voice": "alloy", "format": "wav"}
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: azure-openai-4o-audio
litellm_params:
model: azure/azure-openai-4o-audio
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it
3. Test it!
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="o1-mini", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
stream=True)
for chunk in response:
print(chunk)
```bash
curl http://localhost:4000/v1/chat/completions \
-H "Authorization: Bearer $LITELLM_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "azure-openai-4o-audio",
"messages": [{"role": "user", "content": "I want to try out speech to speech"}],
"modalities": ["text","audio"],
"audio": {"voice": "alloy", "format": "wav"}
}'
```
</TabItem>
</Tabs>
@ -948,62 +1001,9 @@ Expected Response:
{"data":[{"id":"batch_R3V...}
```
## O-Series Models
Azure OpenAI O-Series models are supported on LiteLLM.
LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
**Automatic Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o3-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
**Explicit Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o_series/my-random-deployment-name
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
@ -1076,32 +1076,24 @@ print(response)
```
### Parallel Function calling
### Tool Calling / Function Calling
See a detailed walkthrough of parallel function calling with litellm [here](https://docs.litellm.ai/docs/completion/function_call)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
# set Azure env variables
import os
import litellm
import json
os.environ['AZURE_API_KEY'] = "" # litellm reads AZURE_API_KEY from .env and sends the request
os.environ['AZURE_API_BASE'] = "https://openai-gpt-4-test-v-1.openai.azure.com/"
os.environ['AZURE_API_VERSION'] = "2023-07-01-preview"
import litellm
import json
# Example dummy function hard coded to return the same weather
# In production, this could be your backend API or an external API
def get_current_weather(location, unit="fahrenheit"):
"""Get the current weather in a given location"""
if "tokyo" in location.lower():
return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
elif "san francisco" in location.lower():
return json.dumps({"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"})
elif "paris" in location.lower():
return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
else:
return json.dumps({"location": location, "temperature": "unknown"})
## Step 1: send the conversation and available functions to the model
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
tools = [
{
"type": "function",
@ -1125,7 +1117,7 @@ tools = [
response = litellm.completion(
model="azure/chatgpt-functioncalling", # model = azure/<your-azure-deployment-name>
messages=messages,
messages=[{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}],
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
)
@ -1134,8 +1126,49 @@ response_message = response.choices[0].message
tool_calls = response.choices[0].message.tool_calls
print("\nTool Choice:\n", tool_calls)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/chatgpt-functioncalling
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
```
2. Start proxy
```bash
litellm --config config.yaml
```
3. Test it
```bash
curl -L -X POST 'http://localhost:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "azure-gpt-3.5",
"messages": [
{
"role": "user",
"content": "Hey, how'\''s it going? Thinking long and hard before replying - what is the meaning of the world and life itself"
}
]
}'
```
</TabItem>
</Tabs>
### Spend Tracking for Azure OpenAI Models (PROXY)
Set the base model for cost tracking on Azure image generation calls

File diff suppressed because it is too large

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🆕 Databricks
# Databricks
LiteLLM supports all models on Databricks
@ -154,7 +154,205 @@ response = completion(
temperature: 0.5
```
## Passings Databricks specific params - 'instruction'
## Usage - Thinking / `reasoning_content`
LiteLLM translates OpenAI's `reasoning_effort` to Anthropic's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/23051d89dd3611a81617d84277059cd88b2df511/litellm/llms/anthropic/chat/transformation.py#L298)
| reasoning_effort | thinking |
| ---------------- | -------- |
| "low" | "budget_tokens": 1024 |
| "medium" | "budget_tokens": 2048 |
| "high" | "budget_tokens": 4096 |
Known Limitations:
- Support for passing thinking blocks back to Claude [Issue](https://github.com/BerriAI/litellm/issues/9790)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
# set ENV variables (can also be passed in to .completion() - e.g. `api_base`, `api_key`)
os.environ["DATABRICKS_API_KEY"] = "databricks key"
os.environ["DATABRICKS_API_BASE"] = "databricks base url"
resp = completion(
model="databricks/databricks-claude-3-7-sonnet",
messages=[{"role": "user", "content": "What is the capital of France?"}],
reasoning_effort="low",
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: claude-3-7-sonnet
litellm_params:
model: databricks/databricks-claude-3-7-sonnet
api_key: os.environ/DATABRICKS_API_KEY
api_base: os.environ/DATABRICKS_API_BASE
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "claude-3-7-sonnet",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"reasoning_effort": "low"
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='claude-3-7-sonnet-20250219',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
provider_specific_fields={
'citations': None,
'thinking_blocks': [
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6...'
}
]
}
),
thinking_blocks=[
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6AGB...'
}
],
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
### Pass `thinking` to Anthropic models
You can also pass the `thinking` parameter to Anthropic models.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
# set ENV variables (can also be passed in to .completion() - e.g. `api_base`, `api_key`)
os.environ["DATABRICKS_API_KEY"] = "databricks key"
os.environ["DATABRICKS_API_BASE"] = "databricks base url"
response = litellm.completion(
model="databricks/databricks-claude-3-7-sonnet",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "databricks/databricks-claude-3-7-sonnet",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
## Supported Databricks Chat Completion Models
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
| Model Name | Command |
|----------------------------|------------------------------------------------------------------|
| databricks/databricks-claude-3-7-sonnet | `completion(model='databricks/databricks-claude-3-7-sonnet', messages=messages)` |
| databricks-meta-llama-3-1-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-70b-instruct', messages=messages)` |
| databricks-meta-llama-3-1-405b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-405b-instruct', messages=messages)` |
| databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` |
| databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` |
| databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` |
| databricks-mixtral-8x7b-instruct | `completion(model='databricks/databricks-mixtral-8x7b-instruct', messages=messages)` |
| databricks-mpt-30b-instruct | `completion(model='databricks/databricks-mpt-30b-instruct', messages=messages)` |
| databricks-mpt-7b-instruct | `completion(model='databricks/databricks-mpt-7b-instruct', messages=messages)` |
## Embedding Models
### Passing Databricks specific params - 'instruction'
For embedding models, databricks lets you pass in an additional param 'instruction'. [Full Spec](https://github.com/BerriAI/litellm/blob/43353c28b341df0d9992b45c6ce464222ebd7984/litellm/llms/databricks.py#L164)
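A minimal sketch of passing `instruction` on an embedding call (assumptions: `DATABRICKS_API_KEY`/`DATABRICKS_API_BASE` are set, and the model name is a placeholder for any Databricks embedding endpoint):
```python
import os
import litellm

os.environ["DATABRICKS_API_KEY"] = "databricks key"
os.environ["DATABRICKS_API_BASE"] = "databricks base url"

response = litellm.embedding(
    model="databricks/databricks-bge-large-en",  # placeholder: any Databricks embedding model
    input=["good morning from litellm"],
    instruction="Represent this sentence for searching relevant passages:",  # Databricks-specific param
)
print(response)
```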
@ -187,27 +385,6 @@ response = litellm.embedding(
instruction: "Represent this sentence for searching relevant passages:"
```
## Supported Databricks Chat Completion Models
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
| Model Name | Command |
|----------------------------|------------------------------------------------------------------|
| databricks-meta-llama-3-1-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-70b-instruct', messages=messages)` |
| databricks-meta-llama-3-1-405b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-405b-instruct', messages=messages)` |
| databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` |
| databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` |
| databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` |
| databricks-mixtral-8x7b-instruct | `completion(model='databricks/databricks-mixtral-8x7b-instruct', messages=messages)` |
| databricks-mpt-30b-instruct | `completion(model='databricks/databricks-mpt-30b-instruct', messages=messages)` |
| databricks-mpt-7b-instruct | `completion(model='databricks/databricks-mpt-7b-instruct', messages=messages)` |
## Supported Databricks Embedding Models
:::tip

View file

@ -365,7 +365,7 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
</Tabs>
## Specifying Safety Settings
In certain use-cases you may need to make calls to the models and pass [safety settigns](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simple pass the `safety_settings` argument to `completion` or `acompletion`. For example:
In certain use cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simply pass the `safety_settings` argument to `completion` or `acompletion`. For example:
```python
response = completion(
@ -438,6 +438,179 @@ assert isinstance(
```
### Google Search Tool
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["GEMINI_API_KEY"] = ".."
tools = [{"googleSearch": {}}] # 👈 ADD GOOGLE SEARCH
response = completion(
model="gemini/gemini-2.0-flash",
messages=[{"role": "user", "content": "What is the weather in San Francisco?"}],
tools=tools,
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gemini-2.0-flash
litellm_params:
model: gemini/gemini-2.0-flash
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```bash
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-2.0-flash",
"messages": [{"role": "user", "content": "What is the weather in San Francisco?"}],
"tools": [{"googleSearch": {}}]
}
'
```
</TabItem>
</Tabs>
### Google Search Retrieval
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["GEMINI_API_KEY"] = ".."
tools = [{"googleSearchRetrieval": {}}] # 👈 ADD GOOGLE SEARCH
response = completion(
model="gemini/gemini-2.0-flash",
messages=[{"role": "user", "content": "What is the weather in San Francisco?"}],
tools=tools,
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gemini-2.0-flash
litellm_params:
model: gemini/gemini-2.0-flash
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```bash
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-2.0-flash",
"messages": [{"role": "user", "content": "What is the weather in San Francisco?"}],
"tools": [{"googleSearchRetrieval": {}}]
}
'
```
</TabItem>
</Tabs>
### Code Execution Tool
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["GEMINI_API_KEY"] = ".."
tools = [{"codeExecution": {}}] # 👈 ADD GOOGLE SEARCH
response = completion(
model="gemini/gemini-2.0-flash",
messages=[{"role": "user", "content": "What is the weather in San Francisco?"}],
tools=tools,
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gemini-2.0-flash
litellm_params:
model: gemini/gemini-2.0-flash
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```bash
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-2.0-flash",
"messages": [{"role": "user", "content": "What is the weather in San Francisco?"}],
"tools": [{"codeExecution": {}}]
}
'
```
</TabItem>
</Tabs>
## JSON Mode
<Tabs>
@ -589,8 +762,10 @@ response = litellm.completion(
"content": [
{"type": "text", "text": "Please summarize the audio."},
{
"type": "image_url",
"image_url": "data:audio/mp3;base64,{}".format(encoded_data), # 👈 SET MIME_TYPE + DATA
"type": "file",
"file": {
"file_data": "data:audio/mp3;base64,{}".format(encoded_data), # 👈 SET MIME_TYPE + DATA
}
},
],
}
@ -640,8 +815,11 @@ response = litellm.completion(
"content": [
{"type": "text", "text": "Please summarize the file."},
{
"type": "image_url",
"image_url": "https://storage..." # 👈 SET THE IMG URL
"type": "file",
"file": {
"file_id": "https://storage...", # 👈 SET THE IMG URL
"format": "application/pdf" # OPTIONAL
}
},
],
}
@ -668,8 +846,11 @@ response = litellm.completion(
"content": [
{"type": "text", "text": "Please summarize the file."},
{
"type": "image_url",
"image_url": "gs://..." # 👈 SET THE cloud storage bucket url
"type": "file",
"file": {
"file_id": "gs://storage...", # 👈 SET THE IMG URL
"format": "application/pdf" # OPTIONAL
}
},
],
}
@ -879,3 +1060,54 @@ response = await client.chat.completions.create(
</TabItem>
</Tabs>
## Image Generation
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
response = completion(
model="gemini/gemini-2.0-flash-exp-image-generation",
messages=[{"role": "user", "content": "Generate an image of a cat"}],
modalities=["image", "text"],
)
assert response.choices[0].message.content is not None # "data:image/png;base64,e4rr.."
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gemini-2.0-flash-exp-image-generation
litellm_params:
model: gemini/gemini-2.0-flash-exp-image-generation
api_key: os.environ/GEMINI_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -L -X POST 'http://localhost:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-2.0-flash-exp-image-generation",
"messages": [{"role": "user", "content": "Generate an image of a cat"}],
"modalities": ["image", "text"]
}'
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,161 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [BETA] Google AI Studio (Gemini) Files API
Use this to upload files to Google AI Studio (Gemini).
Useful to pass in large media files to Gemini's `/generateContent` endpoint.
| Action | Supported |
|----------|-----------|
| `create` | Yes |
| `delete` | No |
| `retrieve` | No |
| `list` | No |
## Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import base64
import requests
from litellm import completion, create_file
import os
### UPLOAD FILE ###
# Fetch the audio file and convert it to a base64 encoded string
url = "https://cdn.openai.com/API/docs/audio/alloy.wav"
response = requests.get(url)
response.raise_for_status()
wav_data = response.content
encoded_string = base64.b64encode(wav_data).decode('utf-8')
file = create_file(
file=wav_data,
purpose="user_data",
extra_body={"custom_llm_provider": "gemini"},
api_key=os.getenv("GEMINI_API_KEY"),
)
print(f"file: {file}")
assert file is not None
### GENERATE CONTENT ###
response = completion(
model="gemini-2.0-flash",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is in this recording?"
},
{
"type": "file",
"file": {
"file_id": file.id,
"filename": "my-test-name",
"format": "audio/wav"
}
}
]
},
]
)
print(response.choices[0].message)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: "gemini-2.0-flash"
litellm_params:
model: gemini/gemini-2.0-flash
api_key: os.environ/GEMINI_API_KEY
```
2. Start proxy
```bash
litellm --config config.yaml
```
3. Test it
```python
import base64
import requests
from openai import OpenAI
client = OpenAI(
base_url="http://0.0.0.0:4000",
api_key="sk-1234"
)
# Fetch the audio file and convert it to a base64 encoded string
url = "https://cdn.openai.com/API/docs/audio/alloy.wav"
response = requests.get(url)
response.raise_for_status()
wav_data = response.content
encoded_string = base64.b64encode(wav_data).decode('utf-8')
file = client.files.create(
file=wav_data,
purpose="user_data",
extra_body={"target_model_names": "gemini-2.0-flash"}
)
print(f"file: {file}")
assert file is not None
completion = client.chat.completions.create(
model="gemini-2.0-flash",
modalities=["text", "audio"],
audio={"voice": "alloy", "format": "wav"},
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is in this recording?"
},
{
"type": "file",
"file": {
"file_id": file.id,
"filename": "my-test-name",
"format": "audio/wav"
}
}
]
},
],
extra_body={"drop_params": True}
)
print(completion.choices[0].message)
```
</TabItem>
</Tabs>

View file

@ -2,466 +2,392 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Huggingface
# Hugging Face
LiteLLM supports running inference across multiple services for models hosted on the Hugging Face Hub.
LiteLLM supports the following types of Hugging Face models:
- **Serverless Inference Providers** - Hugging Face offers easy and unified access to serverless AI inference through multiple inference providers, such as [Together AI](https://together.ai) and [Sambanova](https://sambanova.ai). This is the fastest way to integrate AI into your products with a maintenance-free and scalable solution. More details in the [Inference Providers documentation](https://huggingface.co/docs/inference-providers/index).
- **Dedicated Inference Endpoints** - which is a product to easily deploy models to production. Inference is run by Hugging Face in a dedicated, fully managed infrastructure on a cloud provider of your choice. You can deploy your model on Hugging Face Inference Endpoints by following [these steps](https://huggingface.co/docs/inference-endpoints/guides/create_endpoint).
- Serverless Inference API (free) - loaded and ready to use: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation
- Dedicated Inference Endpoints (paid) - manual deployment: https://ui.endpoints.huggingface.co/
- All LLMs served via Hugging Face's Inference use [Text-generation-inference](https://huggingface.co/docs/text-generation-inference).
## Supported Models
### Serverless Inference Providers
You can check available models for an inference provider by going to [huggingface.co/models](https://huggingface.co/models), clicking the "Other" filter tab, and selecting your desired provider:
![Filter models by Inference Provider](../../img/hf_filter_inference_providers.png)
For example, you can find all Fireworks supported models [here](https://huggingface.co/models?inference_provider=fireworks-ai&sort=trending).
### Dedicated Inference Endpoints
Refer to the [Inference Endpoints catalog](https://endpoints.huggingface.co/catalog) for a list of available models.
## Usage
<Tabs>
<TabItem value="serverless" label="Serverless Inference Providers">
### Authentication
With a single Hugging Face token, you can access inference through multiple providers. Your calls are routed through Hugging Face and the usage is billed directly to your Hugging Face account at the standard provider API rates.
Simply set the `HF_TOKEN` environment variable with your Hugging Face token. You can create one here: https://huggingface.co/settings/tokens.
```bash
export HF_TOKEN="hf_xxxxxx"
```
or alternatively, you can pass your Hugging Face token as a parameter:
```python
completion(..., api_key="hf_xxxxxx")
```
### Getting Started
To use a Hugging Face model, specify both the provider and model you want to use in the following format:
```
huggingface/<provider>/<hf_org_or_user>/<hf_model>
```
Where `<hf_org_or_user>/<hf_model>` is the Hugging Face model ID and `<provider>` is the inference provider.
By default, if you don't specify a provider, LiteLLM will use the [HF Inference API](https://huggingface.co/docs/api-inference/en/index).
Examples:
```python
# Run DeepSeek-R1 inference through Together AI
completion(model="huggingface/together/deepseek-ai/DeepSeek-R1",...)
# Run Qwen2.5-72B-Instruct inference through Sambanova
completion(model="huggingface/sambanova/Qwen/Qwen2.5-72B-Instruct",...)
# Run Llama-3.3-70B-Instruct inference through HF Inference API
completion(model="huggingface/meta-llama/Llama-3.3-70B-Instruct",...)
```
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/LiteLLM_HuggingFace.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
You need to tell LiteLLM when you're calling Huggingface.
This is done by adding the "huggingface/" prefix to `model`, example `completion(model="huggingface/<model_name>",...)`.
<Tabs>
<TabItem value="serverless" label="Serverless Inference API">
By default, LiteLLM will assume a Hugging Face call follows the [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api), which is fully compatible with the OpenAI Chat Completion API.
<Tabs>
<TabItem value="sdk" label="SDK">
### Basic Completion
Here's an example of chat completion using the DeepSeek-R1 model through Together AI:
```python
import os
from litellm import completion
# [OPTIONAL] set env var
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
os.environ["HF_TOKEN"] = "hf_xxxxxx"
messages = [{ "content": "There's a llama in my garden 😱 What should I do?","role": "user"}]
# e.g. Call 'https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct' from Serverless Inference API
response = completion(
model="huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct",
messages=[{ "content": "Hello, how are you?","role": "user"}],
model="huggingface/together/deepseek-ai/DeepSeek-R1",
messages=[
{
"role": "user",
"content": "How many r's are in the word 'strawberry'?",
}
],
)
print(response)
```
### Streaming
Now, let's see what a streaming request looks like.
```python
import os
from litellm import completion
os.environ["HF_TOKEN"] = "hf_xxxxxx"
response = completion(
model="huggingface/together/deepseek-ai/DeepSeek-R1",
messages=[
{
"role": "user",
"content": "How many r's are in the word `strawberry`?",
}
],
stream=True,
)
for chunk in response:
print(chunk)
```
### Image Input
You can also pass images when the model supports it. Here is an example using the [Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) model through Sambanova.
```python
from litellm import completion
# Set your Hugging Face Token
os.environ["HF_TOKEN"] = "hf_xxxxxx"
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
}
},
],
}
]
response = completion(
model="huggingface/sambanova/meta-llama/Llama-3.2-11B-Vision-Instruct",
messages=messages,
)
print(response.choices[0])
```
### Function Calling
You can extend the model's capabilities by giving it access to tools. Here is an example of function calling using the [Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) model through Sambanova.
```python
import os
from litellm import completion
# Set your Hugging Face Token
os.environ["HF_TOKEN"] = "hf_xxxxxx"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
}
}
]
messages = [
{
"role": "user",
"content": "What's the weather like in Boston today?",
}
]
response = completion(
model="huggingface/sambanova/meta-llama/Llama-3.3-70B-Instruct",
messages=messages,
tools=tools,
tool_choice="auto"
)
print(response)
```
</TabItem>
<TabItem value="endpoints" label="Inference Endpoints">
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/LiteLLM_HuggingFace.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
### Basic Completion
After you have [deployed your Hugging Face Inference Endpoint](https://endpoints.huggingface.co/new) on dedicated infrastructure, you can run inference on it by providing the endpoint base URL in `api_base`, and indicating `huggingface/tgi` as the model name.
```python
import os
from litellm import completion
os.environ["HF_TOKEN"] = "hf_xxxxxx"
response = completion(
model="huggingface/tgi",
messages=[{"content": "Hello, how are you?", "role": "user"}],
api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/"
)
print(response)
```
### Streaming
```python
import os
from litellm import completion
os.environ["HF_TOKEN"] = "hf_xxxxxx"
response = completion(
model="huggingface/tgi",
messages=[{"content": "Hello, how are you?", "role": "user"}],
api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/",
stream=True
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add models to your config.yaml
```yaml
model_list:
- model_name: llama-3.1-8B-instruct
litellm_params:
model: huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct
api_key: os.environ/HUGGINGFACE_API_KEY
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml --debug
```
3. Test it!
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "llama-3.1-8B-instruct",
"messages": [
{
"role": "user",
"content": "I like you!"
}
],
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="classification" label="Text Classification">
Append `text-classification` to the model name
e.g. `huggingface/text-classification/<model-name>`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
# [OPTIONAL] set env var
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
messages = [{ "content": "I like you, I love you!","role": "user"}]
# e.g. Call 'shahrukhx01/question-vs-statement-classifier' hosted on HF Inference endpoints
response = completion(
model="huggingface/text-classification/shahrukhx01/question-vs-statement-classifier",
messages=messages,
api_base="https://my-endpoint.endpoints.huggingface.cloud",
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add models to your config.yaml
```yaml
model_list:
- model_name: bert-classifier
litellm_params:
model: huggingface/text-classification/shahrukhx01/question-vs-statement-classifier
api_key: os.environ/HUGGINGFACE_API_KEY
api_base: "https://my-endpoint.endpoints.huggingface.cloud"
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml --debug
```
3. Test it!
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "bert-classifier",
"messages": [
{
"role": "user",
"content": "I like you!"
}
],
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="dedicated" label="Dedicated Inference Endpoints">
Steps to use
* Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/
* Set `api_base` to your deployed api base
* Add the `huggingface/` prefix to your model so litellm knows it's a huggingface Deployed Inference Endpoint
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
os.environ["HUGGINGFACE_API_KEY"] = ""
# TGI model: Call https://huggingface.co/glaiveai/glaive-coder-7b
# add the 'huggingface/' prefix to the model to set huggingface as the provider
# set api base to your deployed api endpoint from hugging face
response = completion(
model="huggingface/glaiveai/glaive-coder-7b",
messages=[{ "content": "Hello, how are you?","role": "user"}],
api_base="https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud"
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add models to your config.yaml
```yaml
model_list:
- model_name: glaive-coder
litellm_params:
model: huggingface/glaiveai/glaive-coder-7b
api_key: os.environ/HUGGINGFACE_API_KEY
api_base: "https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud"
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml --debug
```
3. Test it!
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "glaive-coder",
"messages": [
{
"role": "user",
"content": "I like you!"
}
],
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Streaming
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/LiteLLM_HuggingFace.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
You need to tell LiteLLM when you're calling Huggingface.
This is done by adding the "huggingface/" prefix to `model`, example `completion(model="huggingface/<model_name>",...)`.
```python
import os
from litellm import completion
# [OPTIONAL] set env var
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
messages = [{ "content": "There's a llama in my garden 😱 What should I do?","role": "user"}]
# e.g. Call 'facebook/blenderbot-400M-distill' hosted on HF Inference endpoints
response = completion(
model="huggingface/facebook/blenderbot-400M-distill",
messages=messages,
api_base="https://my-endpoint.huggingface.cloud",
stream=True
)
print(response)
for chunk in response:
    print(chunk)
```
### Image Input
```python
import os
from litellm import completion
os.environ["HF_TOKEN"] = "hf_xxxxxx"
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
}
},
],
}
]
response = completion(
model="huggingface/tgi",
messages=messages,
    api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/"
)
print(response.choices[0])
```
### Function Calling
```python
import os
from litellm import completion
os.environ["HF_TOKEN"] = "hf_xxxxxx"
functions = [{
"name": "get_weather",
"description": "Get the weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The location to get weather for"
}
},
"required": ["location"]
}
}]
response = completion(
model="huggingface/tgi",
messages=[{"content": "What's the weather like in San Francisco?", "role": "user"}],
api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/",
functions=functions
)
print(response)
```
</TabItem>
</Tabs>
## LiteLLM Proxy Server with Hugging Face models
You can set up a [LiteLLM Proxy Server](https://docs.litellm.ai/#litellm-proxy-server-llm-gateway) to serve Hugging Face models through any of the supported Inference Providers. Here's how to do it:
### Step 1. Setup the config file
In this case, we are configuring a proxy to serve `DeepSeek R1` from Hugging Face, using Together AI as the backend Inference Provider.
```yaml
model_list:
- model_name: my-r1-model
litellm_params:
model: huggingface/together/deepseek-ai/DeepSeek-R1
api_key: os.environ/HF_TOKEN # ensure you have `HF_TOKEN` in your .env
```
### Step 2. Start the server
```bash
litellm --config /path/to/config.yaml
```
### Step 3. Make a request to the server
<Tabs>
<TabItem value="curl" label="curl">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "my-r1-model",
"messages": [
{
"role": "user",
"content": "Hello, how are you?"
}
]
}'
```
</TabItem>
<TabItem value="python" label="python">
```python
# pip install openai
from openai import OpenAI
client = OpenAI(
base_url="http://0.0.0.0:4000",
api_key="anything",
)
response = client.chat.completions.create(
model="my-r1-model",
messages=[
{"role": "user", "content": "Hello, how are you?"}
]
)
print(response)
```
</TabItem>
</Tabs>
## Embedding
LiteLLM supports Hugging Face's [text-embedding-inference](https://github.com/huggingface/text-embeddings-inference) format and models.
```python
from litellm import embedding
import os
os.environ['HF_TOKEN'] = "hf_xxxxxx"
response = embedding(
model='huggingface/microsoft/codebert-base',
input=["good morning from litellm"]
)
```
## Advanced
### Setting API KEYS + API BASE
If required, you can set the api key + api base, set it in your os environment. [Code for how it's sent](https://github.com/BerriAI/litellm/blob/0100ab2382a0e720c7978fbf662cc6e6920e7e03/litellm/llms/huggingface_restapi.py#L25)
```python
import os
os.environ["HUGGINGFACE_API_KEY"] = ""
os.environ["HUGGINGFACE_API_BASE"] = ""
```
### Viewing Log probs
#### Using `decoder_input_details` - OpenAI `echo`
The `echo` param is supported by OpenAI Completions - Use `litellm.text_completion()` for this
```python
from litellm import text_completion
response = text_completion(
model="huggingface/bigcode/starcoder",
prompt="good morning",
max_tokens=10, logprobs=10,
echo=True
)
```
#### Output
```json
{
"id": "chatcmpl-3fc71792-c442-4ba1-a611-19dd0ac371ad",
"object": "text_completion",
"created": 1698801125.936519,
"model": "bigcode/starcoder",
"choices": [
{
"text": ", I'm going to make you a sand",
"index": 0,
"logprobs": {
"tokens": [
"good",
" morning",
",",
" I",
"'m",
" going",
" to",
" make",
" you",
" a",
" s",
"and"
],
"token_logprobs": [
"None",
-14.96875,
-2.2285156,
-2.734375,
-2.0957031,
-2.0917969,
-0.09429932,
-3.1132812,
-1.3203125,
-1.2304688,
-1.6201172,
-0.010292053
]
},
"finish_reason": "length"
}
],
"usage": {
"completion_tokens": 9,
"prompt_tokens": 2,
"total_tokens": 11
}
}
```
### Models with Prompt Formatting
For models with special prompt templates (e.g. Llama2), we format the prompt to fit their template.
#### Models with natively Supported Prompt Templates
| Model Name | Works for Models | Function Call | Required OS Variables |
| ------------------------------------ | ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | ----------------------------------- |
| mistralai/Mistral-7B-Instruct-v0.1 | mistralai/Mistral-7B-Instruct-v0.1 | `completion(model='huggingface/mistralai/Mistral-7B-Instruct-v0.1', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
| meta-llama/Llama-2-7b-chat | All meta-llama llama2 chat models | `completion(model='huggingface/meta-llama/Llama-2-7b', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
| tiiuae/falcon-7b-instruct | All falcon instruct models | `completion(model='huggingface/tiiuae/falcon-7b-instruct', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
| mosaicml/mpt-7b-chat | All mpt chat models | `completion(model='huggingface/mosaicml/mpt-7b-chat', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
| codellama/CodeLlama-34b-Instruct-hf | All codellama instruct models | `completion(model='huggingface/codellama/CodeLlama-34b-Instruct-hf', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
| WizardLM/WizardCoder-Python-34B-V1.0 | All wizardcoder models | `completion(model='huggingface/WizardLM/WizardCoder-Python-34B-V1.0', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
| Phind/Phind-CodeLlama-34B-v2 | All phind-codellama models | `completion(model='huggingface/Phind/Phind-CodeLlama-34B-v2', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
**What if we don't support a model you need?**
You can also specify your own custom prompt formatting, in case we don't have your model covered yet.
**Does this mean you have to specify a prompt for all models?**
No. By default we'll concatenate your message content to make a prompt.
**Default Prompt Template**
```python
def default_pt(messages):
return " ".join(message["content"] for message in messages)
```
[Code for how prompt formats work in LiteLLM](https://github.com/BerriAI/litellm/blob/main/litellm/llms/prompt_templates/factory.py)
#### Custom prompt templates
```python
import litellm
from litellm import completion
messages = [{"role": "user", "content": "Hello, how are you?"}]
# Create your own custom prompt template
litellm.register_prompt_template(
model="togethercomputer/LLaMA-2-7B-32K",
roles={
"system": {
"pre_message": "[INST] <<SYS>>\n",
"post_message": "\n<</SYS>>\n [/INST]\n"
},
"user": {
"pre_message": "[INST] ",
"post_message": " [/INST]\n"
},
"assistant": {
"post_message": "\n"
}
}
)
def test_huggingface_custom_model():
model = "huggingface/togethercomputer/LLaMA-2-7B-32K"
response = completion(model=model, messages=messages, api_base="https://ecd4sb5n09bo4ei2.us-east-1.aws.endpoints.huggingface.cloud")
print(response['choices'][0]['message']['content'])
return response
test_huggingface_custom_model()
```
[Implementation Code](https://github.com/BerriAI/litellm/blob/c0b3da2c14c791a0b755f0b1e5a9ef065951ecbf/litellm/llms/huggingface_restapi.py#L52)
### Deploying a model on huggingface
You can use any chat/text model from Hugging Face with the following steps:
- Copy your model id/url from Huggingface Inference Endpoints
- [ ] Go to https://ui.endpoints.huggingface.co/
- [ ] Copy the url of the specific model you'd like to use
<Image img={require('../../img/hf_inference_endpoint.png')} alt="HF_Dashboard" style={{ maxWidth: '50%', height: 'auto' }}/>
- Set it as your model name
- Set your HUGGINGFACE_API_KEY as an environment variable
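For example, a minimal sketch of the resulting call once your endpoint is live, mirroring the dedicated-endpoint example above (the model id and endpoint URL below are placeholders for your own deployment):
```python
import os
from litellm import completion

# your Hugging Face token, used to authenticate against the endpoint
os.environ["HUGGINGFACE_API_KEY"] = "hf_xxxxxx"

response = completion(
    model="huggingface/glaiveai/glaive-coder-7b",  # 👈 the model id you copied, with the 'huggingface/' prefix
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    api_base="https://my-endpoint.us-east-1.aws.endpoints.huggingface.cloud",  # 👈 your deployed endpoint url
)
print(response)
```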
Need help deploying a model on huggingface? [Check out this guide.](https://huggingface.co/docs/inference-endpoints/guides/create_endpoint)
# output
Same as the OpenAI format, but also includes logprobs. [See the code](https://github.com/BerriAI/litellm/blob/b4b2dbf005142e0a483d46a07a88a19814899403/litellm/llms/huggingface_restapi.py#L115)
```json
{
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "\ud83d\ude31\n\nComment: @SarahSzabo I'm",
"role": "assistant",
"logprobs": -22.697942825499993
}
}
],
"created": 1693436637.38206,
"model": "https://ji16r2iys9a8rjk2.us-east-1.aws.endpoints.huggingface.cloud",
"usage": {
"prompt_tokens": 14,
"completion_tokens": 11,
"total_tokens": 25
}
}
```
# FAQ
**Does this support stop sequences?**
Yes, we support stop sequences - and you can pass as many as allowed by Hugging Face (or any provider!)
**How do you deal with repetition penalty?**
We map the presence penalty parameter in openai to the repetition penalty parameter on Hugging Face. [See code](https://github.com/BerriAI/litellm/blob/b4b2dbf005142e0a483d46a07a88a19814899403/litellm/utils.py#L757).
**How does billing work with Hugging Face Inference Providers?**
> Billing is centralized on your Hugging Face account, no matter which providers you are using. You are billed the standard provider API rates with no additional markup - Hugging Face simply passes through the provider costs. Note that [Hugging Face PRO](https://huggingface.co/subscribe/pro) users get $2 worth of Inference credits every month that can be used across providers.
**Do I need to create an account for each Inference Provider?**
> No, you don't need to create separate accounts. All requests are routed through Hugging Face, so you only need your HF token. This allows you to easily benchmark different providers and choose the one that best fits your needs.
**Will more inference providers be supported by Hugging Face in the future?**
> Yes! New inference providers (and models) are being added gradually.
We welcome any suggestions for improving our Hugging Face integration - Create an [issue](https://github.com/BerriAI/litellm/issues/new/choose)/[Join the Discord](https://discord.com/invite/wuPM9dRgDw)!
View file
@ -57,7 +57,7 @@ messages = [{ "content": "Hello, how are you?","role": "user"}]
# litellm proxy call
response = completion(
model="litellm_proxy/your-model-name",
messages,
messages=messages,
api_base = "your-litellm-proxy-url",
api_key = "your-litellm-proxy-api-key"
)
@ -76,7 +76,7 @@ messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = completion(
model="litellm_proxy/your-model-name",
messages,
messages=messages,
api_base = "your-litellm-proxy-url",
stream=True
)
View file
@ -202,6 +202,67 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
</TabItem>
</Tabs>
## Using Ollama FIM on `/v1/completions`
LiteLLM supports calling Ollama's `/api/generate` endpoint on `/v1/completions` requests.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm._turn_on_debug() # turn on debug to see the request
from litellm import completion
response = completion(
model="ollama/llama3.1",
prompt="Hello, world!",
api_base="http://localhost:11434"
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: "llama3.1"
litellm_params:
model: "ollama/llama3.1"
api_base: "http://localhost:11434"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml --detailed_debug
# RUNNING ON http://0.0.0.0:4000
```
3. Test it!
```python
from openai import OpenAI
client = OpenAI(
api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set)
base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL
)
response = client.completions.create(
model="ollama/llama3.1",
prompt="Hello, world!",
api_base="http://localhost:11434"
)
print(response)
```
</TabItem>
</Tabs>
## Using ollama `api/chat`
In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat`
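For example, a minimal sketch of an `ollama_chat` call (assuming a local Ollama server with `llama3.1` pulled):
```python
from litellm import completion

# the 'ollama_chat/' prefix routes the request to Ollama's /api/chat endpoint
response = completion(
    model="ollama_chat/llama3.1",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    api_base="http://localhost:11434",  # default Ollama server address
)
print(response)
```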
View file
@ -228,6 +228,92 @@ response = completion(
```
## PDF File Parsing
OpenAI has a new `file` message type that allows you to pass in a PDF file and have it parsed into a structured output. [Read more](https://platform.openai.com/docs/guides/pdf-files?api-mode=chat&lang=python)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import base64
from litellm import completion
with open("draconomicon.pdf", "rb") as f:
data = f.read()
base64_string = base64.b64encode(data).decode("utf-8")
response = completion(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
},
{
"type": "text",
"text": "What is the first dragon in the book?",
}
],
},
],
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: openai-model
litellm_params:
model: gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "openai-model",
"messages": [
{"role": "user", "content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
}
]}
]
}'
```
</TabItem>
</Tabs>
## OpenAI Fine Tuned Models
| Model Name | Function Call |
@ -239,6 +325,74 @@ response = completion(
| fine tuned `gpt-3.5-turbo-0613` | `response = completion(model="ft:gpt-3.5-turbo-0613", messages=messages)` |
## OpenAI Audio Transcription
LiteLLM supports OpenAI Audio Transcription endpoint.
Supported models:
| Model Name | Function Call |
|---------------------------|-----------------------------------------------------------------|
| `whisper-1` | `response = completion(model="whisper-1", file=audio_file)` |
| `gpt-4o-transcribe` | `response = completion(model="gpt-4o-transcribe", file=audio_file)` |
| `gpt-4o-mini-transcribe` | `response = completion(model="gpt-4o-mini-transcribe", file=audio_file)` |
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import transcription
import os
# set api keys
os.environ["OPENAI_API_KEY"] = ""
audio_file = open("/path/to/audio.mp3", "rb")
response = transcription(model="gpt-4o-transcribe", file=audio_file)
print(f"response: {response}")
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o-transcribe
litellm_params:
model: gpt-4o-transcribe
api_key: os.environ/OPENAI_API_KEY
model_info:
mode: audio_transcription
general_settings:
master_key: sk-1234
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:8000/v1/audio/transcriptions' \
--header 'Authorization: Bearer sk-1234' \
--form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \
--form 'model="gpt-4o-transcribe"'
```
</TabItem>
</Tabs>
## Advanced
### Getting OpenAI API Response Headers
@ -449,26 +603,6 @@ response = litellm.acompletion(
)
```
### Using Helicone Proxy with LiteLLM
```python
import os
import litellm
from litellm import completion
os.environ["OPENAI_API_KEY"] = ""
# os.environ["OPENAI_API_BASE"] = ""
litellm.api_base = "https://oai.hconeai.com/v1"
litellm.headers = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",
"Helicone-Cache-Enabled": "true",
}
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = completion("gpt-3.5-turbo", messages)
```
### Using OpenAI Proxy with LiteLLM
```python
View file
@ -10,9 +10,11 @@ LiteLLM supports all the text / chat / vision models from [OpenRouter](https://o
import os
from litellm import completion
os.environ["OPENROUTER_API_KEY"] = ""
os.environ["OPENROUTER_API_BASE"] = "" # [OPTIONAL] defaults to https://openrouter.ai/api/v1
os.environ["OR_SITE_URL"] = "" # optional
os.environ["OR_APP_NAME"] = "" # optional
os.environ["OR_SITE_URL"] = "" # [OPTIONAL]
os.environ["OR_APP_NAME"] = "" # [OPTIONAL]
response = completion(
model="openrouter/google/palm-2-chat-bison",
View file
@ -17,7 +17,7 @@ import os
os.environ['PERPLEXITYAI_API_KEY'] = ""
response = completion(
model="perplexity/mistral-7b-instruct",
model="perplexity/sonar-pro",
messages=messages
)
print(response)
@ -30,7 +30,7 @@ import os
os.environ['PERPLEXITYAI_API_KEY'] = ""
response = completion(
model="perplexity/mistral-7b-instruct",
model="perplexity/sonar-pro",
messages=messages,
stream=True
)
@ -45,19 +45,12 @@ All models listed here https://docs.perplexity.ai/docs/model-cards are supported
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| pplx-7b-chat | `completion(model="perplexity/pplx-7b-chat", messages)` |
| pplx-70b-chat | `completion(model="perplexity/pplx-70b-chat", messages)` |
| pplx-7b-online | `completion(model="perplexity/pplx-7b-online", messages)` |
| pplx-70b-online | `completion(model="perplexity/pplx-70b-online", messages)` |
| codellama-34b-instruct | `completion(model="perplexity/codellama-34b-instruct", messages)` |
| llama-2-13b-chat | `completion(model="perplexity/llama-2-13b-chat", messages)` |
| llama-2-70b-chat | `completion(model="perplexity/llama-2-70b-chat", messages)` |
| mistral-7b-instruct | `completion(model="perplexity/mistral-7b-instruct", messages)` |
| openhermes-2-mistral-7b | `completion(model="perplexity/openhermes-2-mistral-7b", messages)` |
| openhermes-2.5-mistral-7b | `completion(model="perplexity/openhermes-2.5-mistral-7b", messages)` |
| pplx-7b-chat-alpha | `completion(model="perplexity/pplx-7b-chat-alpha", messages)` |
| pplx-70b-chat-alpha | `completion(model="perplexity/pplx-70b-chat-alpha", messages)` |
| sonar-deep-research | `completion(model="perplexity/sonar-deep-research", messages)` |
| sonar-reasoning-pro | `completion(model="perplexity/sonar-reasoning-pro", messages)` |
| sonar-reasoning | `completion(model="perplexity/sonar-reasoning", messages)` |
| sonar-pro | `completion(model="perplexity/sonar-pro", messages)` |
| sonar | `completion(model="perplexity/sonar", messages)` |
| r1-1776 | `completion(model="perplexity/r1-1776", messages)` |
View file
@ -230,7 +230,7 @@ response = completion(
model="predibase/llama-3-8b-instruct",
messages = [{ "content": "Hello, how are you?","role": "user"}],
adapter_id="my_repo/3",
adapter_soruce="pbase",
adapter_source="pbase",
)
```
View file
@ -0,0 +1,90 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Snowflake
| Property | Details |
|-------|-------|
| Description | The Snowflake Cortex LLM REST API lets you access the COMPLETE function via HTTP POST requests|
| Provider Route on LiteLLM | `snowflake/` |
| Link to Provider Doc | [Snowflake ↗](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-llm-rest-api) |
| Base URL | [https://{account-id}.snowflakecomputing.com/api/v2/cortex/inference:complete/](https://{account-id}.snowflakecomputing.com/api/v2/cortex/inference:complete) |
| Supported OpenAI Endpoints | `/chat/completions`, `/completions` |
Currently, Snowflake's REST API does not have an endpoint for `snowflake-arctic-embed` embedding models. If you want to use these embedding models with LiteLLM, you can call them through our Hugging Face provider (see the sketch below).
Find the Arctic Embed models [here](https://huggingface.co/collections/Snowflake/arctic-embed-661fd57d50fab5fc314e4c18) on Hugging Face.
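A rough sketch of what that could look like via the Hugging Face provider (the model id `Snowflake/snowflake-arctic-embed-m` is illustrative - pick whichever Arctic Embed variant you need from the collection above):
```python
import os
from litellm import embedding

os.environ["HF_TOKEN"] = "hf_xxxxxx"  # Hugging Face token

# call the Arctic Embed model through the huggingface/ provider route
response = embedding(
    model="huggingface/Snowflake/snowflake-arctic-embed-m",
    input=["good morning from litellm"],
)
print(response)
```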
## Supported OpenAI Parameters
```
"temperature",
"max_tokens",
"top_p",
"response_format"
```
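These map directly to the standard OpenAI-style arguments on `completion`. A short sketch of passing them (parameter values are illustrative):
```python
import os
from litellm import completion

os.environ["SNOWFLAKE_JWT"] = "YOUR JWT"
os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER"

# pass the supported OpenAI params straight through to Cortex
response = completion(
    model="snowflake/mistral-7b",
    messages=[{"role": "user", "content": "Write a haiku about snow."}],
    temperature=0.2,
    max_tokens=128,
    top_p=0.9,
)
print(response)
```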
## API KEYS
Snowflake does not have API keys. Instead, you access the Snowflake API with your JWT token and account identifier.
```python
import os
os.environ["SNOWFLAKE_JWT"] = "YOUR JWT"
os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER"
```
## Usage
```python
import os
from litellm import completion
## set ENV variables
os.environ["SNOWFLAKE_JWT"] = "YOUR JWT"
os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER"
# Snowflake call
response = completion(
model="snowflake/mistral-7b",
messages = [{ "content": "Hello, how are you?","role": "user"}]
)
```
## Usage with LiteLLM Proxy
#### 1. Required env variables
```bash
export SNOWFLAKE_JWT=""
export SNOWFLAKE_ACCOUNT_ID=""
```
#### 2. Start the proxy
```yaml
model_list:
- model_name: mistral-7b
litellm_params:
model: snowflake/mistral-7b
api_key: YOUR_API_KEY
api_base: https://YOUR-ACCOUNT-ID.snowflakecomputing.com/api/v2/cortex/inference:complete
```
```bash
litellm --config /path/to/config.yaml
```
#### 3. Test it
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "snowflake/mistral-7b",
"messages": [
{
"role": "user",
"content": "Hello, how are you?"
}
]
}
'
```
View file
@ -398,20 +398,24 @@ curl http://localhost:4000/v1/chat/completions \
</TabItem>
</Tabs>
You can also use the `enterpriseWebSearch` tool for an [enterprise compliant search](https://cloud.google.com/vertex-ai/generative-ai/docs/grounding/web-grounding-enterprise).
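A rough sketch of what that could look like, assuming the tool is passed in the same dict format as the other Vertex grounding tools (the model name here is illustrative):
```python
from litellm import completion

# assumption: enterpriseWebSearch is passed like the other Vertex grounding tools
tools = [{"enterpriseWebSearch": {}}]

response = completion(
    model="vertex_ai/gemini-2.0-flash",
    messages=[{"role": "user", "content": "Who won the World Cup in 2022?"}],
    tools=tools,
)
print(response)
```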
#### **Moving from Vertex AI SDK to LiteLLM (GROUNDING)**
If this was your initial VertexAI Grounding code,
```python
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig, Tool, grounding
vertexai.init(project=project_id, location="us-central1")
model = GenerativeModel("gemini-1.5-flash-001")
# Use Google Search for grounding
tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval(disable_attributon=False))
tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval())
prompt = "When is the next total solar eclipse in US?"
response = model.generate_content(
@ -428,7 +432,7 @@ print(response)
then, this is what it looks like now
```python
from litellm import completion
# !gcloud auth application-default login - run this to add vertex credentials to your env
@ -1367,6 +1371,103 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</Tabs>
## Gemini Pro
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
## Fine-tuned Models
You can call fine-tuned Vertex AI Gemini models through LiteLLM
| Property | Details |
|----------|---------|
| Provider Route | `vertex_ai/gemini/{MODEL_ID}` |
| Vertex Documentation | [Vertex AI - Fine-tuned Gemini Models](https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-use-supervised-tuning#test_the_tuned_model_with_a_prompt)|
| Supported Operations | `/chat/completions`, `/completions`, `/embeddings`, `/images` |
To use a model that follows the `/gemini` request/response format, simply set the model parameter as
```python title="Model parameter for calling fine-tuned gemini models"
model="vertex_ai/gemini/<your-finetuned-model>"
```
<Tabs>
<TabItem value="sdk" label="LiteLLM Python SDK">
```python showLineNumbers title="Example"
import litellm
import os
## set ENV variables
os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
response = litellm.completion(
model="vertex_ai/gemini/<your-finetuned-model>", # e.g. vertex_ai/gemini/4965075652664360960
messages=[{ "content": "Hello, how are you?","role": "user"}],
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
1. Add Vertex Credentials to your env
```bash title="Authenticate to Vertex AI"
!gcloud auth application-default login
```
2. Setup config.yaml
```yaml showLineNumbers title="Add to litellm config"
- model_name: finetuned-gemini
litellm_params:
model: vertex_ai/gemini/<ENDPOINT_ID>
vertex_project: <PROJECT_ID>
vertex_location: <LOCATION>
```
3. Test it!
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python showLineNumbers title="Example request"
from openai import OpenAI
client = OpenAI(
api_key="your-litellm-key",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="finetuned-gemini",
messages=[
{"role": "user", "content": "hi"}
]
)
print(response)
```
</TabItem>
<TabItem value="curl" label="curl">
```bash showLineNumbers title="Example request"
curl --location 'https://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: <LITELLM_KEY>' \
--data '{"model": "finetuned-gemini" ,"messages":[{"role": "user", "content":[{"type": "text", "text": "hi"}]}]}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Model Garden
:::tip
@ -1477,67 +1578,6 @@ response = completion(
</Tabs>
## Gemini Pro
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
## Fine-tuned Models
Fine tuned models on vertex have a numerical model/endpoint id.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
## set ENV variables
os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
response = completion(
model="vertex_ai/<your-finetuned-model>", # e.g. vertex_ai/4965075652664360960
messages=[{ "content": "Hello, how are you?","role": "user"}],
base_model="vertex_ai/gemini-1.5-pro" # the base model - used for routing
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add Vertex Credentials to your env
```bash
!gcloud auth application-default login
```
2. Setup config.yaml
```yaml
- model_name: finetuned-gemini
litellm_params:
model: vertex_ai/<ENDPOINT_ID>
vertex_project: <PROJECT_ID>
vertex_location: <LOCATION>
model_info:
base_model: vertex_ai/gemini-1.5-pro # IMPORTANT
```
3. Test it!
```bash
curl --location 'https://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: <LITELLM_KEY>' \
--data '{"model": "finetuned-gemini" ,"messages":[{"role": "user", "content":[{"type": "text", "text": "hi"}]}]}'
```
</TabItem>
</Tabs>
## Gemini Pro Vision
| Model Name | Function Call |
@ -1682,15 +1722,25 @@ assert isinstance(
```
## Usage - PDF / Videos / etc. Files
## Usage - PDF / Videos / Audio etc. Files
Pass any file supported by Vertex AI, through LiteLLM.
LiteLLM supports the following file types, passed either by URL/URI or as base64 encoded data.
Using `file` message type for VertexAI is live from v1.65.1+
```
Files with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
Files with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4
Base64 Encoded Local Files
```
<Tabs>
<TabItem value="sdk" label="SDK">
### **Using `gs://`**
### **Using `gs://` or any URL**
```python
from litellm import completion
@ -1702,8 +1752,11 @@ response = completion(
"content": [
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
{
"type": "image_url",
"image_url": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf", # 👈 PDF
"type": "file",
"file": {
"file_id": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf",
"format": "application/pdf" # OPTIONAL - specify mime-type
}
},
],
}
@ -1737,8 +1790,16 @@ response = completion(
"content": [
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
{
"type": "image_url",
"image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
"type": "file",
"file": {
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
}
},
{
"type": "audio_input",
"audio_input {
"audio_input": f"data:audio/mp3;base64,{encoded_file}", # 👈 AUDIO File ('file' message works as too)
}
},
],
}
@ -1784,8 +1845,11 @@ curl http://0.0.0.0:4000/v1/chat/completions \
"text": "You are a very professional document summarization specialist. Please summarize the given document"
},
{
"type": "image_url",
"image_url": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf" # 👈 PDF
"type": "file",
"file": {
"file_id": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf",
"format": "application/pdf" # OPTIONAL
}
}
}
]
@ -1812,11 +1876,18 @@ curl http://0.0.0.0:4000/v1/chat/completions \
"text": "You are a very professional document summarization specialist. Please summarize the given document"
},
{
"type": "image_url",
"image_url": "data:application/pdf;base64,{encoded_file}" # 👈 PDF
}
}
]
"type": "file",
"file": {
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
},
},
{
"type": "audio_input",
"audio_input {
"audio_input": f"data:audio/mp3;base64,{encoded_file}", # 👈 AUDIO File ('file' message works as too)
}
},
]
}
],
"max_tokens": 300
@ -1826,6 +1897,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
</TabItem>
</Tabs>
## Chat Models
| Model Name | Function Call |
|------------------|--------------------------------------|
@ -2034,7 +2106,12 @@ print(response)
## **Multi-Modal Embeddings**
Usage
Known Limitations:
- Only supports 1 image / video per request
- Only supports GCS or base64 encoded images / videos
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
@ -2250,6 +2327,115 @@ print(f"Text Embedding: {embeddings.text_embedding}")
</Tabs>
### Text + Image + Video Embeddings
<Tabs>
<TabItem value="sdk" label="SDK">
Text + Image
```python
response = await litellm.aembedding(
model="vertex_ai/multimodalembedding@001",
input=["hey", "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"] # will be sent as a gcs image
)
```
Text + Video
```python
response = await litellm.aembedding(
model="vertex_ai/multimodalembedding@001",
input=["hey", "gs://my-bucket/embeddings/supermarket-video.mp4"] # will be sent as a gcs image
)
```
Image + Video
```python
response = await litellm.aembedding(
model="vertex_ai/multimodalembedding@001",
input=["gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png", "gs://my-bucket/embeddings/supermarket-video.mp4"] # will be sent as a gcs image
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM PROXY (Unified Endpoint)">
1. Add model to config.yaml
```yaml
model_list:
- model_name: multimodalembedding@001
litellm_params:
model: vertex_ai/multimodalembedding@001
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: adroit-crow-413218-a956eef1a2a8.json
litellm_settings:
drop_params: True
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make a request using the OpenAI Python SDK or Langchain Python SDK
Text + Image
```python
import openai
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
# # request sent to model set on litellm proxy, `litellm --model`
response = client.embeddings.create(
model="multimodalembedding@001",
input = ["hey", "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"],
)
print(response)
```
Text + Video
```python
import openai
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
# # request sent to model set on litellm proxy, `litellm --model`
response = client.embeddings.create(
model="multimodalembedding@001",
input = ["hey", "gs://my-bucket/embeddings/supermarket-video.mp4"],
)
print(response)
```
Image + Video
```python
import openai
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
# # request sent to model set on litellm proxy, `litellm --model`
response = client.embeddings.create(
model="multimodalembedding@001",
input = ["gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png", "gs://my-bucket/embeddings/supermarket-video.mp4"],
)
print(response)
```
</TabItem>
</Tabs>
## **Image Generation Models**
Usage
View file
@ -82,7 +82,7 @@ from litellm import completion
os.environ["XAI_API_KEY"] = "your-api-key"
response = completion(
model="xai/grok-2-latest",
model="xai/grok-2-vision-latest",
messages=[
{
"role": "user",
View file
@ -10,17 +10,13 @@ Role-based access control (RBAC) is based on Organizations, Teams and Internal U
## Roles
**Admin Roles**
- `proxy_admin`: admin over the platform
- `proxy_admin_viewer`: can login, view all keys, view all spend. **Cannot** create keys/delete keys/add new users
**Organization Roles**
- `org_admin`: admin over the organization. Can create teams and users within their organization
**Internal User Roles**
- `internal_user`: can login, view/create/delete their own keys, view their spend. **Cannot** add new users.
- `internal_user_viewer`: can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users.
| Role Type | Role Name | Permissions |
|-----------|-----------|-------------|
| **Admin** | `proxy_admin` | Admin over the platform |
| | `proxy_admin_viewer` | Can login, view all keys, view all spend. **Cannot** create keys/delete keys/add new users |
| **Organization** | `org_admin` | Admin over the organization. Can create teams and users within their organization |
| **Internal User** | `internal_user` | Can login, view/create/delete their own keys, view their spend. **Cannot** add new users |
| | `internal_user_viewer` | Can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users |
## Onboarding Organizations
View file
@ -147,11 +147,16 @@ Some SSO providers require a specific redirect url for login and logout. You can
- Login: `<your-proxy-base-url>/sso/key/generate`
- Logout: `<your-proxy-base-url>`
Here's the env var to set the logout url on the proxy
```bash
PROXY_LOGOUT_URL="https://www.google.com"
```
#### Step 3. Set `PROXY_BASE_URL` in your .env
Set this in your .env (so the proxy can set the correct redirect url)
```shell
PROXY_BASE_URL=https://litellm-api.up.railway.app/
PROXY_BASE_URL=https://litellm-api.up.railway.app
```
#### Step 4. Test flow

View file
response: str,
):
pass
    async def async_post_call_streaming_iterator_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_data: dict,
) -> AsyncGenerator[ModelResponseStream, None]:
"""
Passes the entire stream to the guardrail
This is useful for plugins that need to see the entire stream.
"""
async for item in response:
yield item
proxy_handler_instance = MyCustomHandler()
```
View file
@ -147,6 +147,7 @@ general_settings:
|------|------|-------------|
| completion_model | string | The default model to use for completions when `model` is not specified in the request |
| disable_spend_logs | boolean | If true, turns off writing each transaction to the database |
| disable_spend_updates | boolean | If true, turns off all spend updates to the DB. Including key/user/team spend updates. |
| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) |
| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached |
| disable_reset_budget | boolean | If true, turns off reset budget scheduled task |
@ -159,7 +160,7 @@ general_settings:
| database_url | string | The URL for the database connection [Set up Virtual Keys](virtual_keys) |
| database_connection_pool_limit | integer | The limit for database connection pool [Setting DB Connection Pool limit](#configure-db-pool-limits--connection-timeouts) |
| database_connection_timeout | integer | The timeout for database connections in seconds [Setting DB Connection Pool limit, timeout](#configure-db-pool-limits--connection-timeouts) |
| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key |
| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key [Doc on graceful db unavailability](prod#5-if-running-litellm-on-vpc-gracefully-handle-db-unavailability) |
| custom_auth | string | Write your own custom authentication logic [Doc Custom Auth](virtual_keys#custom-auth) |
| max_parallel_requests | integer | The max parallel requests allowed per deployment |
| global_max_parallel_requests | integer | The max parallel requests allowed on the proxy overall |
@ -177,7 +178,7 @@ general_settings:
| use_x_forwarded_for | str | If true, uses the X-Forwarded-For header to get the client IP address |
| service_account_settings | List[Dict[str, Any]] | Set `service_account_settings` if you want to create settings that only apply to service account keys (Doc on service accounts)[./service_accounts.md] |
| image_generation_model | str | The default model to use for image generation - ignores model set in request |
| store_model_in_db | boolean | If true, allows `/model/new` endpoint to store model information in db. Endpoint disabled by default. [Doc on `/model/new` endpoint](./model_management.md#create-a-new-model) |
| store_model_in_db | boolean | If true, enables storing model + credential information in the DB. |
| store_prompts_in_spend_logs | boolean | If true, allows prompts and responses to be stored in the spend logs table. |
| max_request_size_mb | int | The maximum size for requests in MB. Requests above this size will be rejected. |
| max_response_size_mb | int | The maximum size for responses in MB. LLM Responses above this size will not be sent. |
@ -405,6 +406,7 @@ router_settings:
| HELICONE_API_KEY | API key for Helicone service
| HOSTNAME | Hostname for the server, this will be [emitted to `datadog` logs](https://docs.litellm.ai/docs/proxy/logging#datadog)
| HUGGINGFACE_API_BASE | Base URL for Hugging Face API
| HUGGINGFACE_API_KEY | API key for Hugging Face API
| IAM_TOKEN_DB_AUTH | IAM token for database authentication
| JSON_LOGS | Enable JSON formatted logging
| JWT_AUDIENCE | Expected audience for JWT tokens
@ -447,6 +449,7 @@ router_settings:
| MICROSOFT_CLIENT_ID | Client ID for Microsoft services
| MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services
| MICROSOFT_TENANT | Tenant ID for Microsoft Azure
| MICROSOFT_SERVICE_PRINCIPAL_ID | Service Principal ID for Microsoft Enterprise Application. (This is an advanced feature if you want litellm to auto-assign members to Litellm Teams based on their Microsoft Entra ID Groups)
| NO_DOCS | Flag to disable documentation generation
| NO_PROXY | List of addresses to bypass proxy
| OAUTH_TOKEN_INFO_ENDPOINT | Endpoint for OAuth token info retrieval
@ -478,7 +481,7 @@ router_settings:
| PROXY_ADMIN_ID | Admin identifier for proxy server
| PROXY_BASE_URL | Base URL for proxy service
| PROXY_LOGOUT_URL | URL for logging out of the proxy service
| PROXY_MASTER_KEY | Master key for proxy authentication
| LITELLM_MASTER_KEY | Master key for proxy authentication
| QDRANT_API_BASE | Base URL for Qdrant API
| QDRANT_API_KEY | API key for Qdrant service
| QDRANT_URL | Connection URL for Qdrant database
@ -499,9 +502,11 @@ router_settings:
| SMTP_USERNAME | Username for SMTP authentication (do not set if SMTP does not require auth)
| SPEND_LOGS_URL | URL for retrieving spend logs
| SSL_CERTIFICATE | Path to the SSL certificate file
| SSL_SECURITY_LEVEL | [BETA] Security level for SSL/TLS connections. E.g. `DEFAULT@SECLEVEL=1`
| SSL_VERIFY | Flag to enable or disable SSL certificate verification
| SUPABASE_KEY | API key for Supabase service
| SUPABASE_URL | Base URL for Supabase instance
| STORE_MODEL_IN_DB | If true, enables storing model + credential information in the DB.
| TEST_EMAIL_ADDRESS | Email address used for testing purposes
| UI_LOGO_PATH | Path to the logo image used in the UI
| UI_PASSWORD | Password for accessing the UI
@ -512,5 +517,5 @@ router_settings:
| UPSTREAM_LANGFUSE_RELEASE | Release version identifier for upstream Langfuse
| UPSTREAM_LANGFUSE_SECRET_KEY | Secret key for upstream Langfuse authentication
| USE_AWS_KMS | Flag to enable AWS Key Management Service for encryption
| USE_PRISMA_MIGRATE | Flag to use prisma migrate instead of prisma db push. Recommended for production environments.
| WEBHOOK_URL | URL for receiving webhooks from external services
View file
@ -448,6 +448,34 @@ model_list:
s/o to [@David Manouchehri](https://www.linkedin.com/in/davidmanouchehri/) for helping with this.
### Centralized Credential Management
Define credentials once and reuse them across multiple models. This helps with:
- Secret rotation
- Reducing config duplication
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: azure/gpt-4o
litellm_credential_name: default_azure_credential # Reference credential below
credential_list:
- credential_name: default_azure_credential
credential_values:
api_key: os.environ/AZURE_API_KEY # Load from environment
api_base: os.environ/AZURE_API_BASE
api_version: "2023-05-15"
credential_info:
description: "Production credentials for EU region"
```
#### Key Parameters
- `credential_name`: Unique identifier for the credential set
- `credential_values`: Key-value pairs of credentials/secrets (supports `os.environ/` syntax)
- `credential_info`: Key-value pairs of user provided credentials information. No key-value pairs are required, but the dictionary must exist.
### Load API Keys from Secret Managers (Azure Vault, etc)
[**Using Secret Managers with LiteLLM Proxy**](../secret)
@ -641,4 +669,4 @@ docker run --name litellm-proxy \
ghcr.io/berriai/litellm-database:main-latest
```
</TabItem>
</Tabs>
</Tabs>
View file
@ -6,6 +6,8 @@ import Image from '@theme/IdealImage';
Track spend for keys, users, and teams across 100+ LLMs.
LiteLLM automatically tracks spend for all known models. See our [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
### How to Track Spend with LiteLLM
**Step 1**
@ -35,10 +37,10 @@ response = client.chat.completions.create(
"content": "this is a test request, write a short poem"
}
],
user="palantir",
extra_body={
user="palantir", # OPTIONAL: pass user to track spend by user
extra_body={
"metadata": {
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"]
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] # ENTERPRISE: pass tags to track spend by tags
}
}
)
@ -63,9 +65,9 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
"content": "what llm are you"
}
],
"user": "palantir",
"user": "palantir", # OPTIONAL: pass user to track spend by user
"metadata": {
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"]
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] # ENTERPRISE: pass tags to track spend by tags
}
}'
```
@ -90,7 +92,7 @@ chat = ChatOpenAI(
user="palantir",
extra_body={
"metadata": {
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"]
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] # ENTERPRISE: pass tags to track spend by tags
}
}
)
@ -150,8 +152,134 @@ Navigate to the Usage Tab on the LiteLLM UI (found on https://your-proxy-endpoin
</TabItem>
</Tabs>
## ✨ (Enterprise) API Endpoints to get Spend
### Getting Spend Reports - To Charge Other Teams, Customers, Users
### Allowing Non-Proxy Admins to access `/spend` endpoints
Use this when you want non-proxy admins to access `/spend` endpoints
:::info
Schedule a [meeting with us to get your Enterprise License](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
##### Create Key
Create Key with `permissions={"get_spend_routes": true}`
```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"permissions": {"get_spend_routes": true}
}'
```
##### Use generated key on `/spend` endpoints
Access spend Routes with newly generate keys
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \
-H 'Authorization: Bearer sk-H16BKvrSNConSsBYLGc_7A'
```
#### Reset Team, API Key Spend - MASTER KEY ONLY
Use `/global/spend/reset` if you want to:
- Reset the Spend for all API Keys, Teams. The `spend` for ALL Teams and Keys in `LiteLLM_TeamTable` and `LiteLLM_VerificationToken` will be set to `spend=0`
- LiteLLM will maintain all the logs in `LiteLLMSpendLogs` for Auditing Purposes
##### Request
Only the `LITELLM_MASTER_KEY` you set can access this route
```shell
curl -X POST \
'http://localhost:4000/global/spend/reset' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json'
```
##### Expected Responses
```shell
{"message":"Spend for all API Keys and Teams reset successfully","status":"success"}
```
## Set 'base_model' for Cost Tracking (e.g. Azure deployments)
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
**Solution** ✅ : Set `base_model` on your config so litellm uses the correct model for calculating azure cost
Get the base model name from [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
Example config with `base_model`
```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
model_info:
base_model: azure/gpt-4-1106-preview
```
## Daily Spend Breakdown API
Retrieve granular daily usage data for a user (by model, provider, and API key) with a single endpoint.
Example Request:
```shell title="Daily Spend Breakdown API" showLineNumbers
curl -L -X GET 'http://localhost:4000/user/daily/activity?start_date=2025-03-20&end_date=2025-03-27' \
-H 'Authorization: Bearer sk-...'
```
```json title="Daily Spend Breakdown API Response" showLineNumbers
{
"results": [
{
"date": "2025-03-27",
"metrics": {
"spend": 0.0177072,
"prompt_tokens": 111,
"completion_tokens": 1711,
"total_tokens": 1822,
"api_requests": 11
},
"breakdown": {
"models": {
"gpt-4o-mini": {
"spend": 1.095e-05,
"prompt_tokens": 37,
"completion_tokens": 9,
"total_tokens": 46,
"api_requests": 1
},
"providers": { "openai": { ... }, "azure_ai": { ... } },
"api_keys": { "3126b6eaf1...": { ... } }
}
}
],
"metadata": {
"total_spend": 0.7274667,
"total_prompt_tokens": 280990,
"total_completion_tokens": 376674,
"total_api_requests": 14
}
}
```
### API Reference
See our [Swagger API](https://litellm-api.up.railway.app/#/Budget%20%26%20Spend%20Tracking/get_user_daily_activity_user_daily_activity_get) for more details on the `/user/daily/activity` endpoint
## ✨ (Enterprise) Generate Spend Reports
Use this to charge other teams, customers, users
Use the `/global/spend/report` endpoint to get spend reports
@ -470,105 +598,6 @@ curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end
</Tabs>
### Allowing Non-Proxy Admins to access `/spend` endpoints
Use this when you want non-proxy admins to access `/spend` endpoints
:::info
Schedule a [meeting with us to get your Enterprise License](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
##### Create Key
Create Key with `permissions={"get_spend_routes": true}`
```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"permissions": {"get_spend_routes": true}
}'
```
##### Use generated key on `/spend` endpoints
Access spend routes with the newly generated keys
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \
-H 'Authorization: Bearer sk-H16BKvrSNConSsBYLGc_7A'
```
#### Reset Team, API Key Spend - MASTER KEY ONLY
Use `/global/spend/reset` if you want to:
- Reset the Spend for all API Keys, Teams. The `spend` for ALL Teams and Keys in `LiteLLM_TeamTable` and `LiteLLM_VerificationToken` will be set to `spend=0`
- LiteLLM will maintain all the logs in `LiteLLMSpendLogs` for Auditing Purposes
##### Request
Only the `LITELLM_MASTER_KEY` you set can access this route
```shell
curl -X POST \
'http://localhost:4000/global/spend/reset' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json'
```
##### Expected Responses
```shell
{"message":"Spend for all API Keys and Teams reset successfully","status":"success"}
```
## Spend Tracking for Azure OpenAI Models
Set base model for cost tracking azure image-gen call
#### Image Generation
```yaml
model_list:
- model_name: dall-e-3
litellm_params:
model: azure/dall-e-3-test
api_version: 2023-06-01-preview
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_key: os.environ/AZURE_API_KEY
base_model: dall-e-3 # 👈 set dall-e-3 as base model
model_info:
mode: image_generation
```
#### Chat Completions / Embeddings
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
**Solution** ✅ : Set `base_model` on your config so litellm uses the correct model for calculating azure cost
Get the base model name from [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
Example config with `base_model`
```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
model_info:
base_model: azure/gpt-4-1106-preview
```
## Custom Input/Output Pricing
👉 Head to [Custom Input/Output Pricing](https://docs.litellm.ai/docs/proxy/custom_pricing) to setup custom pricing or your models
## ✨ Custom Spend Log metadata
@ -587,4 +616,5 @@ Logging specific key,value pairs in spend logs metadata is an enterprise feature
Tracking spend with Custom tags is an enterprise feature. [See here](./enterprise.md#tracking-spend-for-custom-tags)
:::
:::
View file
@ -26,10 +26,12 @@ model_list:
- model_name: sagemaker-completion-model
litellm_params:
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
model_info:
input_cost_per_second: 0.000420
- model_name: sagemaker-embedding-model
litellm_params:
model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
model_info:
input_cost_per_second: 0.000420
```
@ -55,11 +57,33 @@ model_list:
api_key: os.environ/AZURE_API_KEY
api_base: os.environ/AZURE_API_BASE
      api_version: os.environ/AZURE_API_VERSION
model_info:
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
```
### Debugging
## Override Model Cost Map
You can override [our model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) with your own custom pricing for a mapped model.
Just add a `model_info` key to your model in the config, and override the desired keys.
Example: Override Anthropic's model cost map for the `prod/claude-3-5-sonnet-20241022` model.
```yaml
model_list:
- model_name: "prod/claude-3-5-sonnet-20241022"
litellm_params:
model: "anthropic/claude-3-5-sonnet-20241022"
api_key: os.environ/ANTHROPIC_PROD_API_KEY
model_info:
input_cost_per_token: 0.000006
output_cost_per_token: 0.00003
cache_creation_input_token_cost: 0.0000075
cache_read_input_token_cost: 0.0000006
```
## Debugging
If your custom pricing is not being used or you're seeing errors, please check the following:
View file
@ -0,0 +1,194 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Custom Prompt Management
Connect LiteLLM to your prompt management system with custom hooks.
## Overview
<Image
img={require('../../img/custom_prompt_management.png')}
style={{width: '100%', display: 'block', margin: '2rem auto'}}
/>
## How it works
## Quick Start
### 1. Create Your Custom Prompt Manager
Create a class that inherits from `CustomPromptManagement` to handle prompt retrieval and formatting:
**Example Implementation**
Create a new file called `custom_prompt.py` and add this code. The key method here is `get_chat_completion_prompt` you can implement custom logic to retrieve and format prompts based on the `prompt_id` and `prompt_variables`.
```python
from typing import List, Tuple, Optional
from litellm.integrations.custom_prompt_management import CustomPromptManagement
from litellm.types.llms.openai import AllMessageValues
from litellm.types.utils import StandardCallbackDynamicParams
class MyCustomPromptManagement(CustomPromptManagement):
def get_chat_completion_prompt(
self,
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict]:
"""
Retrieve and format prompts based on prompt_id.
Returns:
- model: The model to use
- messages: The formatted messages
- non_default_params: Optional parameters like temperature
"""
# Example matching the diagram: Add system message for prompt_id "1234"
if prompt_id == "1234":
# Prepend system message while preserving existing messages
new_messages = [
{"role": "system", "content": "Be a good Bot!"},
] + messages
return model, new_messages, non_default_params
# Default: Return original messages if no prompt_id match
return model, messages, non_default_params
prompt_management = MyCustomPromptManagement()
```
### 2. Configure Your Prompt Manager in LiteLLM `config.yaml`
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/gpt-4
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
callbacks: custom_prompt.prompt_management # sets litellm.callbacks = [prompt_management]
```
### 3. Start LiteLLM Gateway
<Tabs>
<TabItem value="docker" label="Docker Run">
Mount your `custom_prompt.py` on the LiteLLM Docker container.
```shell
docker run -d \
-p 4000:4000 \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
--name my-app \
-v $(pwd)/my_config.yaml:/app/config.yaml \
  -v $(pwd)/custom_prompt.py:/app/custom_prompt.py \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
  --detailed_debug
```
</TabItem>
<TabItem value="py" label="litellm pip">
```shell
litellm --config config.yaml --detailed_debug
```
</TabItem>
</Tabs>
### 4. Test Your Custom Prompt Manager
When you pass `prompt_id="1234"`, the custom prompt manager will add a system message "Be a good Bot!" to your conversation:
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
from openai import OpenAI
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
    model="gemini-1.5-pro",
    messages=[{"role": "user", "content": "hi"}],
    extra_body={"prompt_id": "1234"}  # extra_body forwards non-OpenAI params to the proxy
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
chat = ChatOpenAI(
model="gpt-4",
openai_api_key="sk-1234",
openai_api_base="http://0.0.0.0:4000",
extra_body={
"prompt_id": "1234"
}
)
messages = [HumanMessage(content="hi")]
response = chat(messages)
print(response.content)
```
</TabItem>
<TabItem value="curl" label="Curl">
```shell
curl -X POST http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gemini-1.5-pro",
"messages": [{"role": "user", "content": "hi"}],
"prompt_id": "1234"
}'
```
</TabItem>
</Tabs>
The request will be transformed from:
```json
{
"model": "gemini-1.5-pro",
"messages": [{"role": "user", "content": "hi"}],
"prompt_id": "1234"
}
```
To:
```json
{
"model": "gemini-1.5-pro",
"messages": [
{"role": "system", "content": "Be a good Bot!"},
{"role": "user", "content": "hi"}
]
}
```


@ -0,0 +1,86 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# High Availability Setup (Resolve DB Deadlocks)
Resolve any database deadlocks you see in high traffic by using this setup.
## What causes the problem?
LiteLLM writes `UPDATE` and `UPSERT` queries to the DB. When using 10+ instances of LiteLLM, these queries can cause deadlocks since each instance could simultaneously attempt to update the same `user_id`, `team_id`, `key` etc.
## How the high availability setup fixes the problem
- All instances will write to a Redis queue instead of the DB.
- A single instance will acquire a lock on the DB and flush the redis queue to the DB.
## How it works
### Stage 1. Each instance writes updates to redis
Each instance will accumulate the spend updates for a key, user, team, etc. and write the updates to a Redis queue.
<Image img={require('../../img/deadlock_fix_1.png')} style={{ width: '900px', height: 'auto' }} />
<p style={{textAlign: 'left', color: '#666'}}>
Each instance writes updates to redis
</p>
### Stage 2. A single instance flushes the redis queue to the DB
A single instance will acquire a lock on the DB and flush all elements in the redis queue to the DB.
- 1 instance will attempt to acquire the lock for the DB update job
- The status of the lock is stored in redis
- If the instance acquires the lock to write to DB
- It will read all updates from redis
- Aggregate all updates into 1 transaction
- Write updates to DB
- Release the lock
- Note: Only 1 instance can acquire the lock at a time; this limits the number of instances that can write to the DB at once (see the sketch after the diagram below)
<Image img={require('../../img/deadlock_fix_2.png')} style={{ width: '900px', height: 'auto' }} />
<p style={{textAlign: 'left', color: '#666'}}>
A single instance flushes the redis queue to the DB
</p>
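Below is a minimal sketch of this accumulate-then-flush pattern, assuming the `redis` Python package; the queue/lock key names and helper functions are illustrative, not LiteLLM internals.
```python
import json
import redis  # assumes the `redis` pip package is installed

r = redis.Redis(host="localhost", port=6379, decode_responses=True)

QUEUE_KEY = "spend_update_queue"  # illustrative key names, not LiteLLM internals
LOCK_KEY = "db_update_lock"

def enqueue_spend_update(update: dict) -> None:
    """Stage 1: each instance pushes its spend update onto the Redis queue."""
    r.rpush(QUEUE_KEY, json.dumps(update))

def flush_queue_to_db(write_to_db) -> None:
    """Stage 2: one instance acquires the lock, drains the queue,
    writes everything in one transaction, then releases the lock."""
    # SET NX EX acts as a simple distributed lock with an expiry
    if not r.set(LOCK_KEY, "this-pod-id", nx=True, ex=60):
        return  # another instance holds the lock
    try:
        updates = []
        while (item := r.lpop(QUEUE_KEY)) is not None:
            updates.append(json.loads(item))
        if updates:
            write_to_db(updates)  # caller aggregates these into a single DB transaction
    finally:
        r.delete(LOCK_KEY)
```
Because only the lock holder writes to the DB, concurrent `UPDATE`/`UPSERT` statements on the same rows are avoided.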
## Usage
### Required components
- Redis
- Postgres
### Setup on LiteLLM config
You can enable using the redis buffer by setting `use_redis_transaction_buffer: true` in the `general_settings` section of your `proxy_config.yaml` file.
Note: This setup requires litellm to be connected to a redis instance.
```yaml showLineNumbers title="litellm proxy_config.yaml"
general_settings:
use_redis_transaction_buffer: true
litellm_settings:
cache: True
cache_params:
type: redis
supported_call_types: [] # Optional: Set cache for proxy, but not on the actual llm api call
```
## Monitoring
LiteLLM emits the following prometheus metrics to monitor the health/status of the in memory buffer and redis buffer.
| Metric Name | Description | Storage Type |
|-----------------------------------------------------|-----------------------------------------------------------------------------|--------------|
| `litellm_pod_lock_manager_size` | Indicates which pod has the lock to write updates to the database. | Redis |
| `litellm_in_memory_daily_spend_update_queue_size` | Number of items in the in-memory daily spend update queue. These are the aggregate spend logs for each user. | In-Memory |
| `litellm_redis_daily_spend_update_queue_size` | Number of items in the Redis daily spend update queue. These are the aggregate spend logs for each user. | Redis |
| `litellm_in_memory_spend_update_queue_size` | In-memory aggregate spend values for keys, users, teams, team members, etc.| In-Memory |
| `litellm_redis_spend_update_queue_size` | Redis aggregate spend values for keys, users, teams, etc. | Redis |


@ -46,18 +46,17 @@ You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/ma
| Table Name | Description | Row Insert Frequency |
|------------|-------------|---------------------|
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request** |
| LiteLLM_ErrorLogs | Captures failed requests and errors. Stores exception details and request information. Helps with debugging and monitoring. | **Medium - on errors only** |
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request - Success or Failure** |
| LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** |
## Disable `LiteLLM_SpendLogs` & `LiteLLM_ErrorLogs`
## Disable `LiteLLM_SpendLogs`
You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` on the `general_settings` section of your proxy_config.yaml file.
```yaml
general_settings:
disable_spend_logs: True # Disable writing spend logs to DB
disable_error_logs: True # Disable writing error logs to DB
disable_error_logs: True # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
```
### What is the impact of disabling these logs?


@ -23,6 +23,12 @@ In the newly created guard's page, you can find a reference to the prompt policy
You can decide which detections will be enabled, and set the threshold for each detection.
:::info
When using LiteLLM with virtual keys, key-specific policies can be set directly in Aim's guards page by specifying the virtual key alias when creating the guard.
Only the aliases of your virtual keys (and not the actual key secrets) will be sent to Aim.
:::
### 3. Add Aim Guardrail on your LiteLLM config.yaml
Define your guardrails under the `guardrails` section
@ -37,7 +43,7 @@ guardrails:
- guardrail_name: aim-protected-app
litellm_params:
guardrail: aim
mode: pre_call # 'during_call' is also available
    mode: [pre_call, post_call] # 'during_call' is also available
api_key: os.environ/AIM_API_KEY
api_base: os.environ/AIM_API_BASE # Optional, use only when using a self-hosted Aim Outpost
```
@ -134,7 +140,7 @@ The above request should not be blocked, and you should receive a regular LLM re
</Tabs>
# Advanced
## Advanced
Aim Guard provides user-specific Guardrail policies, enabling you to apply tailored policies to individual users.
To utilize this feature, include the end-user's email in the request payload by setting the `x-aim-user-email` header of your request.
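For example, with the OpenAI Python client pointed at the proxy, the header can be passed per-request via `extra_headers` (the key, model name, and email below are placeholders):
```python
from openai import OpenAI

client = OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "hi"}],
    # end-user email, used by Aim to apply that user's guardrail policy
    extra_headers={"x-aim-user-email": "user@example.com"},
)
print(response.choices[0].message.content)
```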


@ -10,10 +10,12 @@ Use this is you want to write code to run a custom guardrail
### 1. Write a `CustomGuardrail` Class
A CustomGuardrail has 3 methods to enforce guardrails
A CustomGuardrail has 4 methods to enforce guardrails
- `async_pre_call_hook` - (Optional) modify input or reject request before making LLM API call
- `async_moderation_hook` - (Optional) reject request, runs while making LLM API call (help to lower latency)
- `async_post_call_success_hook`- (Optional) apply guardrail on input/output, runs after making LLM API call
- `async_post_call_streaming_iterator_hook` - (Optional) pass the entire stream to the guardrail
**[See detailed spec of methods here](#customguardrail-methods)**
@ -128,6 +130,23 @@ class myCustomGuardrail(CustomGuardrail):
):
raise ValueError("Guardrail failed Coffee Detected")
async def async_post_call_streaming_iterator_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_data: dict,
) -> AsyncGenerator[ModelResponseStream, None]:
"""
Passes the entire stream to the guardrail
This is useful for guardrails that need to see the entire response, such as PII masking.
See Aim guardrail implementation for an example - https://github.com/BerriAI/litellm/blob/d0e022cfacb8e9ebc5409bb652059b6fd97b45c0/litellm/proxy/guardrails/guardrail_hooks/aim.py#L168
Triggered by mode: 'post_call'
"""
async for item in response:
yield item
```


@ -17,6 +17,14 @@ model_list:
api_key: os.environ/OPENAI_API_KEY
guardrails:
- guardrail_name: general-guard
litellm_params:
guardrail: aim
mode: [pre_call, post_call]
api_key: os.environ/AIM_API_KEY
api_base: os.environ/AIM_API_BASE
default_on: true # Optional
- guardrail_name: "aporia-pre-guard"
litellm_params:
guardrail: aporia # supported values: "aporia", "lakera"
@ -45,6 +53,7 @@ guardrails:
- `pre_call` Run **before** LLM call, on **input**
- `post_call` Run **after** LLM call, on **input & output**
- `during_call` Run **during** LLM call, on **input**. Same as `pre_call` but runs in parallel with the LLM call. The response is not returned until the guardrail check completes
- A list of the above values to run multiple modes, e.g. `mode: [pre_call, post_call]`
## 2. Start LiteLLM Gateway
@ -569,4 +578,4 @@ guardrails: Union[
class DynamicGuardrailParams:
extra_body: Dict[str, Any] # Additional parameters for the guardrail
```
```


@ -0,0 +1,21 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Image URL Handling
<Image img={require('../../img/image_handling.png')} style={{ width: '900px', height: 'auto' }} />
Some LLM APIs don't support URLs for images, but do support base64 strings.
For those, LiteLLM will:
1. Detect a URL being passed
2. Check if the LLM API supports a URL
3. If not, download the image and convert it to a base64 string
4. Send the provider the base64 string
LiteLLM also caches this result in-memory to reduce latency for subsequent calls.
The in-memory cache is limited to 1MB.
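As a rough sketch of what this URL-to-base64 conversion looks like (using `requests`; illustrative only, not LiteLLM's internal code):
```python
import base64
import requests

def image_url_to_base64(url: str) -> str:
    """Download an image and return it as a base64 data URL string."""
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    content_type = resp.headers.get("content-type", "image/png")
    encoded = base64.b64encode(resp.content).decode("utf-8")
    return f"data:{content_type};base64,{encoded}"
```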


@ -78,6 +78,8 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
| `api_base` | `Optional[str]` | Optional API base URL |
| `response_cost` | `Optional[str]` | Optional response cost |
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
| `litellm_model_name` | `Optional[str]` | Model name sent in request |
## StandardLoggingModelInformation


@ -0,0 +1,53 @@
# Rotating Master Key
Here are our recommended steps for rotating your master key.
**1. Backup your DB**
In case of any errors during the encryption/decryption process, this will allow you to revert back to the current state without issues.
**2. Call `/key/regenerate` with the new master key**
```bash
curl -L -X POST 'http://localhost:4000/key/regenerate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"key": "sk-1234",
"new_master_key": "sk-PIp1h0RekR"
}'
```
This will re-encrypt any models in your Proxy_ModelTable with the new master key.
Expect to start seeing decryption errors in logs, as your old master key is no longer able to decrypt the new values.
```bash
raise Exception("Unable to decrypt value={}".format(v))
Exception: Unable to decrypt value=<new-encrypted-value>
```
**3. Update LITELLM_MASTER_KEY**
In your environment variables update the value of LITELLM_MASTER_KEY to the new_master_key from Step 2.
This ensures the key used for decryption from db is the new key.
**4. Test it**
Make a test request to a model stored on the proxy with a LiteLLM key (new master key or virtual key) and see if it works.
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4o-mini", # 👈 REPLACE with 'public model name' for any db-model
"messages": [
{
"content": "Hey, how's it going",
"role": "user"
}
],
}'
```


@ -94,22 +94,38 @@ This disables the load_dotenv() functionality, which will automatically load you
## 5. If running LiteLLM on VPC, gracefully handle DB unavailability
This will allow LiteLLM to continue to process requests even if the DB is unavailable. This is better handling for DB unavailability.
When running LiteLLM on a VPC (and inaccessible from the public internet), you can enable graceful degradation so that request processing continues even if the database is temporarily unavailable.
**WARNING: Only do this if you're running LiteLLM on a VPC that cannot be accessed from the public internet.**
```yaml
#### Configuration
```yaml showLineNumbers title="litellm config.yaml"
general_settings:
allow_requests_on_db_unavailable: True
```
#### Expected Behavior
When `allow_requests_on_db_unavailable` is set to `true`, LiteLLM will handle errors as follows:
| Type of Error | Expected Behavior | Details |
|---------------|-------------------|----------------|
| Prisma Errors | ✅ Request will be allowed | Covers issues like DB connection resets or rejections from the DB via Prisma, the ORM used by LiteLLM. |
| Httpx Errors | ✅ Request will be allowed | Occurs when the database is unreachable, allowing the request to proceed despite the DB outage. |
| Pod Startup Behavior | ✅ Pods start regardless | LiteLLM Pods will start even if the database is down or unreachable, ensuring higher uptime guarantees for deployments. |
| Health/Readiness Check | ✅ Always returns 200 OK | The /health/readiness endpoint returns a 200 OK status to ensure that pods remain operational even when the database is unavailable. |
| LiteLLM Budget Errors or Model Errors | ❌ Request will be blocked | Triggered when the DB is reachable but the authentication token is invalid, lacks access, or exceeds budget limits. |
## 6. Disable spend_logs & error_logs if not using the LiteLLM UI
By default, LiteLLM writes several types of logs to the database:
- Every LLM API request to the `LiteLLM_SpendLogs` table
- LLM Exceptions to the `LiteLLM_LogsErrors` table
- LLM Exceptions to the `LiteLLM_SpendLogs` table
If you're not viewing these logs on the LiteLLM UI (most users use Prometheus for monitoring), you can disable them by setting the following flags to `True`:
If you're not viewing these logs on the LiteLLM UI, you can disable them by setting the following flags to `True`:
```yaml
general_settings:
@ -161,6 +177,50 @@ export LITELLM_SALT_KEY="sk-1234"
[**See Code**](https://github.com/BerriAI/litellm/blob/036a6821d588bd36d170713dcf5a72791a694178/litellm/proxy/common_utils/encrypt_decrypt_utils.py#L15)
## 9. Use `prisma migrate deploy`
Use this to handle db migrations across LiteLLM versions in production
<Tabs>
<TabItem value="env" label="ENV">
```bash
USE_PRISMA_MIGRATE="True"
```
</TabItem>
<TabItem value="cli" label="CLI">
```bash
litellm --use_prisma_migrate
```
</TabItem>
</Tabs>
Benefits:
The migrate deploy command:
- **Does not** issue a warning if an already applied migration is missing from migration history
- **Does not** detect drift (production database schema differs from migration history end state - for example, due to a hotfix)
- **Does not** reset the database or generate artifacts (such as Prisma Client)
- **Does not** rely on a shadow database
### How does LiteLLM handle DB migrations in production?
1. A new migration file is written to our `litellm-proxy-extras` package. [See all](https://github.com/BerriAI/litellm/tree/main/litellm-proxy-extras/litellm_proxy_extras/migrations)
2. The core litellm pip package is bumped to point to the new `litellm-proxy-extras` package. This ensures older versions of LiteLLM will continue to use the old migrations. [See code](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/pyproject.toml#L58)
3. When you upgrade to a new version of LiteLLM, the migration file is applied to the database. [See code](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/litellm-proxy-extras/litellm_proxy_extras/utils.py#L42)
## Extras
### Expected Performance in Production
@ -182,94 +242,4 @@ You should only see the following level of details in logs on the proxy server
# INFO: 192.168.2.205:11774 - "POST /chat/completions HTTP/1.1" 200 OK
# INFO: 192.168.2.205:34717 - "POST /chat/completions HTTP/1.1" 200 OK
# INFO: 192.168.2.205:29734 - "POST /chat/completions HTTP/1.1" 200 OK
```
### Machine Specifications to Deploy LiteLLM
| Service | Spec | CPUs | Memory | Architecture | Version|
| --- | --- | --- | --- | --- | --- |
| Server | `t2.small` | `1vCPUs` | `8GB` | `x86` | - |
| Redis Cache | - | - | - | - | 7.0+ Redis Engine|
### Reference Kubernetes Deployment YAML
Reference Kubernetes `deployment.yaml` that was load tested by us
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: litellm-deployment
spec:
replicas: 3
selector:
matchLabels:
app: litellm
template:
metadata:
labels:
app: litellm
spec:
containers:
- name: litellm-container
image: ghcr.io/berriai/litellm:main-latest
imagePullPolicy: Always
env:
- name: AZURE_API_KEY
value: "d6******"
- name: AZURE_API_BASE
value: "https://ope******"
- name: LITELLM_MASTER_KEY
value: "sk-1234"
- name: DATABASE_URL
value: "po**********"
args:
- "--config"
- "/app/proxy_config.yaml" # Update the path to mount the config file
volumeMounts: # Define volume mount for proxy_config.yaml
- name: config-volume
mountPath: /app
readOnly: true
livenessProbe:
httpGet:
path: /health/liveliness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
readinessProbe:
httpGet:
path: /health/readiness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
volumes: # Define volume to mount proxy_config.yaml
- name: config-volume
configMap:
name: litellm-config
```
Reference Kubernetes `service.yaml` that was load tested by us
```yaml
apiVersion: v1
kind: Service
metadata:
name: litellm-service
spec:
selector:
app: litellm
ports:
- protocol: TCP
port: 4000
targetPort: 4000
type: LoadBalancer
```
```


@ -242,6 +242,19 @@ litellm_settings:
| `litellm_redis_fails` | Number of failed redis calls |
| `litellm_self_latency` | Histogram latency for successful litellm api call |
#### DB Transaction Queue Health Metrics
Use these metrics to monitor the health of the DB Transaction Queue, e.g. monitoring the size of the in-memory and Redis buffers.
| Metric Name | Description | Storage Type |
|-----------------------------------------------------|-----------------------------------------------------------------------------|--------------|
| `litellm_pod_lock_manager_size` | Indicates which pod has the lock to write updates to the database. | Redis |
| `litellm_in_memory_daily_spend_update_queue_size` | Number of items in the in-memory daily spend update queue. These are the aggregate spend logs for each user. | In-Memory |
| `litellm_redis_daily_spend_update_queue_size` | Number of items in the Redis daily spend update queue. These are the aggregate spend logs for each user. | Redis |
| `litellm_in_memory_spend_update_queue_size` | In-memory aggregate spend values for keys, users, teams, team members, etc.| In-Memory |
| `litellm_redis_spend_update_queue_size` | Redis aggregate spend values for keys, users, teams, etc. | Redis |
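To spot-check these gauges, you can scrape the proxy's `/metrics` endpoint directly. A minimal sketch, assuming the proxy runs at `http://0.0.0.0:4000` (add an auth header if you enable `require_auth_for_metrics_endpoint` below):
```python
import requests

# Scrape the Prometheus endpoint and print only the transaction-queue gauges
metrics = requests.get("http://0.0.0.0:4000/metrics", timeout=10).text

for line in metrics.splitlines():
    if "spend_update_queue_size" in line or "litellm_pod_lock_manager_size" in line:
        print(line)
```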
## 🔥 LiteLLM Maintained Grafana Dashboards
@ -268,6 +281,17 @@ Here is a screenshot of the metrics you can monitor with the LiteLLM Grafana Das
## Add authentication on /metrics endpoint
**By default, the `/metrics` endpoint is unauthenticated.**
You can opt into running LiteLLM authentication on the `/metrics` endpoint by setting the following on the config:
```yaml
litellm_settings:
require_auth_for_metrics_endpoint: true
```
## FAQ
### What are `_created` vs. `_total` metrics?


@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [BETA] Prompt Management
# Prompt Management
:::info
@ -12,9 +12,10 @@ This feature is currently in beta, and might change unexpectedly. We expect this
Run experiments or change the specific model (e.g. from gpt-4o to a gpt-4o-mini finetune) from your prompt management tool (e.g. Langfuse) instead of making changes in the application.
Supported Integrations:
- [Langfuse](https://langfuse.com/docs/prompts/get-started)
- [Humanloop](../observability/humanloop)
| Supported Integrations | Link |
|------------------------|------|
| Langfuse | [Get Started](https://langfuse.com/docs/prompts/get-started) |
| Humanloop | [Get Started](../observability/humanloop) |
## Quick Start


@ -4,9 +4,17 @@ Litellm Proxy has the following release cycle:
- `v1.x.x-nightly`: These are releases which pass ci/cd.
- `v1.x.x.rc`: These are releases which pass ci/cd + [manual review](https://github.com/BerriAI/litellm/discussions/8495#discussioncomment-12180711).
- `v1.x.x`: These are releases which pass ci/cd + manual review + 3 days of production testing.
- `v1.x.x:main-stable`: These are releases which pass ci/cd + manual review + 3 days of production testing.
In production, we recommend using the latest `v1.x.x` release.
In production, we recommend using the latest `v1.x.x:main-stable` release.
Follow our release notes [here](https://github.com/BerriAI/litellm/releases).
## FAQ
### Is there a release schedule for LiteLLM stable release?
Stable releases come out every week (typically Sunday)


@ -43,19 +43,19 @@ These headers are useful for clients to understand the current rate limit status
| `x-litellm-max-fallbacks` | int | Maximum number of fallback attempts allowed |
## Cost Tracking Headers
| Header | Type | Description |
|--------|------|-------------|
| `x-litellm-response-cost` | float | Cost of the API call |
| `x-litellm-key-spend` | float | Total spend for the API key |
| Header | Type | Description | Available on Pass-Through Endpoints |
|--------|------|-------------|-------------|
| `x-litellm-response-cost` | float | Cost of the API call | |
| `x-litellm-key-spend` | float | Total spend for the API key | ✅ |
## LiteLLM Specific Headers
| Header | Type | Description |
|--------|------|-------------|
| `x-litellm-call-id` | string | Unique identifier for the API call |
| `x-litellm-model-id` | string | Unique identifier for the model used |
| `x-litellm-model-api-base` | string | Base URL of the API endpoint |
| `x-litellm-version` | string | Version of LiteLLM being used |
| `x-litellm-model-group` | string | Model group identifier |
| Header | Type | Description | Available on Pass-Through Endpoints |
|--------|------|-------------|-------------|
| `x-litellm-call-id` | string | Unique identifier for the API call | ✅ |
| `x-litellm-model-id` | string | Unique identifier for the model used | |
| `x-litellm-model-api-base` | string | Base URL of the API endpoint | ✅ |
| `x-litellm-version` | string | Version of LiteLLM being used | |
| `x-litellm-model-group` | string | Model group identifier | |
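If you want to read these headers from client code, one option is the OpenAI Python SDK's `with_raw_response` helper (the key, base URL, and model name below are placeholders for your proxy setup):
```python
from openai import OpenAI

client = OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# .with_raw_response exposes the raw HTTP response so headers can be inspected
raw = client.chat.completions.with_raw_response.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "hi"}],
)

print(raw.headers.get("x-litellm-call-id"))
print(raw.headers.get("x-litellm-response-cost"))

completion = raw.parse()  # the usual ChatCompletion object
print(completion.choices[0].message.content)
```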
## Response headers from LLM providers


@ -161,6 +161,89 @@ Here's the available UI roles for a LiteLLM Internal User:
- `internal_user`: can login, view/create/delete their own keys, view their spend. **Cannot** add new users.
- `internal_user_viewer`: can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users.
## Auto-add SSO users to teams
This walks through setting up sso auto-add for **Okta, Google SSO**
### Okta, Google SSO
1. Specify the JWT field that contains the team IDs that the user belongs to.
```yaml
general_settings:
master_key: sk-1234
litellm_jwtauth:
team_ids_jwt_field: "groups" # 👈 CAN BE ANY FIELD
```
This is assuming your SSO token looks like this. **If you need to inspect the JWT fields received from your SSO provider by LiteLLM, follow these instructions [here](#debugging-sso-jwt-fields)**
```
{
...,
"groups": ["team_id_1", "team_id_2"]
}
```
2. Create the teams on LiteLLM
```bash
curl -X POST '<PROXY_BASE_URL>/team/new' \
-H 'Authorization: Bearer <PROXY_MASTER_KEY>' \
-H 'Content-Type: application/json' \
-d '{
"team_alias": "team_1",
"team_id": "team_id_1" # 👈 MUST BE THE SAME AS THE SSO GROUP ID
}'
```
3. Test the SSO flow
Here's a walkthrough of [how it works](https://www.loom.com/share/8959be458edf41fd85937452c29a33f3?sid=7ebd6d37-569a-4023-866e-e0cde67cb23e)
### Microsoft Entra ID SSO group assignment
This walks through setting up sso auto-add for **Microsoft Entra ID**
Follow along this video for a walkthrough of how to set this up with Microsoft Entra ID
<iframe width="840" height="500" src="https://www.loom.com/embed/ea711323aa9a496d84a01fd7b2a12f54?sid=c53e238c-5bfd-4135-b8fb-b5b1a08632cf" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
### Debugging SSO JWT fields
If you need to inspect the JWT fields received from your SSO provider by LiteLLM, follow these instructions. This guide walks you through setting up a debug callback to view the JWT data during the SSO process.
<Image img={require('../../img/debug_sso.png')} style={{ width: '500px', height: 'auto' }} />
<br />
1. Add `/sso/debug/callback` as a redirect URL in your SSO provider
In your SSO provider's settings, add the following URL as a new redirect (callback) URL:
```bash showLineNumbers title="Redirect URL"
http://<proxy_base_url>/sso/debug/callback
```
2. Navigate to the debug login page on your browser
Navigate to the following URL on your browser:
```bash showLineNumbers title="URL to navigate to"
https://<proxy_base_url>/sso/debug/login
```
This will initiate the standard SSO flow. You will be redirected to your SSO provider's login screen, and after successful authentication, you will be redirected back to LiteLLM's debug callback route.
3. View the JWT fields
Once redirected, you should see a page called "SSO Debug Information". This page displays the JWT fields received from your SSO provider (as shown in the image above)
## Advanced
### Setting custom logout URLs
@ -196,40 +279,6 @@ This budget does not apply to keys created under non-default teams.
[**Go Here**](./team_budgets.md)
### Auto-add SSO users to teams
1. Specify the JWT field that contains the team ids, that the user belongs to.
```yaml
general_settings:
master_key: sk-1234
litellm_jwtauth:
team_ids_jwt_field: "groups" # 👈 CAN BE ANY FIELD
```
This is assuming your SSO token looks like this:
```
{
...,
"groups": ["team_id_1", "team_id_2"]
}
```
2. Create the teams on LiteLLM
```bash
curl -X POST '<PROXY_BASE_URL>/team/new' \
-H 'Authorization: Bearer <PROXY_MASTER_KEY>' \
-H 'Content-Type: application/json' \
-D '{
"team_alias": "team_1",
"team_id": "team_id_1" # 👈 MUST BE THE SAME AS THE SSO GROUP ID
}'
```
3. Test the SSO flow
Here's a walkthrough of [how it works](https://www.loom.com/share/8959be458edf41fd85937452c29a33f3?sid=7ebd6d37-569a-4023-866e-e0cde67cb23e)
### Restrict Users from creating personal keys


@ -102,7 +102,19 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
</TabItem>
</Tabs>
## Advanced - Set Accepted JWT Scope Names
## Advanced
### Multiple OIDC providers
Use this if you want LiteLLM to validate your JWT against multiple OIDC providers (e.g. Google Cloud, GitHub Auth)
Set `JWT_PUBLIC_KEY_URL` in your environment to a comma-separated list of URLs for your OIDC providers.
```bash
export JWT_PUBLIC_KEY_URL="https://demo.duendesoftware.com/.well-known/openid-configuration/jwks,https://accounts.google.com/.well-known/openid-configuration/jwks"
```
### Set Accepted JWT Scope Names
Change the string in the JWT 'scopes' field that LiteLLM evaluates to see if a user has admin access.
@ -114,7 +126,7 @@ general_settings:
admin_jwt_scope: "litellm-proxy-admin"
```
## Tracking End-Users / Internal Users / Team / Org
### Tracking End-Users / Internal Users / Team / Org
Set the field in the jwt token, which corresponds to a litellm user / team / org.
@ -156,7 +168,7 @@ scope: ["litellm-proxy-admin",...]
scope: "litellm-proxy-admin ..."
```
## Control model access with Teams
### Control model access with Teams
1. Specify the JWT field that contains the team ids, that the user belongs to.
@ -207,11 +219,11 @@ OIDC Auth for API: [**See Walkthrough**](https://www.loom.com/share/00fe2deab59a
- If all checks pass, allow the request
## Advanced - Custom Validate
### Custom JWT Validate
Validate a JWT token using custom logic if you need an extra way to verify that tokens are valid for LiteLLM Proxy.
### 1. Setup custom validate function
#### 1. Setup custom validate function
```python
from typing import Literal
@ -230,7 +242,7 @@ def my_custom_validate(token: str) -> Literal[True]:
return True
```
### 2. Setup config.yaml
#### 2. Setup config.yaml
```yaml
general_settings:
@ -243,7 +255,7 @@ general_settings:
custom_validate: custom_validate.my_custom_validate # 👈 custom validate function
```
### 3. Test the flow
#### 3. Test the flow
**Expected JWT**
@ -265,7 +277,7 @@ general_settings:
## Advanced - Allowed Routes
### Allowed Routes
Configure which routes a JWT can access via the config.
@ -297,7 +309,7 @@ general_settings:
team_allowed_routes: ["/v1/chat/completions"] # 👈 Set accepted routes
```
## Advanced - Caching Public Keys
### Caching Public Keys
Control how long public keys are cached for (in seconds).
@ -311,7 +323,7 @@ general_settings:
public_key_ttl: 600 # 👈 KEY CHANGE
```
## Advanced - Custom JWT Field
### Custom JWT Field
Set a custom field in which the team_id exists. By default, the 'client_id' field is checked.
@ -323,14 +335,7 @@ general_settings:
team_id_jwt_field: "client_id" # 👈 KEY CHANGE
```
## All Params
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)
## Advanced - Block Teams
### Block Teams
To block all requests for a certain team id, use `/team/block`
@ -357,7 +362,7 @@ curl --location 'http://0.0.0.0:4000/team/unblock' \
```
## Advanced - Upsert Users + Allowed Email Domains
### Upsert Users + Allowed Email Domains
Allow users who belong to a specific email domain automatic access to the proxy.
@ -494,4 +499,10 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
}
]
}'
```
```
## All JWT Params
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)


@ -0,0 +1,55 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Adding LLM Credentials
You can add LLM provider credentials on the UI. Once you add credentials, you can re-use them when adding new models.
## Add a credential + model
### 1. Navigate to LLM Credentials page
Go to Models -> LLM Credentials -> Add Credential
<Image img={require('../../img/ui_cred_add.png')} />
### 2. Add credentials
Select your LLM provider, enter your API Key and click "Add Credential"
**Note: Credentials are provider-specific. If you select Vertex AI, you will see `Vertex Project`, `Vertex Location` and `Vertex Credentials` fields**
<Image img={require('../../img/ui_add_cred_2.png')} />
### 3. Use credentials when adding a model
Go to Add Model -> Existing Credentials -> Select your credential in the dropdown
<Image img={require('../../img/ui_cred_3.png')} />
## Create a Credential from an existing model
Use this if you have already created a model and want to store the model credentials for future use
### 1. Select model to create a credential from
Go to Models -> Select your model -> Credential -> Create Credential
<Image img={require('../../img/ui_cred_4.png')} />
### 2. Use new credential when adding a model
Go to Add Model -> Existing Credentials -> Select your credential in the dropdown
<Image img={require('../../img/use_model_cred.png')} />
## Frequently Asked Questions
**How are credentials stored?**
Credentials in the DB are encrypted/decrypted using `LITELLM_SALT_KEY`, if set. If not, then they are encrypted using `LITELLM_MASTER_KEY`. These keys should be kept secret and not shared with others.


@ -0,0 +1,55 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# UI Logs Page
View Spend, Token Usage, Key, Team Name for Each Request to LiteLLM
<Image img={require('../../img/ui_request_logs.png')}/>
## Overview
| Log Type | Tracked by Default |
|----------|-------------------|
| Success Logs | ✅ Yes |
| Error Logs | ✅ Yes |
| Request/Response Content Stored | ❌ No by Default, **opt in with `store_prompts_in_spend_logs`** |
**By default LiteLLM does not track the request and response content.**
## Tracking - Request / Response Content in Logs Page
If you want to view request and response content on LiteLLM Logs, you need to opt in with this setting
```yaml
general_settings:
store_prompts_in_spend_logs: true
```
<Image img={require('../../img/ui_request_logs_content.png')}/>
## Stop storing Error Logs in DB
If you do not want to store error logs in DB, you can opt out with this setting
```yaml
general_settings:
disable_error_logs: True # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
```
## Stop storing Spend Logs in DB
If you do not want to store spend logs in DB, you can opt out with this setting
```yaml
general_settings:
disable_spend_logs: True # Disable writing spend logs to DB
```


@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Realtime Endpoints
# /realtime
Use this to load balance across Azure + OpenAI.

View file

@ -3,11 +3,20 @@ import TabItem from '@theme/TabItem';
# 'Thinking' / 'Reasoning Content'
:::info
Requires LiteLLM v1.63.0+
:::
Supported Providers:
- Deepseek (`deepseek/`)
- Anthropic API (`anthropic/`)
- Bedrock (Anthropic + Deepseek) (`bedrock/`)
- Vertex AI (Anthropic) (`vertexai/`)
- OpenRouter (`openrouter/`)
LiteLLM will standardize the `reasoning_content` in the response and `thinking_blocks` in the assistant message.
```python
"message": {
@ -17,7 +26,7 @@ Supported Providers:
{
"type": "thinking",
"thinking": "The capital of France is Paris.",
"signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..."
"signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..."
}
]
}
@ -39,7 +48,7 @@ response = completion(
messages=[
{"role": "user", "content": "What is the capital of France?"},
],
thinking={"type": "enabled", "budget_tokens": 1024} # 👈 REQUIRED FOR ANTHROPIC models (on `anthropic/`, `bedrock/`, `vertexai/`)
reasoning_effort="low",
)
print(response.choices[0].message.content)
```
@ -59,7 +68,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
"content": "What is the capital of France?"
}
],
"thinking": {"type": "enabled", "budget_tokens": 1024}
"reasoning_effort": "low"
}'
```
</TabItem>
@ -141,7 +150,7 @@ response = litellm.completion(
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
thinking={"type": "enabled", "budget_tokens": 1024},
reasoning_effort="low",
)
print("Response\n", response)
response_message = response.choices[0].message
@ -189,9 +198,9 @@ if tool_calls:
model=model,
messages=messages,
seed=22,
reasoning_effort="low",
# tools=tools,
drop_params=True,
thinking={"type": "enabled", "budget_tokens": 1024},
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
```
@ -292,7 +301,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
{
"type": "thinking",
"thinking": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
"signature_delta": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c="
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c="
}
],
"provider_specific_fields": {
@ -331,7 +340,7 @@ litellm.drop_params = True # 👈 EITHER GLOBALLY or per request
response = litellm.completion(
model="anthropic/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
reasoning_effort="low",
drop_params=True,
)
@ -339,7 +348,7 @@ response = litellm.completion(
response = litellm.completion(
model="deepseek/deepseek-chat",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
reasoning_effort="low",
drop_params=True,
)
```
@ -353,5 +362,38 @@ These fields can be accessed via `response.choices[0].message.reasoning_content`
- `thinking_blocks` - Optional[List[Dict[str, str]]]: A list of thinking blocks from the model. Only returned for Anthropic models.
- `type` - str: The type of thinking block.
- `thinking` - str: The thinking from the model.
- `signature_delta` - str: The signature delta from the model.
- `signature` - str: The signature from the model.
## Pass `thinking` to Anthropic models
You can also pass the `thinking` parameter to Anthropic models.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = litellm.completion(
model="anthropic/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "anthropic/claude-3-7-sonnet-20250219",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>


@ -1,4 +1,4 @@
# Rerank
# /rerank
:::tip

Some files were not shown because too many files have changed in this diff.