Merge branch 'main' into sync-logging

commit a3fc795927 by Yuki Watanabe, 2025-03-20 01:48:22 +09:00, committed by GitHub
No known key found for this signature in database. GPG key ID: B5690EEEBB952194
484 changed files with 27,932 additions and 7,615 deletions

View file

@@ -49,7 +49,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -71,7 +71,7 @@ jobs:
 pip install "Pillow==10.3.0"
 pip install "jsonschema==4.22.0"
 pip install "pytest-xdist==3.6.1"
-pip install "websockets==10.4"
+pip install "websockets==13.1.0"
 pip uninstall posthog -y
 - save_cache:
     paths:
@@ -168,7 +168,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -189,6 +189,7 @@ jobs:
 pip install "diskcache==5.6.1"
 pip install "Pillow==10.3.0"
 pip install "jsonschema==4.22.0"
+pip install "websockets==13.1.0"
 - save_cache:
     paths:
       - ./venv
@@ -267,7 +268,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -288,6 +289,7 @@ jobs:
 pip install "diskcache==5.6.1"
 pip install "Pillow==10.3.0"
 pip install "jsonschema==4.22.0"
+pip install "websockets==13.1.0"
 - save_cache:
     paths:
       - ./venv
@@ -511,7 +513,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -678,6 +680,48 @@ jobs:
     paths:
       - llm_translation_coverage.xml
       - llm_translation_coverage
+  llm_responses_api_testing:
+    docker:
+      - image: cimg/python:3.11
+        auth:
+          username: ${DOCKERHUB_USERNAME}
+          password: ${DOCKERHUB_PASSWORD}
+    working_directory: ~/project
+    steps:
+      - checkout
+      - run:
+          name: Install Dependencies
+          command: |
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+            pip install "pytest==7.3.1"
+            pip install "pytest-retry==1.6.3"
+            pip install "pytest-cov==5.0.0"
+            pip install "pytest-asyncio==0.21.1"
+            pip install "respx==0.21.1"
+      # Run pytest and generate JUnit XML report
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest -vv tests/llm_responses_api_testing --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
+          no_output_timeout: 120m
+      - run:
+          name: Rename the coverage files
+          command: |
+            mv coverage.xml llm_responses_api_coverage.xml
+            mv .coverage llm_responses_api_coverage
+      # Store test results
+      - store_test_results:
+          path: test-results
+      - persist_to_workspace:
+          root: .
+          paths:
+            - llm_responses_api_coverage.xml
+            - llm_responses_api_coverage
   litellm_mapped_tests:
     docker:
       - image: cimg/python:3.11
@@ -1234,7 +1278,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 - run:
     name: Install Grype
     command: |
@@ -1309,13 +1353,13 @@ jobs:
 command: |
   pwd
   ls
-  python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
+  python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
 no_output_timeout: 120m
 # Store test results
 - store_test_results:
     path: test-results
-e2e_openai_misc_endpoints:
+e2e_openai_endpoints:
   machine:
     image: ubuntu-2204:2023.10.1
   resource_class: xlarge
@@ -1370,7 +1414,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 # Run pytest and generate JUnit XML report
 - run:
     name: Build Docker image
@@ -1432,7 +1476,7 @@ jobs:
 command: |
   pwd
   ls
-  python -m pytest -s -vv tests/openai_misc_endpoints_tests --junitxml=test-results/junit.xml --durations=5
+  python -m pytest -s -vv tests/openai_endpoints_tests --junitxml=test-results/junit.xml --durations=5
 no_output_timeout: 120m
 # Store test results
@@ -1492,7 +1536,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 - run:
     name: Build Docker image
     command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
@@ -1921,7 +1965,7 @@ jobs:
 pip install "pytest-asyncio==0.21.1"
 pip install "google-cloud-aiplatform==1.43.0"
 pip install aiohttp
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 pip install "assemblyai==0.37.0"
 python -m pip install --upgrade pip
 pip install "pydantic==2.7.1"
@@ -1935,12 +1979,12 @@ jobs:
 pip install prisma
 pip install fastapi
 pip install jsonschema
-pip install "httpx==0.24.1"
+pip install "httpx==0.27.0"
 pip install "anyio==3.7.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
 pip install "google-cloud-aiplatform==1.59.0"
-pip install "anthropic==0.21.3"
+pip install "anthropic==0.49.0"
 # Run pytest and generate JUnit XML report
 - run:
     name: Build Docker image
@@ -2068,7 +2112,7 @@ jobs:
 python -m venv venv
 . venv/bin/activate
 pip install coverage
-coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
+coverage combine llm_translation_coverage llm_responses_api_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
 coverage xml
 - codecov/upload:
     file: ./coverage.xml
@@ -2197,7 +2241,7 @@ jobs:
 pip install "pytest-retry==1.6.3"
 pip install "pytest-asyncio==0.21.1"
 pip install aiohttp
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 python -m pip install --upgrade pip
 pip install "pydantic==2.7.1"
 pip install "pytest==7.3.1"
@@ -2387,7 +2431,7 @@ workflows:
 only:
   - main
   - /litellm_.*/
-- e2e_openai_misc_endpoints:
+- e2e_openai_endpoints:
    filters:
      branches:
        only:
@@ -2429,6 +2473,12 @@ workflows:
 only:
   - main
   - /litellm_.*/
+- llm_responses_api_testing:
+    filters:
+      branches:
+        only:
+          - main
+          - /litellm_.*/
 - litellm_mapped_tests:
    filters:
      branches:
@@ -2468,6 +2518,7 @@ workflows:
 - upload-coverage:
     requires:
       - llm_translation_testing
+      - llm_responses_api_testing
      - litellm_mapped_tests
      - batches_testing
      - litellm_utils_testing
@@ -2522,10 +2573,11 @@ workflows:
 requires:
   - local_testing
   - build_and_test
-  - e2e_openai_misc_endpoints
+  - e2e_openai_endpoints
  - load_testing
  - test_bad_database_url
  - llm_translation_testing
+ - llm_responses_api_testing
  - litellm_mapped_tests
  - batches_testing
  - litellm_utils_testing

View file

@@ -1,5 +1,5 @@
 # used by CI/CD testing
-openai==1.54.0
+openai==1.66.1
 python-dotenv
 tiktoken
 importlib_metadata

View file

@@ -6,6 +6,16 @@
 <!-- e.g. "Fixes #000" -->
+## Pre-Submission checklist
+
+**Please complete all items before asking a LiteLLM maintainer to review your PR**
+
+- [ ] I have added testing in the `tests/litellm/` directory, **Adding at least 1 test is a hard requirement** - [see details](https://docs.litellm.ai/docs/extras/contributing_code)
+- [ ] I have added a screenshot of my new test passing locally
+- [ ] My PR passes all unit tests on [`make test-unit`](https://docs.litellm.ai/docs/extras/contributing_code)
+- [ ] My PR's scope is as isolated as possible, it only solves 1 specific problem
 ## Type
 <!-- Select the type of Pull Request -->
@@ -20,10 +30,4 @@
 ## Changes
 <!-- List of changes -->
-## [REQUIRED] Testing - Attach a screenshot of any new tests passing locally
-If UI changes, send a screenshot/GIF of working UI fixes
-<!-- Test procedure -->

View file

@@ -80,7 +80,6 @@ jobs:
 permissions:
   contents: read
   packages: write
-#
 steps:
   - name: Checkout repository
     uses: actions/checkout@v4
@@ -112,7 +111,11 @@ jobs:
 with:
   context: .
   push: true
-  tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }} # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
+  tags: |
+    ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+    ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }}
+    ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+    ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm:main-stable', env.REGISTRY) || '' }}
  labels: ${{ steps.meta.outputs.labels }}
  platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
@@ -151,8 +154,12 @@ jobs:
 context: .
 file: ./docker/Dockerfile.database
 push: true
-tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
-labels: ${{ steps.meta-database.outputs.labels }}
+tags: |
+  ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+  ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-database:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-database:main-stable', env.REGISTRY) || '' }}
+labels: ${{ steps.meta-database.outputs.labels }}
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
 build-and-push-image-non_root:
@@ -190,7 +197,11 @@ jobs:
 context: .
 file: ./docker/Dockerfile.non_root
 push: true
-tags: ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.release_type }}
+tags: |
+  ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+  ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.release_type }}
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-non_root:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-non_root:main-stable', env.REGISTRY) || '' }}
 labels: ${{ steps.meta-non_root.outputs.labels }}
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
@@ -229,7 +240,11 @@ jobs:
 context: .
 file: ./litellm-js/spend-logs/Dockerfile
 push: true
-tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
+tags: |
+  ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+  ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-spend_logs:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-spend_logs:main-stable', env.REGISTRY) || '' }}
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
 build-and-push-helm-chart:

.github/workflows/helm_unit_test.yml (new file, 27 lines)
View file

@ -0,0 +1,27 @@
name: Helm unit test
on:
pull_request:
push:
branches:
- main
jobs:
unit-test:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Set up Helm 3.11.1
uses: azure/setup-helm@v1
with:
version: '3.11.1'
- name: Install Helm Unit Test Plugin
run: |
helm plugin install https://github.com/helm-unittest/helm-unittest --version v0.4.4
- name: Run unit tests
run:
helm unittest -f 'tests/*.yaml' deploy/charts/litellm-helm

Makefile (new file, 32 lines)
View file

@ -0,0 +1,32 @@
# LiteLLM Makefile
# Simple Makefile for running tests and basic development tasks
.PHONY: help test test-unit test-integration lint format
# Default target
help:
@echo "Available commands:"
@echo " make test - Run all tests"
@echo " make test-unit - Run unit tests"
@echo " make test-integration - Run integration tests"
@echo " make test-unit-helm - Run helm unit tests"
install-dev:
poetry install --with dev
lint: install-dev
poetry run pip install types-requests types-setuptools types-redis types-PyYAML
cd litellm && poetry run mypy . --ignore-missing-imports
# Testing
test:
poetry run pytest tests/
test-unit:
poetry run pytest tests/litellm/
test-integration:
poetry run pytest tests/ -k "not litellm"
test-unit-helm:
helm unittest -f 'tests/*.yaml' deploy/charts/litellm-helm

View file

@@ -40,7 +40,7 @@ LiteLLM manages:
 [**Jump to LiteLLM Proxy (LLM Gateway) Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
 [**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs)
-🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published.
+🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published. [More information about the release cycle here](https://docs.litellm.ai/docs/proxy/release_cycle)
 Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).
@@ -340,71 +340,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
 ## Contributing
-To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change.
-Here's how to modify the repo locally:
-Step 1: Clone the repo
-```
-git clone https://github.com/BerriAI/litellm.git
-```
-Step 2: Install dependencies:
-```
-pip install -r requirements.txt
-```
-Step 3: Test your change:
-a. Add a pytest test within `tests/litellm/`
-This folder follows the same directory structure as `litellm/`.
-If a corresponding test file does not exist, create one.
-b. Run the test
-```
-cd tests/litellm # pwd: Documents/litellm/litellm/tests/litellm
-pytest /path/to/test_file.py
-```
-Step 4: Submit a PR with your changes! 🚀
-- push your fork to your GitHub repo
-- submit a PR from there
-### Building LiteLLM Docker Image
-Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
-Step 1: Clone the repo
-```
-git clone https://github.com/BerriAI/litellm.git
-```
-Step 2: Build the Docker Image
-Build using Dockerfile.non_root
-```
-docker build -f docker/Dockerfile.non_root -t litellm_test_image .
-```
-Step 3: Run the Docker Image
-Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
-```
-docker run \
-    -v $(pwd)/proxy_config.yaml:/app/config.yaml \
-    -e DATABASE_URL="postgresql://xxxxxxxx" \
-    -e LITELLM_MASTER_KEY="sk-1234" \
-    -p 4000:4000 \
-    litellm_test_image \
-    --config /app/config.yaml --detailed_debug
-```
+Interested in contributing? Contributions to LiteLLM Python SDK, Proxy Server, and contributing LLM integrations are both accepted and highly encouraged! [See our Contribution Guide for more details](https://docs.litellm.ai/docs/extras/contributing_code)
 # Enterprise
 For companies that need better security, user management and professional support

View file

@@ -18,7 +18,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.4.1
+version: 0.4.2
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

View file

@@ -22,6 +22,8 @@ If `db.useStackgresOperator` is used (not yet implemented):
 | Name | Description | Value |
 | ---- | ----------- | ----- |
 | `replicaCount` | The number of LiteLLM Proxy pods to be deployed | `1` |
+| `masterkeySecretName` | The name of the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use the generated secret name. | N/A |
+| `masterkeySecretKey` | The key within the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use `masterkey` as the key. | N/A |
 | `masterkey` | The Master API Key for LiteLLM. If not specified, a random key is generated. | N/A |
 | `environmentSecrets` | An optional array of Secret object names. The keys and values in these secrets will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` |
 | `environmentConfigMaps` | An optional array of ConfigMap object names. The keys and values in these configmaps will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` |

View file

@@ -78,8 +78,8 @@ spec:
 - name: PROXY_MASTER_KEY
   valueFrom:
     secretKeyRef:
-      name: {{ include "litellm.fullname" . }}-masterkey
-      key: masterkey
+      name: {{ .Values.masterkeySecretName | default (printf "%s-masterkey" (include "litellm.fullname" .)) }}
+      key: {{ .Values.masterkeySecretKey | default "masterkey" }}
 {{- if .Values.redis.enabled }}
 - name: REDIS_HOST
   value: {{ include "litellm.redis.serviceName" . }}

View file

@@ -1,3 +1,4 @@
+{{- if not .Values.masterkeySecretName }}
 {{ $masterkey := (.Values.masterkey | default (randAlphaNum 17)) }}
 apiVersion: v1
 kind: Secret
@@ -5,4 +6,5 @@ metadata:
 name: {{ include "litellm.fullname" . }}-masterkey
 data:
   masterkey: {{ $masterkey | b64enc }}
 type: Opaque
+{{- end }}

View file

@ -0,0 +1,82 @@
suite: test deployment
templates:
- deployment.yaml
- configmap-litellm.yaml
tests:
- it: should work
template: deployment.yaml
set:
image.tag: test
asserts:
- isKind:
of: Deployment
- matchRegex:
path: metadata.name
pattern: -litellm$
- equal:
path: spec.template.spec.containers[0].image
value: ghcr.io/berriai/litellm-database:test
- it: should work with tolerations
template: deployment.yaml
set:
tolerations:
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
asserts:
- equal:
path: spec.template.spec.tolerations[0].key
value: node-role.kubernetes.io/master
- equal:
path: spec.template.spec.tolerations[0].operator
value: Exists
- it: should work with affinity
template: deployment.yaml
set:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: topology.kubernetes.io/zone
operator: In
values:
- antarctica-east1
asserts:
- equal:
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key
value: topology.kubernetes.io/zone
- equal:
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator
value: In
- equal:
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0]
value: antarctica-east1
- it: should work without masterkeySecretName or masterkeySecretKey
template: deployment.yaml
set:
masterkeySecretName: ""
masterkeySecretKey: ""
asserts:
- contains:
path: spec.template.spec.containers[0].env
content:
name: PROXY_MASTER_KEY
valueFrom:
secretKeyRef:
name: RELEASE-NAME-litellm-masterkey
key: masterkey
- it: should work with masterkeySecretName and masterkeySecretKey
template: deployment.yaml
set:
masterkeySecretName: my-secret
masterkeySecretKey: my-key
asserts:
- contains:
path: spec.template.spec.containers[0].env
content:
name: PROXY_MASTER_KEY
valueFrom:
secretKeyRef:
name: my-secret
key: my-key

View file

@ -0,0 +1,18 @@
suite: test masterkey secret
templates:
- secret-masterkey.yaml
tests:
- it: should create a secret if masterkeySecretName is not set
template: secret-masterkey.yaml
set:
masterkeySecretName: ""
asserts:
- isKind:
of: Secret
- it: should not create a secret if masterkeySecretName is set
template: secret-masterkey.yaml
set:
masterkeySecretName: my-secret
asserts:
- hasDocuments:
count: 0

View file

@@ -75,6 +75,12 @@ ingress:
 # masterkey: changeit
+# if set, use this secret for the master key; otherwise, autogenerate a new one
+masterkeySecretName: ""
+# if set, use this secret key for the master key; otherwise, use the default key
+masterkeySecretKey: ""
 # The elements within proxy_config are rendered as config.yaml for the proxy
 # Examples: https://github.com/BerriAI/litellm/tree/main/litellm/proxy/example_config_yaml
 # Reference: https://docs.litellm.ai/docs/proxy/configs

View file

@@ -20,10 +20,18 @@ services:
 STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
 env_file:
   - .env # Load local .env file
+depends_on:
+  - db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
+healthcheck: # Defines the health check configuration for the container
+  test: [ "CMD", "curl", "-f", "http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
+  interval: 30s # Perform health check every 30 seconds
+  timeout: 10s # Health check command times out after 10 seconds
+  retries: 3 # Retry up to 3 times if health check fails
+  start_period: 40s # Wait 40 seconds after container start before beginning health checks
 db:
-  image: postgres
+  image: postgres:16
   restart: always
   environment:
     POSTGRES_DB: litellm
@@ -31,6 +39,8 @@ services:
   POSTGRES_PASSWORD: dbpassword9090
   ports:
     - "5432:5432"
+  volumes:
+    - postgres_data:/var/lib/postgresql/data # Persists Postgres data across container restarts
   healthcheck:
     test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
     interval: 1s
@@ -53,6 +63,8 @@ services:
 volumes:
   prometheus_data:
     driver: local
+  postgres_data:
+    name: litellm_postgres_data # Named volume for Postgres data persistence
 # ...rest of your docker-compose config if any

View file

@ -0,0 +1,92 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# /v1/messages [BETA]
LiteLLM provides a BETA endpoint in the spec of Anthropic's `/v1/messages` endpoint.
This currently just supports the Anthropic API.
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ✅ | |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ✅ | |
| Streaming | ✅ | |
| Fallbacks | ✅ | between anthropic models |
| Loadbalancing | ✅ | between anthropic models |
Planned improvements:
- Vertex AI Anthropic support
- Bedrock Anthropic support
## Usage
<Tabs>
<TabItem label="PROXY" value="proxy">
1. Setup config.yaml
```yaml
model_list:
- model_name: anthropic-claude
litellm_params:
model: claude-3-7-sonnet-latest
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
-H 'content-type: application/json' \
-H 'x-api-key: $LITELLM_API_KEY' \
-H 'anthropic-version: 2023-06-01' \
-d '{
"model": "anthropic-claude",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "List 5 important events in the XIX century"
}
]
}
],
"max_tokens": 4096
}'
```
</TabItem>
<TabItem value="sdk" label="SDK">
```python
from litellm.llms.anthropic.experimental_pass_through.messages.handler import anthropic_messages
import asyncio
import os
# set env
os.environ["ANTHROPIC_API_KEY"] = "my-api-key"
messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
# Call the handler
async def call():
    response = await anthropic_messages(
        messages=messages,
        api_key=os.environ["ANTHROPIC_API_KEY"],
        model="claude-3-haiku-20240307",
        max_tokens=100,
    )
    print(response)

asyncio.run(call())
```
</TabItem>
</Tabs>

View file

@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# Assistants API
+# /assistants
 Covers Threads, Messages, Assistants.

View file

@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# [BETA] Batches API
+# /batches
 Covers Batches, Files

View file

@@ -3,7 +3,13 @@ import TabItem from '@theme/TabItem';
 # Prompt Caching
-For OpenAI + Anthropic + Deepseek, LiteLLM follows the OpenAI prompt caching usage object format:
+Supported Providers:
+- OpenAI (`openai/`)
+- Anthropic API (`anthropic/`)
+- Bedrock (`bedrock/`, `bedrock/invoke/`, `bedrock/converse`) ([All models bedrock supports prompt caching on](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html))
+- Deepseek API (`deepseek/`)
+
+For the supported providers, LiteLLM follows the OpenAI prompt caching usage object format:
 ```bash
 "usage": {
@@ -499,4 +505,4 @@ curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \
 </TabItem>
 </Tabs>
 This checks our maintained [model info/cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)

View file

@ -189,4 +189,138 @@ Expected Response
```
</TabItem>
</Tabs>
## Explicitly specify image type
If you have images without a mime-type, or if litellm is incorrectly inferring the mime type of your image (e.g. calling `gs://` urls with vertex ai), you can set this explicitly via the `format` param.
```python
"image_url": {
"url": "gs://my-gs-image",
"format": "image/jpeg"
}
```
LiteLLM will use this for any API endpoint that supports specifying the mime-type (e.g. anthropic/bedrock/vertex ai).
For others (e.g. openai), it will be ignored.
<Tabs>
<TabItem label="SDK" value="sdk">
```python
import os
from litellm import completion
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
# openai call
response = completion(
model = "claude-3-7-sonnet-latest",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"format": "image/jpeg"
}
}
]
}
],
)
```
</TabItem>
<TabItem label="PROXY" value="proxy">
1. Define vision models on config.yaml
```yaml
model_list:
- model_name: gpt-4-vision-preview # OpenAI gpt-4-vision-preview
litellm_params:
model: openai/gpt-4-vision-preview
api_key: os.environ/OPENAI_API_KEY
- model_name: llava-hf # Custom OpenAI compatible model
litellm_params:
model: openai/llava-hf/llava-v1.6-vicuna-7b-hf
api_base: http://localhost:8000
api_key: fake-key
model_info:
supports_vision: True # set supports_vision to True so /model/info returns this attribute as True
```
2. Run proxy server
```bash
litellm --config config.yaml
```
3. Test it using the OpenAI Python SDK
```python
import os
from openai import OpenAI
client = OpenAI(
api_key="sk-1234", # your litellm proxy api key
)
response = client.chat.completions.create(
model = "gpt-4-vision-preview", # use model="llava-hf" to test your custom OpenAI endpoint
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"format": "image/jpeg"
}
}
]
}
],
)
```
</TabItem>
</Tabs>
## Spec
```
"image_url": str
OR
"image_url": {
"url": "url OR base64 encoded str",
"detail": "openai-only param",
"format": "specify mime-type of image"
}
```

View file

@@ -46,7 +46,7 @@ For security inquiries, please contact us at support@berri.ai
 |-------------------|---------------------------------------------------------------|
 | SOC 2 Type I | Certified. Report available upon request on Enterprise plan. |
 | SOC 2 Type II | In progress. Certificate available by April 15th, 2025 |
-| ISO27001 | In progress. Certificate available by February 7th, 2025 |
+| ISO 27001 | Certified. Report available upon request on Enterprise |
 ## Supported Data Regions for LiteLLM Cloud
@@ -137,7 +137,7 @@ Point of contact email address for general security-related questions: krrish@be
 Has the Vendor been audited / certified?
 - SOC 2 Type I. Certified. Report available upon request on Enterprise plan.
 - SOC 2 Type II. In progress. Certificate available by April 15th, 2025.
-- ISO27001. In progress. Certificate available by February 7th, 2025.
+- ISO 27001. Certified. Report available upon request on Enterprise plan.
 Has an information security management system been implemented?
 - Yes - [CodeQL](https://codeql.github.com/) and a comprehensive ISMS covering multiple security domains.

View file

@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# Embeddings
+# /embeddings
 ## Quick Start
 ```python

View file

@@ -34,9 +34,9 @@ You can use our cloud product where we setup a dedicated instance for you.
 Professional Support can assist with LLM/Provider integrations, deployment, upgrade management, and LLM Provider troubleshooting. We can't solve your own infrastructure-related issues but we will guide you to fix them.
-- 1 hour for Sev0 issues
-- 6 hours for Sev1
-- 24h for Sev2-Sev3 between 7am - 7pm PT (Monday through Saturday)
+- 1 hour for Sev0 issues - 100% production traffic is failing
+- 6 hours for Sev1 - <100% production traffic is failing
+- 24h for Sev2-Sev3 between 7am - 7pm PT (Monday through Saturday) - setup issues e.g. Redis working on our end, but not on your infrastructure.
 - 72h SLA for patching vulnerabilities in the software.
 **We can offer custom SLAs** based on your needs and the severity of the issue

View file

@ -0,0 +1,106 @@
# Contributing Code
## **Checklist before submitting a PR**
Here are the core requirements for any PR submitted to LiteLLM
- [ ] Add testing, **Adding at least 1 test is a hard requirement** - [see details](#2-adding-testing-to-your-pr)
- [ ] Ensure your PR passes the following tests:
- [ ] [Unit Tests](#3-running-unit-tests)
- [ ] [Formatting / Linting Tests](#35-running-linting-tests)
- [ ] Keep scope as isolated as possible. As a general rule, your changes should address 1 specific problem at a time
## Quick start
## 1. Setup your local dev environment
Here's how to modify the repo locally:
Step 1: Clone the repo
```shell
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Install dev dependencies:
```shell
poetry install --with dev --extras proxy
```
That's it, your local dev environment is ready!
## 2. Adding Testing to your PR
- Add your test to the [`tests/litellm/` directory](https://github.com/BerriAI/litellm/tree/main/tests/litellm)
- This directory 1:1 maps the `litellm/` directory, and can only contain mocked tests.
- Do not add real llm api calls to this directory.
### 2.1 File Naming Convention for `tests/litellm/`
The `tests/litellm/` directory follows the same directory structure as `litellm/`.
- `tests/litellm/proxy/test_caching_routes.py` maps to `litellm/proxy/caching_routes.py`
- `test_{filename}.py` maps to `litellm/{filename}.py`
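For illustration, here is a minimal sketch of what a test under this convention might look like. The file path is hypothetical and the call to `litellm.token_counter` is just an example of logic that runs locally without a real LLM API call; adapt both to the module you are actually changing.

```python
# hypothetical path: tests/litellm/test_token_counting.py
# (under this convention it would map to a litellm/ module of the same name)
import litellm


def test_token_counter_returns_positive_count():
    # runs entirely locally - no real LLM API call is made
    messages = [{"role": "user", "content": "Hello, world"}]
    count = litellm.token_counter(model="gpt-3.5-turbo", messages=messages)
    assert count > 0
```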
## 3. Running Unit Tests
run the following command on the root of the litellm directory
```shell
make test-unit
```
## 3.5 Running Linting Tests
run the following command on the root of the litellm directory
```shell
make lint
```
LiteLLM uses mypy for linting. On ci/cd we also run `black` for formatting.
## 4. Submit a PR with your changes!
- push your fork to your GitHub repo
- submit a PR from there
## Advanced
### Building LiteLLM Docker Image
Some people might want to build the LiteLLM docker image themselves. Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
Step 1: Clone the repo
```shell
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Build the Docker Image
Build using Dockerfile.non_root
```shell
docker build -f docker/Dockerfile.non_root -t litellm_test_image .
```
Step 3: Run the Docker Image
Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
```shell
docker run \
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
-e DATABASE_URL="postgresql://xxxxxxxx" \
-e LITELLM_MASTER_KEY="sk-1234" \
-p 4000:4000 \
litellm_test_image \
--config /app/config.yaml --detailed_debug
```

View file

@@ -2,7 +2,7 @@
 import TabItem from '@theme/TabItem';
 import Tabs from '@theme/Tabs';
-# Files API
+# /files
 Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.

View file

@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# [Beta] Fine-tuning API
+# /fine_tuning
 :::info
View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Moderation # /moderations
### Usage ### Usage

View file

@@ -78,6 +78,9 @@ Following are the allowed fields in metadata, their types, and their descriptions:
 * `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
 * `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
 * `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
+* `tags: Optional[list]` - This is a list of tags. This is useful for segmenting inference calls by tags.
+* `user_feedback: Optional[str]` - The end user's feedback.
+* `model_options: Optional[dict]` - This is a dictionary of model options. This is useful for getting insights into how model behavior affects your end users.
 * `custom_attributes: Optional[dict]` - This is a dictionary of custom attributes. This is useful for additional information about the inference.
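As a rough sketch of how these fields could be passed from the SDK with the Athina callback enabled (the model name and all field values below are placeholders, not part of this diff):

```python
import os

import litellm
from litellm import completion

os.environ["ATHINA_API_KEY"] = "your-athina-api-key"
os.environ["OPENAI_API_KEY"] = "your-openai-api-key"

litellm.success_callback = ["athina"]  # log successful calls to Athina

response = completion(
    model="gpt-4o-mini",  # placeholder model
    messages=[{"role": "user", "content": "What is machine learning?"}],
    metadata={
        "user_query": "What is machine learning?",
        "tags": ["docs-example", "faq"],          # segment calls by tags
        "user_feedback": "thumbs_up",             # the end user's feedback
        "model_options": {"temperature": 0.2},    # model options used for this call
        "custom_attributes": {"experiment": "A"},
    },
)
print(response.choices[0].message.content)
```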
## Using a self hosted deployment of Athina

View file

@ -0,0 +1,5 @@
PDL - A YAML-based approach to prompt programming
Github: https://github.com/IBM/prompt-declaration-language
PDL is a declarative approach to prompt programming, helping users to accumulate messages implicitly, with support for model chaining and tool use.

View file

@ -0,0 +1,9 @@
# pgai
[pgai](https://github.com/timescale/pgai) is a suite of tools to develop RAG, semantic search, and other AI applications more easily with PostgreSQL.
If you don't know what pgai is yet check out the [README](https://github.com/timescale/pgai)!
If you're already familiar with pgai, you can find litellm specific docs here:
- Litellm for [model calling](https://github.com/timescale/pgai/blob/main/docs/model_calling/litellm.md) in pgai
- Use the [litellm provider](https://github.com/timescale/pgai/blob/main/docs/vectorizer/api-reference.md#aiembedding_litellm) to automatically create embeddings for your data via the pgai vectorizer.

File diff suppressed because it is too large.

View file

@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
 # Infinity
 | Property | Details |
@@ -12,6 +15,9 @@
 ```python
 from litellm import rerank
+import os
+
+os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
 response = rerank(
     model="infinity/rerank",
@@ -65,3 +71,114 @@ curl http://0.0.0.0:4000/rerank \
 ```
## Supported Cohere Rerank API Params
| Param | Type | Description |
|-------|-------|-------|
| `query` | `str` | The query to rerank the documents against |
| `documents` | `list[str]` | The documents to rerank |
| `top_n` | `int` | The number of documents to return |
| `return_documents` | `bool` | Whether to return the documents in the response |
### Usage - Return Documents
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = rerank(
model="infinity/rerank",
query="What is the capital of France?",
documents=["Paris", "London", "Berlin", "Madrid"],
return_documents=True,
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/rerank \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-rerank",
"query": "What is the capital of France?",
"documents": [
"Paris",
"London",
"Berlin",
"Madrid"
],
"return_documents": true
}'
```
</TabItem>
</Tabs>
## Pass Provider-specific Params
Any unmapped params will be passed to the provider as-is.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import rerank
import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
response = rerank(
model="infinity/rerank",
query="What is the capital of France?",
documents=["Paris", "London", "Berlin", "Madrid"],
raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: custom-infinity-rerank
litellm_params:
model: infinity/rerank
api_base: https://localhost:8080
raw_scores: True # 👈 EITHER SET PROVIDER-SPECIFIC PARAMS HERE OR IN REQUEST BODY
```
2. Start litellm
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl http://0.0.0.0:4000/rerank \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-rerank",
"query": "What is the capital of the United States?",
"documents": [
"Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. is the capital of the United States.",
"Capital punishment has existed in the United States since before it was a country."
],
"raw_scores": True # 👈 PROVIDER-SPECIFIC PARAM
}'
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,90 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Snowflake
| Property | Details |
|-------|-------|
| Description | The Snowflake Cortex LLM REST API lets you access the COMPLETE function via HTTP POST requests|
| Provider Route on LiteLLM | `snowflake/` |
| Link to Provider Doc | [Snowflake ↗](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-llm-rest-api) |
| Base URL | [https://{account-id}.snowflakecomputing.com/api/v2/cortex/inference:complete/](https://{account-id}.snowflakecomputing.com/api/v2/cortex/inference:complete) |
| Supported OpenAI Endpoints | `/chat/completions`, `/completions` |
Currently, Snowflake's REST API does not have an endpoint for `snowflake-arctic-embed` embedding models. If you want to use these embedding models with Litellm, you can call them through our Hugging Face provider.
Find the Arctic Embed models [here](https://huggingface.co/collections/Snowflake/arctic-embed-661fd57d50fab5fc314e4c18) on Hugging Face.
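As a rough sketch of that workaround (not part of this diff), an Arctic Embed model might be called through the Hugging Face provider route like below; the exact model id, the `huggingface/` routing prefix behavior, and the credential env var are assumptions to verify against the Hugging Face provider docs.

```python
import os

from litellm import embedding

os.environ["HUGGINGFACE_API_KEY"] = "hf_..."  # assumption: Hugging Face inference credentials

# assumption: the Arctic Embed model is served via Hugging Face inference and
# routed with the `huggingface/` prefix; swap in the variant you actually deploy
response = embedding(
    model="huggingface/Snowflake/snowflake-arctic-embed-m",
    input=["Snowflake Cortex does not expose an embeddings endpoint yet"],
)
print(response)
```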
## Supported OpenAI Parameters
```
"temperature",
"max_tokens",
"top_p",
"response_format"
```
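To make the list above concrete, here is a minimal sketch of passing those parameters through the SDK; the JWT and account identifier values are placeholders.

```python
import os

from litellm import completion

os.environ["SNOWFLAKE_JWT"] = "YOUR JWT"
os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER"

# temperature, max_tokens and top_p are passed through as standard OpenAI params
response = completion(
    model="snowflake/mistral-7b",
    messages=[{"role": "user", "content": "Summarize what Snowflake Cortex is in one sentence."}],
    temperature=0.2,
    max_tokens=128,
    top_p=0.9,
)
print(response.choices[0].message.content)
```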
## API KEYS
Snowflake does not have API keys. Instead, you access the Snowflake API with your JWT token and account identifier.
```python
import os
os.environ["SNOWFLAKE_JWT"] = "YOUR JWT"
os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER"
```
## Usage
```python
import os
from litellm import completion
## set ENV variables
os.environ["SNOWFLAKE_JWT"] = "YOUR JWT"
os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER"
# Snowflake call
response = completion(
model="snowflake/mistral-7b",
messages = [{ "content": "Hello, how are you?","role": "user"}]
)
```
## Usage with LiteLLM Proxy
#### 1. Required env variables
```bash
export SNOWFLAKE_JWT=""
export SNOWFLAKE_ACCOUNT_ID=""
```
#### 2. Start the proxy
```yaml
model_list:
- model_name: mistral-7b
litellm_params:
model: snowflake/mistral-7b
api_key: YOUR_API_KEY
api_base: https://YOUR-ACCOUNT-ID.snowflakecomputing.com/api/v2/cortex/inference:complete
```
```bash
litellm --config /path/to/config.yaml
```
#### 3. Test it
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "snowflake/mistral-7b",
"messages": [
{
"role": "user",
"content": "Hello, how are you?"
}
]
}
'
```

View file

@@ -404,14 +404,16 @@ curl http://localhost:4000/v1/chat/completions \
 If this was your initial VertexAI Grounding code,
 ```python
 import vertexai
+from vertexai.generative_models import GenerativeModel, GenerationConfig, Tool, grounding
 vertexai.init(project=project_id, location="us-central1")
 model = GenerativeModel("gemini-1.5-flash-001")
 # Use Google Search for grounding
-tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval(disable_attributon=False))
+tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval())
 prompt = "When is the next total solar eclipse in US?"
 response = model.generate_content(
@@ -428,7 +430,7 @@ print(response)
 then, this is what it looks like now
 ```python
 from litellm import completion
+# !gcloud auth application-default login - run this to add vertex credentials to your env
@@ -852,6 +854,7 @@ litellm.vertex_location = "us-central1 # Your Location
 | claude-3-5-sonnet@20240620 | `completion('vertex_ai/claude-3-5-sonnet@20240620', messages)` |
 | claude-3-sonnet@20240229 | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
 | claude-3-haiku@20240307 | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |
+| claude-3-7-sonnet@20250219 | `completion('vertex_ai/claude-3-7-sonnet@20250219', messages)` |
 ### Usage
@@ -926,6 +929,119 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 </Tabs>
### Usage - `thinking` / `reasoning_content`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
resp = completion(
model="vertex_ai/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
- model_name: claude-3-7-sonnet-20250219
litellm_params:
model: vertex_ai/claude-3-7-sonnet-20250219
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "claude-3-7-sonnet-20250219",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='claude-3-7-sonnet-20250219',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
provider_specific_fields={
'citations': None,
'thinking_blocks': [
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6...'
}
]
}
),
thinking_blocks=[
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6AGB...'
}
],
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
 ## Llama 3 API
 | Model Name | Function Call |
@@ -1572,6 +1688,14 @@ assert isinstance(
 Pass any file supported by Vertex AI, through LiteLLM.
+LiteLLM supports the following image types passed in url:
+```
+Images with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
+Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
+Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4
+Base64 Encoded Local Images
+```
 <Tabs>
 <TabItem value="sdk" label="SDK">

View file

@ -157,6 +157,98 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
</TabItem>
</Tabs>
## Send Video URL to VLLM
Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)
There are two ways to send a video url to VLLM:
1. Pass the video url directly
```
{"type": "video_url", "video_url": {"url": video_url}},
```
2. Pass the video data as base64
```
{"type": "video_url", "video_url": {"url": f"data:video/mp4;base64,{video_data_base64}"}}
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
response = completion(
model="hosted_vllm/qwen", # pass the vllm model name
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Summarize the following video"
},
{
"type": "video_url",
"video_url": {
"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
}
}
]
}
],
api_base="https://hosted-vllm-api.co")
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: my-model
litellm_params:
model: hosted_vllm/qwen # add hosted_vllm/ prefix to route as OpenAI provider
api_base: https://hosted-vllm-api.co # add api base for OpenAI compatible provider
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl -X POST http://0.0.0.0:4000/chat/completions \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "my-model",
"messages": [
{"role": "user", "content":
[
{"type": "text", "text": "Summarize the following video"},
{"type": "video_url", "video_url": {"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
]
}
]
}'
```
</TabItem>
</Tabs>
## (Deprecated) for `vllm pip package` ## (Deprecated) for `vllm pip package`
### Using - `litellm.completion` ### Using - `litellm.completion`

View file

@ -10,17 +10,13 @@ Role-based access control (RBAC) is based on Organizations, Teams and Internal U
## Roles ## Roles
**Admin Roles** | Role Type | Role Name | Permissions |
- `proxy_admin`: admin over the platform |-----------|-----------|-------------|
- `proxy_admin_viewer`: can login, view all keys, view all spend. **Cannot** create keys/delete keys/add new users | **Admin** | `proxy_admin` | Admin over the platform |
| | `proxy_admin_viewer` | Can login, view all keys, view all spend. **Cannot** create keys/delete keys/add new users |
**Organization Roles** | **Organization** | `org_admin` | Admin over the organization. Can create teams and users within their organization |
- `org_admin`: admin over the organization. Can create teams and users within their organization | **Internal User** | `internal_user` | Can login, view/create/delete their own keys, view their spend. **Cannot** add new users |
| | `internal_user_viewer` | Can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users |
**Internal User Roles**
- `internal_user`: can login, view/create/delete their own keys, view their spend. **Cannot** add new users.
- `internal_user_viewer`: can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users.
## Onboarding Organizations ## Onboarding Organizations

View file

@ -36,7 +36,7 @@ import TabItem from '@theme/TabItem';
- Virtual Key Rate Limit - Virtual Key Rate Limit
- User Rate Limit - User Rate Limit
- Team Limit - Team Limit
- The `_PROXY_track_cost_callback` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172) - The `_ProxyDBLogger` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172)
## Frequently Asked Questions ## Frequently Asked Questions

View file

@ -499,6 +499,7 @@ router_settings:
| SMTP_USERNAME | Username for SMTP authentication (do not set if SMTP does not require auth) | SMTP_USERNAME | Username for SMTP authentication (do not set if SMTP does not require auth)
| SPEND_LOGS_URL | URL for retrieving spend logs | SPEND_LOGS_URL | URL for retrieving spend logs
| SSL_CERTIFICATE | Path to the SSL certificate file | SSL_CERTIFICATE | Path to the SSL certificate file
| SSL_SECURITY_LEVEL | [BETA] Security level for SSL/TLS connections. E.g. `DEFAULT@SECLEVEL=1`
| SSL_VERIFY | Flag to enable or disable SSL certificate verification | SSL_VERIFY | Flag to enable or disable SSL certificate verification
| SUPABASE_KEY | API key for Supabase service | SUPABASE_KEY | API key for Supabase service
| SUPABASE_URL | Base URL for Supabase instance | SUPABASE_URL | Base URL for Supabase instance

View file

@ -448,6 +448,34 @@ model_list:
s/o to [@David Manouchehri](https://www.linkedin.com/in/davidmanouchehri/) for helping with this. s/o to [@David Manouchehri](https://www.linkedin.com/in/davidmanouchehri/) for helping with this.
### Centralized Credential Management
Define credentials once and reuse them across multiple models. This helps with:
- Secret rotation
- Reducing config duplication
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: azure/gpt-4o
litellm_credential_name: default_azure_credential # Reference credential below
credential_list:
- credential_name: default_azure_credential
credential_values:
api_key: os.environ/AZURE_API_KEY # Load from environment
api_base: os.environ/AZURE_API_BASE
api_version: "2023-05-15"
credential_info:
description: "Production credentials for EU region"
```
#### Key Parameters
- `credential_name`: Unique identifier for the credential set
- `credential_values`: Key-value pairs of credentials/secrets (supports `os.environ/` syntax)
- `credential_info`: Optional key-value pairs describing the credential (e.g. a description). No specific keys are required, but the dictionary must exist.
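Because credentials are referenced by name, multiple deployments can share one credential set. A sketch (the second model name is illustrative):
```yaml
model_list:
  - model_name: gpt-4o
    litellm_params:
      model: azure/gpt-4o
      litellm_credential_name: default_azure_credential
  - model_name: gpt-4o-mini   # illustrative second deployment re-using the same credential
    litellm_params:
      model: azure/gpt-4o-mini
      litellm_credential_name: default_azure_credential

credential_list:
  - credential_name: default_azure_credential
    credential_values:
      api_key: os.environ/AZURE_API_KEY
      api_base: os.environ/AZURE_API_BASE
      api_version: "2023-05-15"
    credential_info:
      description: "Production credentials for EU region"
```
Rotating the secret now only requires updating `AZURE_API_KEY` in one place.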
### Load API Keys from Secret Managers (Azure Vault, etc) ### Load API Keys from Secret Managers (Azure Vault, etc)
[**Using Secret Managers with LiteLLM Proxy**](../secret) [**Using Secret Managers with LiteLLM Proxy**](../secret)
@ -641,4 +669,4 @@ docker run --name litellm-proxy \
ghcr.io/berriai/litellm-database:main-latest ghcr.io/berriai/litellm-database:main-latest
``` ```
</TabItem> </TabItem>
</Tabs> </Tabs>

View file

@ -46,18 +46,17 @@ You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/ma
| Table Name | Description | Row Insert Frequency | | Table Name | Description | Row Insert Frequency |
|------------|-------------|---------------------| |------------|-------------|---------------------|
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request** | | LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request - Success or Failure** |
| LiteLLM_ErrorLogs | Captures failed requests and errors. Stores exception details and request information. Helps with debugging and monitoring. | **Medium - on errors only** |
| LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** | | LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** |
## Disable `LiteLLM_SpendLogs` & `LiteLLM_ErrorLogs` ## Disable `LiteLLM_SpendLogs`
You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` on the `general_settings` section of your proxy_config.yaml file. You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` on the `general_settings` section of your proxy_config.yaml file.
```yaml ```yaml
general_settings: general_settings:
disable_spend_logs: True # Disable writing spend logs to DB disable_spend_logs: True # Disable writing spend logs to DB
disable_error_logs: True # Disable writing error logs to DB disable_error_logs: True # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
``` ```
### What is the impact of disabling these logs? ### What is the impact of disabling these logs?

View file

@ -37,7 +37,7 @@ guardrails:
- guardrail_name: aim-protected-app - guardrail_name: aim-protected-app
litellm_params: litellm_params:
guardrail: aim guardrail: aim
mode: pre_call # 'during_call' is also available mode: [pre_call, post_call] # "During_call" is also available
api_key: os.environ/AIM_API_KEY api_key: os.environ/AIM_API_KEY
api_base: os.environ/AIM_API_BASE # Optional, use only when using a self-hosted Aim Outpost api_base: os.environ/AIM_API_BASE # Optional, use only when using a self-hosted Aim Outpost
``` ```

View file

@ -78,6 +78,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
| `api_base` | `Optional[str]` | Optional API base URL | | `api_base` | `Optional[str]` | Optional API base URL |
| `response_cost` | `Optional[str]` | Optional response cost | | `response_cost` | `Optional[str]` | Optional response cost |
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers | | `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
## StandardLoggingModelInformation ## StandardLoggingModelInformation

View file

@ -0,0 +1,53 @@
# Rotating Master Key
Here are our recommended steps for rotating your master key.
**1. Backup your DB**
If anything goes wrong during the encryption/decryption process, a backup lets you revert to the current state without issues.
**2. Call `/key/regenerate` with the new master key**
```bash
curl -L -X POST 'http://localhost:4000/key/regenerate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"key": "sk-1234",
"new_master_key": "sk-PIp1h0RekR"
}'
```
This will re-encrypt the credentials for any models in your Proxy_ModelTable with the new master key.
Until you complete Step 3, expect to see decryption errors in the logs, since the old master key can no longer decrypt the re-encrypted values.
```bash
raise Exception("Unable to decrypt value={}".format(v))
Exception: Unable to decrypt value=<new-encrypted-value>
```
**3. Update LITELLM_MASTER_KEY**
In your environment variables, update the value of `LITELLM_MASTER_KEY` to the `new_master_key` from Step 2.
This ensures the proxy uses the new key when decrypting values from the DB.
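For example, if you pass the key via environment variables, this is just (using the `new_master_key` from the Step 2 example):
```bash
export LITELLM_MASTER_KEY="sk-PIp1h0RekR"
```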
**4. Test it**
Make a test request to a model stored on the proxy, using a LiteLLM key (the new master key or a virtual key), and confirm it works.
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4o-mini", # 👈 REPLACE with 'public model name' for any db-model
"messages": [
{
"content": "Hey, how's it going",
"role": "user"
}
  ]
}'
```

View file

@ -107,9 +107,9 @@ general_settings:
By default, LiteLLM writes several types of logs to the database: By default, LiteLLM writes several types of logs to the database:
- Every LLM API request to the `LiteLLM_SpendLogs` table - Every LLM API request to the `LiteLLM_SpendLogs` table
- LLM Exceptions to the `LiteLLM_LogsErrors` table - LLM Exceptions to the `LiteLLM_SpendLogs` table
If you're not viewing these logs on the LiteLLM UI (most users use Prometheus for monitoring), you can disable them by setting the following flags to `True`: If you're not viewing these logs on the LiteLLM UI, you can disable them by setting the following flags to `True`:
```yaml ```yaml
general_settings: general_settings:

View file

@ -0,0 +1,12 @@
# Release Cycle
LiteLLM Proxy has the following release cycle:
- `v1.x.x-nightly`: Releases that pass CI/CD.
- `v1.x.x.rc`: Releases that pass CI/CD + [manual review](https://github.com/BerriAI/litellm/discussions/8495#discussioncomment-12180711).
- `v1.x.x` OR `v1.x.x-stable`: Releases that pass CI/CD + manual review + 3 days of production testing.
In production, we recommend using the latest `v1.x.x` release.
Follow our release notes [here](https://github.com/BerriAI/litellm/releases).
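For example, a production deployment would typically pin a specific stable image tag (the tag shown is illustrative):
```bash
docker pull ghcr.io/berriai/litellm:main-v1.63.11-stable
```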

View file

@ -102,7 +102,19 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
</TabItem> </TabItem>
</Tabs> </Tabs>
## Advanced - Set Accepted JWT Scope Names ## Advanced
### Multiple OIDC providers
Use this if you want LiteLLM to validate your JWT against multiple OIDC providers (e.g. Google Cloud, GitHub Auth)
Set `JWT_PUBLIC_KEY_URL` in your environment to a comma-separated list of URLs for your OIDC providers.
```bash
export JWT_PUBLIC_KEY_URL="https://demo.duendesoftware.com/.well-known/openid-configuration/jwks,https://accounts.google.com/.well-known/openid-configuration/jwks"
```
### Set Accepted JWT Scope Names
Change the string in JWT 'scopes', that litellm evaluates to see if a user has admin access. Change the string in JWT 'scopes', that litellm evaluates to see if a user has admin access.
@ -114,7 +126,7 @@ general_settings:
admin_jwt_scope: "litellm-proxy-admin" admin_jwt_scope: "litellm-proxy-admin"
``` ```
## Tracking End-Users / Internal Users / Team / Org ### Tracking End-Users / Internal Users / Team / Org
Set the field in the jwt token, which corresponds to a litellm user / team / org. Set the field in the jwt token, which corresponds to a litellm user / team / org.
@ -156,7 +168,7 @@ scope: ["litellm-proxy-admin",...]
scope: "litellm-proxy-admin ..." scope: "litellm-proxy-admin ..."
``` ```
## Control model access with Teams ### Control model access with Teams
1. Specify the JWT field that contains the team ids, that the user belongs to. 1. Specify the JWT field that contains the team ids, that the user belongs to.
@ -207,11 +219,11 @@ OIDC Auth for API: [**See Walkthrough**](https://www.loom.com/share/00fe2deab59a
- If all checks pass, allow the request - If all checks pass, allow the request
## Advanced - Custom Validate ### Custom JWT Validate
Validate a JWT Token using custom logic, if you need an extra way to verify if tokens are valid for LiteLLM Proxy. Validate a JWT Token using custom logic, if you need an extra way to verify if tokens are valid for LiteLLM Proxy.
### 1. Setup custom validate function #### 1. Setup custom validate function
```python ```python
from typing import Literal from typing import Literal
@ -230,7 +242,7 @@ def my_custom_validate(token: str) -> Literal[True]:
return True return True
``` ```
### 2. Setup config.yaml #### 2. Setup config.yaml
```yaml ```yaml
general_settings: general_settings:
@ -243,7 +255,7 @@ general_settings:
custom_validate: custom_validate.my_custom_validate # 👈 custom validate function custom_validate: custom_validate.my_custom_validate # 👈 custom validate function
``` ```
### 3. Test the flow #### 3. Test the flow
**Expected JWT** **Expected JWT**
@ -265,7 +277,7 @@ general_settings:
## Advanced - Allowed Routes ### Allowed Routes
Configure which routes a JWT can access via the config. Configure which routes a JWT can access via the config.
@ -297,7 +309,7 @@ general_settings:
team_allowed_routes: ["/v1/chat/completions"] # 👈 Set accepted routes team_allowed_routes: ["/v1/chat/completions"] # 👈 Set accepted routes
``` ```
## Advanced - Caching Public Keys ### Caching Public Keys
Control how long public keys are cached for (in seconds). Control how long public keys are cached for (in seconds).
@ -311,7 +323,7 @@ general_settings:
public_key_ttl: 600 # 👈 KEY CHANGE public_key_ttl: 600 # 👈 KEY CHANGE
``` ```
## Advanced - Custom JWT Field ### Custom JWT Field
Set a custom field in which the team_id exists. By default, the 'client_id' field is checked. Set a custom field in which the team_id exists. By default, the 'client_id' field is checked.
@ -323,14 +335,7 @@ general_settings:
team_id_jwt_field: "client_id" # 👈 KEY CHANGE team_id_jwt_field: "client_id" # 👈 KEY CHANGE
``` ```
## All Params ### Block Teams
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)
## Advanced - Block Teams
To block all requests for a certain team id, use `/team/block` To block all requests for a certain team id, use `/team/block`
@ -357,7 +362,7 @@ curl --location 'http://0.0.0.0:4000/team/unblock' \
``` ```
## Advanced - Upsert Users + Allowed Email Domains ### Upsert Users + Allowed Email Domains
Allow users who belong to a specific email domain, automatic access to the proxy. Allow users who belong to a specific email domain, automatic access to the proxy.
@ -494,4 +499,10 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
} }
] ]
}' }'
``` ```
## All JWT Params
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)

View file

@ -0,0 +1,55 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Adding LLM Credentials
You can add LLM provider credentials on the UI. Once you add credentials, you can re-use them when adding new models.
## Add a credential + model
### 1. Navigate to LLM Credentials page
Go to Models -> LLM Credentials -> Add Credential
<Image img={require('../../img/ui_cred_add.png')} />
### 2. Add credentials
Select your LLM provider, enter your API Key and click "Add Credential"
**Note: Credential fields depend on the provider. If you select Vertex AI, you will see `Vertex Project`, `Vertex Location` and `Vertex Credentials` fields.**
<Image img={require('../../img/ui_add_cred_2.png')} />
### 3. Use credentials when adding a model
Go to Add Model -> Existing Credentials -> Select your credential in the dropdown
<Image img={require('../../img/ui_cred_3.png')} />
## Create a Credential from an existing model
Use this if you have already created a model and want to store the model credentials for future use
### 1. Select model to create a credential from
Go to Models -> Select your model -> Credential -> Create Credential
<Image img={require('../../img/ui_cred_4.png')} />
### 2. Use new credential when adding a model
Go to Add Model -> Existing Credentials -> Select your credential in the dropdown
<Image img={require('../../img/use_model_cred.png')} />
## Frequently Asked Questions
How are credentials stored?
Credentials in the DB are encrypted/decrypted using `LITELLM_SALT_KEY`, if set. If not, then they are encrypted using `LITELLM_MASTER_KEY`. These keys should be kept secret and not shared with others.
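For example, a deployment might set both keys via environment variables (the values below are placeholders):
```bash
# LITELLM_SALT_KEY encrypts/decrypts provider credentials stored in the DB.
# If it is not set, LITELLM_MASTER_KEY is used for encryption instead.
export LITELLM_SALT_KEY="sk-salt-<random-string>"
export LITELLM_MASTER_KEY="sk-1234"
```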

View file

@ -0,0 +1,55 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# UI Logs Page
View Spend, Token Usage, Key, Team Name for Each Request to LiteLLM
<Image img={require('../../img/ui_request_logs.png')}/>
## Overview
| Log Type | Tracked by Default |
|----------|-------------------|
| Success Logs | ✅ Yes |
| Error Logs | ✅ Yes |
| Request/Response Content Stored | ❌ No by Default, **opt in with `store_prompts_in_spend_logs`** |
**By default LiteLLM does not track the request and response content.**
## Tracking - Request / Response Content in Logs Page
If you want to view request and response content on LiteLLM Logs, you need to opt in with this setting
```yaml
general_settings:
store_prompts_in_spend_logs: true
```
<Image img={require('../../img/ui_request_logs_content.png')}/>
## Stop storing Error Logs in DB
If you do not want to store error logs in DB, you can opt out with this setting
```yaml
general_settings:
disable_error_logs: True # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
```
## Stop storing Spend Logs in DB
If you do not want to store spend logs in DB, you can opt out with this setting
```yaml
general_settings:
disable_spend_logs: True # Disable writing spend logs to DB
```

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Realtime Endpoints # /realtime
Use this to loadbalance across Azure + OpenAI. Use this to loadbalance across Azure + OpenAI.

View file

@ -3,11 +3,20 @@ import TabItem from '@theme/TabItem';
# 'Thinking' / 'Reasoning Content' # 'Thinking' / 'Reasoning Content'
:::info
Requires LiteLLM v1.63.0+
:::
Supported Providers: Supported Providers:
- Deepseek (`deepseek/`) - Deepseek (`deepseek/`)
- Anthropic API (`anthropic/`) - Anthropic API (`anthropic/`)
- Bedrock (Anthropic) (`bedrock/`) - Bedrock (Anthropic + Deepseek) (`bedrock/`)
- Vertex AI (Anthropic) (`vertexai/`) - Vertex AI (Anthropic) (`vertexai/`)
- OpenRouter (`openrouter/`)
LiteLLM will standardize the `reasoning_content` in the response and `thinking_blocks` in the assistant message.
```python ```python
"message": { "message": {
@ -17,7 +26,7 @@ Supported Providers:
{ {
"type": "thinking", "type": "thinking",
"thinking": "The capital of France is Paris.", "thinking": "The capital of France is Paris.",
"signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." "signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..."
} }
] ]
} }
@ -95,13 +104,263 @@ curl http://0.0.0.0:4000/v1/chat/completions \
} }
``` ```
## Tool Calling with `thinking`
Here's how to use `thinking` blocks by Anthropic with tool calling.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import json
import litellm

def get_current_weather(location, unit="fahrenheit"):
    # stub for this example - return a JSON string the way a real weather helper would
    return json.dumps({"location": location, "temperature": "72", "unit": unit or "fahrenheit"})

litellm._turn_on_debug()
litellm.modify_params = True
model = "anthropic/claude-3-7-sonnet-20250219" # works across Anthropic, Bedrock, Vertex AI
# Step 1: send the conversation and available functions to the model
messages = [
{
"role": "user",
"content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses",
}
]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
]
response = litellm.completion(
model=model,
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
thinking={"type": "enabled", "budget_tokens": 1024},
)
print("Response\n", response)
response_message = response.choices[0].message
tool_calls = response_message.tool_calls
print("Expecting there to be 3 tool calls")
assert (
len(tool_calls) > 0
) # this has to call the function for SF, Tokyo and paris
# Step 2: check if the model wanted to call a function
print(f"tool_calls: {tool_calls}")
if tool_calls:
# Step 3: call the function
# Note: the JSON response may not always be valid; be sure to handle errors
available_functions = {
"get_current_weather": get_current_weather,
} # only one function in this example, but you can have multiple
messages.append(
response_message
) # extend conversation with assistant's reply
print("Response message\n", response_message)
# Step 4: send the info for each function call and function response to the model
for tool_call in tool_calls:
function_name = tool_call.function.name
if function_name not in available_functions:
            # the model called a function that does not exist in available_functions - skip it
            continue
function_to_call = available_functions[function_name]
function_args = json.loads(tool_call.function.arguments)
function_response = function_to_call(
location=function_args.get("location"),
unit=function_args.get("unit"),
)
messages.append(
{
"tool_call_id": tool_call.id,
"role": "tool",
"name": function_name,
"content": function_response,
}
) # extend conversation with function response
print(f"messages: {messages}")
second_response = litellm.completion(
model=model,
messages=messages,
seed=22,
# tools=tools,
drop_params=True,
thinking={"type": "enabled", "budget_tokens": 1024},
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: claude-3-7-sonnet-thinking
litellm_params:
model: anthropic/claude-3-7-sonnet-20250219
api_key: os.environ/ANTHROPIC_API_KEY
thinking: {
"type": "enabled",
"budget_tokens": 1024
}
```
2. Run proxy
```bash
litellm --config config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Make 1st call
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "claude-3-7-sonnet-thinking",
"messages": [
{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"},
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
],
"tool_choice": "auto"
}'
```
4. Make 2nd call with tool call results
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "claude-3-7-sonnet-thinking",
"messages": [
{
"role": "user",
"content": "What\'s the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"
},
{
"role": "assistant",
"content": "I\'ll check the current weather for these three cities for you:",
"tool_calls": [
{
"index": 2,
"function": {
"arguments": "{\"location\": \"San Francisco\"}",
"name": "get_current_weather"
},
"id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
"type": "function"
}
],
"function_call": null,
"reasoning_content": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c="
}
],
"provider_specific_fields": {
"reasoningContentBlocks": [
{
"reasoningText": {
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c=",
"text": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user."
}
}
]
}
},
{
"tool_call_id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
"role": "tool",
"name": "get_current_weather",
"content": "{\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": \"fahrenheit\"}"
}
]
}'
```
</TabItem>
</Tabs>
## Switching between Anthropic + Deepseek models
Set `drop_params=True` to drop the 'thinking' blocks when swapping from Anthropic to Deepseek models. Suggest improvements to this approach [here](https://github.com/BerriAI/litellm/discussions/8927).
```python
litellm.drop_params = True # 👈 EITHER GLOBALLY or per request
# or per request
## Anthropic
response = litellm.completion(
model="anthropic/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
drop_params=True,
)
## Deepseek
response = litellm.completion(
model="deepseek/deepseek-chat",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
drop_params=True,
)
```
## Spec ## Spec
These fields can be accessed via `response.choices[0].message.reasoning_content` and `response.choices[0].message.thinking_blocks`. These fields can be accessed via `response.choices[0].message.reasoning_content` and `response.choices[0].message.thinking_blocks`.
- `reasoning_content` - str: The reasoning content from the model. Returned across all providers. - `reasoning_content` - str: The reasoning content from the model. Returned across all providers.
- `thinking_blocks` - Optional[List[Dict[str, str]]]: A list of thinking blocks from the model. Only returned for Anthropic models. - `thinking_blocks` - Optional[List[Dict[str, str]]]: A list of thinking blocks from the model. Only returned for Anthropic models.
- `type` - str: The type of thinking block. - `type` - str: The type of thinking block.
- `thinking` - str: The thinking from the model. - `thinking` - str: The thinking from the model.
- `signature_delta` - str: The signature delta from the model. - `signature` - str: The signature from the model.
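A minimal sketch of reading these fields with the LiteLLM SDK (assumes `ANTHROPIC_API_KEY` is set; the model and `thinking` param match the examples above):
```python
import litellm

response = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)

message = response.choices[0].message
print(message.reasoning_content)              # standardized reasoning string
for block in message.thinking_blocks or []:   # only returned for Anthropic models
    print(block["type"], block["signature"])
```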

View file

@ -1,4 +1,4 @@
# Rerank # /rerank
:::tip :::tip

View file

@ -0,0 +1,117 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# /responses [Beta]
LiteLLM provides a BETA endpoint in the spec of [OpenAI's `/responses` API](https://platform.openai.com/docs/api-reference/responses)
| Feature | Supported | Notes |
|---------|-----------|--------|
| Cost Tracking | ✅ | Works with all supported models |
| Logging | ✅ | Works across all integrations |
| End-user Tracking | ✅ | |
| Streaming | ✅ | |
| Fallbacks | ✅ | Works between supported models |
| Loadbalancing | ✅ | Works between supported models |
| Supported LiteLLM Versions | 1.63.8+ | |
| Supported LLM providers | `openai` | |
## Usage
## Create a model response
<Tabs>
<TabItem value="litellm-sdk" label="LiteLLM SDK">
#### Non-streaming
```python
import litellm
# Non-streaming response
response = litellm.responses(
model="gpt-4o",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python
import litellm
# Streaming response
response = litellm.responses(
model="gpt-4o",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
First, add this to your litellm proxy config.yaml:
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
Start your LiteLLM proxy:
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
Then use the OpenAI SDK pointed to your proxy:
#### Non-streaming
```python
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="gpt-4o",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="gpt-4o",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
</Tabs>

View file

@ -830,7 +830,7 @@ asyncio.run(router_acompletion())
Set `weight` on a deployment to pick one deployment more often than others. Set `weight` on a deployment to pick one deployment more often than others.
This works across **ALL** routing strategies. This works across **simple-shuffle** routing strategy (this is the default, if no routing strategy is selected).
<Tabs> <Tabs>
<TabItem value="sdk" label="SDK"> <TabItem value="sdk" label="SDK">
@ -952,8 +952,8 @@ router_settings:
``` ```
Defaults: Defaults:
- allowed_fails: 0 - allowed_fails: 3
- cooldown_time: 60s - cooldown_time: 5s (`DEFAULT_COOLDOWN_TIME_SECONDS` in constants.py)
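For reference, the defaults above correspond to the following `router_settings` (a sketch; set these explicitly to override them):
```yaml
router_settings:
  allowed_fails: 3   # failures tolerated before a deployment is cooled down
  cooldown_time: 5   # seconds the deployment stays cooled down
```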
**Set Per Model** **Set Per Model**

View file

@ -96,6 +96,33 @@ litellm --config /path/to/config.yaml
``` ```
#### Using K/V pairs in 1 AWS Secret
You can read multiple keys from a single AWS Secret using the `primary_secret_name` parameter:
```yaml
general_settings:
key_management_system: "aws_secret_manager"
key_management_settings:
hosted_keys: [
"OPENAI_API_KEY_MODEL_1",
"OPENAI_API_KEY_MODEL_2",
]
primary_secret_name: "litellm_secrets" # 👈 Read multiple keys from one JSON secret
```
The `primary_secret_name` allows you to read multiple keys from a single AWS Secret as a JSON object. For example, the "litellm_secrets" would contain:
```json
{
"OPENAI_API_KEY_MODEL_1": "sk-key1...",
"OPENAI_API_KEY_MODEL_2": "sk-key2..."
}
```
This reduces the number of AWS Secrets you need to manage.
## Hashicorp Vault ## Hashicorp Vault
@ -353,4 +380,7 @@ general_settings:
# Hosted Keys Settings # Hosted Keys Settings
hosted_keys: ["litellm_master_key"] # OPTIONAL. Specify which env keys you stored on AWS hosted_keys: ["litellm_master_key"] # OPTIONAL. Specify which env keys you stored on AWS
# K/V pairs in 1 AWS Secret Settings
primary_secret_name: "litellm_secrets" # OPTIONAL. Read multiple keys from one JSON secret on AWS Secret Manager
``` ```

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Text Completion # /completions
### Usage ### Usage
<Tabs> <Tabs>

View file

@ -2,9 +2,9 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Use LiteLLM AI Gateway with Aporia Guardrails # Aporia Guardrails with LiteLLM Gateway
In this tutorial we will use LiteLLM Proxy with Aporia to detect PII in requests and profanity in responses In this tutorial we will use LiteLLM AI Gateway with Aporia to detect PII in requests and profanity in responses
## 1. Setup guardrails on Aporia ## 1. Setup guardrails on Aporia

View file

@ -0,0 +1,103 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# OpenWeb UI with LiteLLM
This guide walks you through connecting OpenWeb UI to LiteLLM. Using LiteLLM with OpenWeb UI allows teams to
- Access 100+ LLMs on OpenWeb UI
- Track Spend / Usage, Set Budget Limits
- Send Request/Response Logs to logging destinations like langfuse, s3, gcs buckets, etc.
- Set access controls, e.g. control which models OpenWebUI can access.
## Quickstart
- Make sure to setup LiteLLM with the [LiteLLM Getting Started Guide](https://docs.litellm.ai/docs/proxy/docker_quick_start)
## 1. Start LiteLLM & OpenWebUI
- OpenWebUI starts running on [http://localhost:3000](http://localhost:3000)
- LiteLLM starts running on [http://localhost:4000](http://localhost:4000)
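If you haven't started either service yet, a minimal local setup looks roughly like this (image tags, the master key and the config path are illustrative; see the getting started guides above for full options):
```bash
# LiteLLM Proxy on port 4000
docker run -d -p 4000:4000 \
  -e LITELLM_MASTER_KEY="sk-1234" \
  -v $(pwd)/config.yaml:/app/config.yaml \
  ghcr.io/berriai/litellm:main-latest --config /app/config.yaml

# OpenWebUI on port 3000
docker run -d -p 3000:8080 ghcr.io/open-webui/open-webui:main
```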
## 2. Create a Virtual Key on LiteLLM
Virtual Keys are API Keys that allow you to authenticate to LiteLLM Proxy. We will create a Virtual Key that will allow OpenWebUI to access LiteLLM.
### 2.1 LiteLLM User Management Hierarchy
On LiteLLM, you can create Organizations, Teams, Users and Virtual Keys. For this tutorial, we will create a Team and a Virtual Key.
- `Organization` - An Organization is a group of Teams. (US Engineering, EU Developer Tools)
- `Team` - A Team is a group of Users. (OpenWeb UI Team, Data Science Team, etc.)
- `User` - A User is an individual user (employee, developer, eg. `krrish@litellm.ai`)
- `Virtual Key` - A Virtual Key is an API Key that allows you to authenticate to LiteLLM Proxy. A Virtual Key is associated with a User or Team.
Once the Team is created, you can invite Users to the Team. You can read more about LiteLLM's User Management [here](https://docs.litellm.ai/docs/proxy/user_management_heirarchy).
### 2.2 Create a Team on LiteLLM
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new team.
<Image img={require('../../img/litellm_create_team.gif')} />
### 2.3 Create a Virtual Key on LiteLLM
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new virtual Key.
LiteLLM allows you to specify what models are available on OpenWeb UI (by specifying the models the key will have access to).
<Image img={require('../../img/create_key_in_team_oweb.gif')} />
## 3. Connect OpenWeb UI to LiteLLM
On OpenWeb UI, navigate to Settings -> Connections and create a new connection to LiteLLM
Enter the following details:
- URL: `http://localhost:4000` (your litellm proxy base url)
- Key: `your-virtual-key` (the key you created in the previous step)
<Image img={require('../../img/litellm_setup_openweb.gif')} />
### 3.1 Test Request
In the top left corner, select a model. You should only see the models you gave the key access to in Step 2.
Once you have selected a model, enter your message content and click `Submit`.
<Image img={require('../../img/basic_litellm.gif')} />
### 3.2 Tracking Spend / Usage
After your request is made, navigate to `Logs` on the LiteLLM UI. There you can see the Team, Key, Model, Usage and Cost for each request.
<!-- <Image img={require('../../img/litellm_logs_openweb.gif')} /> -->
## Render `thinking` content on OpenWeb UI
OpenWebUI requires reasoning/thinking content to be rendered with `<think></think>` tags. In order to render this for specific models, you can use the `merge_reasoning_content_in_choices` litellm parameter.
Example litellm config.yaml:
```yaml
model_list:
- model_name: thinking-anthropic-claude-3-7-sonnet
litellm_params:
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
thinking: {"type": "enabled", "budget_tokens": 1024}
max_tokens: 1080
merge_reasoning_content_in_choices: true
```
### Test it on OpenWeb UI
On the models dropdown select `thinking-anthropic-claude-3-7-sonnet`
<Image img={require('../../img/litellm_thinking_openweb.gif')} />

View file

@ -44,7 +44,7 @@ const config = {
path: './release_notes', path: './release_notes',
routeBasePath: 'release_notes', routeBasePath: 'release_notes',
blogTitle: 'Release Notes', blogTitle: 'Release Notes',
blogSidebarTitle: 'All Releases', blogSidebarTitle: 'Releases',
blogSidebarCount: 'ALL', blogSidebarCount: 'ALL',
postsPerPage: 'ALL', postsPerPage: 'ALL',
showReadingTime: false, showReadingTime: false,

[18 binary image/GIF assets added (UI screenshots and release-note images referenced above); contents not shown]
View file

@ -706,12 +706,13 @@
} }
}, },
"node_modules/@babel/helpers": { "node_modules/@babel/helpers": {
"version": "7.26.0", "version": "7.26.10",
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.0.tgz", "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.10.tgz",
"integrity": "sha512-tbhNuIxNcVb21pInl3ZSjksLCvgdZy9KwJ8brv993QtIVKJBBkYXz4q4ZbAv31GdnC+R90np23L5FbEBlthAEw==", "integrity": "sha512-UPYc3SauzZ3JGgj87GgZ89JVdC5dj0AoetR5Bw6wj4niittNyFh6+eOGonYvJ1ao6B8lEa3Q3klS7ADZ53bc5g==",
"license": "MIT",
"dependencies": { "dependencies": {
"@babel/template": "^7.25.9", "@babel/template": "^7.26.9",
"@babel/types": "^7.26.0" "@babel/types": "^7.26.10"
}, },
"engines": { "engines": {
"node": ">=6.9.0" "node": ">=6.9.0"
@ -796,11 +797,12 @@
} }
}, },
"node_modules/@babel/parser": { "node_modules/@babel/parser": {
"version": "7.26.3", "version": "7.26.10",
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.3.tgz", "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.10.tgz",
"integrity": "sha512-WJ/CvmY8Mea8iDXo6a7RK2wbmJITT5fN3BEkRuFlxVyNx8jOKIIhmC4fSkTcPcf8JyavbBwIe6OpiCOBXt/IcA==", "integrity": "sha512-6aQR2zGE/QFi8JpDLjUZEPYOs7+mhKXm86VaKFiLP35JQwQb6bwUE+XbvkH0EptsYhbNBSUGaUBLKqxH1xSgsA==",
"license": "MIT",
"dependencies": { "dependencies": {
"@babel/types": "^7.26.3" "@babel/types": "^7.26.10"
}, },
"bin": { "bin": {
"parser": "bin/babel-parser.js" "parser": "bin/babel-parser.js"
@ -2157,9 +2159,10 @@
} }
}, },
"node_modules/@babel/runtime-corejs3": { "node_modules/@babel/runtime-corejs3": {
"version": "7.26.0", "version": "7.26.10",
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.26.0.tgz", "resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.26.10.tgz",
"integrity": "sha512-YXHu5lN8kJCb1LOb9PgV6pvak43X2h4HvRApcN5SdWeaItQOzfn1hgP6jasD6KWQyJDBxrVmA9o9OivlnNJK/w==", "integrity": "sha512-uITFQYO68pMEYR46AHgQoyBg7KPPJDAbGn4jUTIRgCFJIp88MIBUianVOplhZDEec07bp9zIyr4Kp0FCyQzmWg==",
"license": "MIT",
"dependencies": { "dependencies": {
"core-js-pure": "^3.30.2", "core-js-pure": "^3.30.2",
"regenerator-runtime": "^0.14.0" "regenerator-runtime": "^0.14.0"
@ -2169,13 +2172,14 @@
} }
}, },
"node_modules/@babel/template": { "node_modules/@babel/template": {
"version": "7.25.9", "version": "7.26.9",
"resolved": "https://registry.npmjs.org/@babel/template/-/template-7.25.9.tgz", "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.26.9.tgz",
"integrity": "sha512-9DGttpmPvIxBb/2uwpVo3dqJ+O6RooAFOS+lB+xDqoE2PVCE8nfoHMdZLpfCQRLwvohzXISPZcgxt80xLfsuwg==", "integrity": "sha512-qyRplbeIpNZhmzOysF/wFMuP9sctmh2cFzRAZOn1YapxBsE1i9bJIY586R/WBLfLcmcBlM8ROBiQURnnNy+zfA==",
"license": "MIT",
"dependencies": { "dependencies": {
"@babel/code-frame": "^7.25.9", "@babel/code-frame": "^7.26.2",
"@babel/parser": "^7.25.9", "@babel/parser": "^7.26.9",
"@babel/types": "^7.25.9" "@babel/types": "^7.26.9"
}, },
"engines": { "engines": {
"node": ">=6.9.0" "node": ">=6.9.0"
@ -2199,9 +2203,10 @@
} }
}, },
"node_modules/@babel/types": { "node_modules/@babel/types": {
"version": "7.26.3", "version": "7.26.10",
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.3.tgz", "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.10.tgz",
"integrity": "sha512-vN5p+1kl59GVKMvTHt55NzzmYVxprfJD+ql7U9NFIfKCBkYE55LYtS+WtPlaYOyzydrKI8Nezd+aZextrd+FMA==", "integrity": "sha512-emqcG3vHrpxUKTrxcblR36dcrcoRDvKmnL/dCL6ZsHaShW80qxCAcNhzQZrpeM765VzEos+xOi4s+r4IXzTwdQ==",
"license": "MIT",
"dependencies": { "dependencies": {
"@babel/helper-string-parser": "^7.25.9", "@babel/helper-string-parser": "^7.25.9",
"@babel/helper-validator-identifier": "^7.25.9" "@babel/helper-validator-identifier": "^7.25.9"

View file

@ -18,13 +18,6 @@ hide_table_of_contents: false
`alerting`, `prometheus`, `secret management`, `management endpoints`, `ui`, `prompt management`, `finetuning`, `batch` `alerting`, `prometheus`, `secret management`, `management endpoints`, `ui`, `prompt management`, `finetuning`, `batch`
:::note
v1.57.8-stable, is currently being tested. It will be released on 2025-01-12.
:::
## New / Updated Models ## New / Updated Models
1. Mistral large pricing - https://github.com/BerriAI/litellm/pull/7452 1. Mistral large pricing - https://github.com/BerriAI/litellm/pull/7452

View file

@ -0,0 +1,103 @@
---
title: v1.61.20-stable
slug: v1.61.20-stable
date: 2025-03-01T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
tags: [llm translation, rerank, ui, thinking, reasoning_content, claude-3-7-sonnet]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
# v1.61.20-stable
These are the changes since `v1.61.13-stable`.
This release is primarily focused on:
- LLM Translation improvements (claude-3-7-sonnet + 'thinking'/'reasoning_content' support)
- UI improvements (add model flow, user management, etc)
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
1. Anthropic 3-7 sonnet support + cost tracking (Anthropic API + Bedrock + Vertex AI + OpenRouter)
1. Anthropic API [Start here](https://docs.litellm.ai/docs/providers/anthropic#usage---thinking--reasoning_content)
2. Bedrock API [Start here](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
3. Vertex AI API [See here](../../docs/providers/vertex#usage---thinking--reasoning_content)
4. OpenRouter [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L5626)
2. Gpt-4.5-preview support + cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L79)
3. Azure AI - Phi-4 cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L1773)
4. Claude-3.5-sonnet - vision support updated on Anthropic API [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2888)
5. Bedrock llama vision support [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L7714)
6. Cerebras llama3.3-70b pricing [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2697)
## LLM Translation
1. Infinity Rerank - support returning documents when return_documents=True [Start here](../../docs/providers/infinity#usage---returning-documents)
2. Amazon Deepseek - `<think>` param extraction into reasoning_content [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-imported-models-deepseek-deepseek-r1)
3. Amazon Titan Embeddings - filter out aws_ params from request body [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-embedding)
4. Anthropic thinking + reasoning_content translation support (Anthropic API, Bedrock, Vertex AI) [Start here](https://docs.litellm.ai/docs/reasoning_content)
5. VLLM - support video_url [Start here](../../docs/providers/vllm#send-video-url-to-vllm)
6. Call proxy via litellm SDK: Support `litellm_proxy/` for embedding, image_generation, transcription, speech, rerank [Start here](https://docs.litellm.ai/docs/providers/litellm_proxy)
7. OpenAI Pass-through - allow using Assistants GET, DELETE on /openai pass through routes [Start here](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
8. Message Translation - fix openai message for assistant msg if role is missing - openai allows this
9. O1/O3 - support drop_params for o3-mini and o1 parallel_tool_calls param (not supported currently) [See here](https://docs.litellm.ai/docs/completion/drop_params)
## Spend Tracking Improvements
1. Cost tracking for rerank via Bedrock [See PR](https://github.com/BerriAI/litellm/commit/b682dc4ec8fd07acf2f4c981d2721e36ae2a49c5)
2. Anthropic pass-through - fix race condition causing cost to not be tracked [See PR](https://github.com/BerriAI/litellm/pull/8874)
3. Anthropic pass-through: Ensure accurate token counting [See PR](https://github.com/BerriAI/litellm/pull/8880)
## Management Endpoints / UI
1. Models Page - Allow sorting models by created at
2. Models Page - Edit Model Flow Improvements
3. Models Page - Fix Adding Azure, Azure AI Studio models on UI
4. Internal Users Page - Allow Bulk Adding Internal Users on UI
5. Internal Users Page - Allow sorting users by created at
6. Virtual Keys Page - Allow searching for UserIDs on the dropdown when assigning a user to a team [See PR](https://github.com/BerriAI/litellm/pull/8844)
7. Virtual Keys Page - allow creating a user when assigning keys to users [See PR](https://github.com/BerriAI/litellm/pull/8844)
8. Model Hub Page - fix text overflow issue [See PR](https://github.com/BerriAI/litellm/pull/8749)
9. Admin Settings Page - Allow adding MSFT SSO on UI
10. Backend - don't allow creating duplicate internal users in DB
## Helm
1. support ttlSecondsAfterFinished on the migration job - [See PR](https://github.com/BerriAI/litellm/pull/8593)
2. enhance migrations job with additional configurable properties - [See PR](https://github.com/BerriAI/litellm/pull/8636)
## Logging / Guardrail Integrations
1. Arize Phoenix support
2. No-log - fix no-log param support on embedding calls
## Performance / Loadbalancing / Reliability improvements
1. Single Deployment Cooldown logic - Use allowed_fails or allowed_fail_policy if set [Start here](https://docs.litellm.ai/docs/routing#advanced-custom-retries-cooldowns-based-on-error-type)
## General Proxy Improvements
1. Hypercorn - fix reading / parsing request body
2. Windows - fix running proxy in windows
3. DD-Trace - fix dd-trace enablement on proxy
## Complete Git Diff
View the complete git diff [here](https://github.com/BerriAI/litellm/compare/v1.61.13-stable...v1.61.20-stable).

View file

@ -0,0 +1,40 @@
---
title: v1.63.0 - Anthropic 'thinking' response update
slug: v1.63.0
date: 2025-03-05T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
hide_table_of_contents: false
---
v1.63.0 fixes Anthropic 'thinking' response on streaming to return the `signature` block. [Github Issue](https://github.com/BerriAI/litellm/issues/8964)
It also moves the response structure from `signature_delta` to `signature` to be the same as Anthropic. [Anthropic Docs](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#implementing-extended-thinking)
## Diff
```bash
"message": {
...
"reasoning_content": "The capital of France is Paris.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The capital of France is Paris.",
- "signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 OLD FORMAT
+ "signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 KEY CHANGE
}
]
}
```

View file

@ -0,0 +1,180 @@
---
title: v1.63.11-stable
slug: v1.63.11-stable
date: 2025-03-15T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: [credential management, thinking content, responses api, snowflake]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
These are the changes since `v1.63.2-stable`.
This release is primarily focused on:
- [Beta] Responses API Support
- Snowflake Cortex Support, Amazon Nova Image Generation
- UI - Credential Management, re-use credentials when adding new models
- UI - Test Connection to LLM Provider before adding a model
:::info
This release will be live on 03/16/2025
:::
<!-- <Image img={require('../../img/release_notes/v16311_release.jpg')} /> -->
## Known Issues
- 🚨 Known issue on Azure OpenAI - We don't recommend upgrading if you use Azure OpenAI. This version failed our Azure OpenAI load test
## Docker Run LiteLLM Proxy
```
docker run \
-e STORE_MODEL_IN_DB=True \
-p 4000:4000 \
ghcr.io/berriai/litellm:main-v1.63.11-stable
```
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
- Image Generation support for Amazon Nova Canvas [Getting Started](https://docs.litellm.ai/docs/providers/bedrock#image-generation)
- Add pricing for Jamba new models [PR](https://github.com/BerriAI/litellm/pull/9032/files)
- Add pricing for Amazon EU models [PR](https://github.com/BerriAI/litellm/pull/9056/files)
- Add Bedrock Deepseek R1 model pricing [PR](https://github.com/BerriAI/litellm/pull/9108/files)
- Update Gemini pricing: Gemma 3, Flash 2 thinking update, LearnLM [PR](https://github.com/BerriAI/litellm/pull/9190/files)
- Mark Cohere Embedding 3 models as Multimodal [PR](https://github.com/BerriAI/litellm/pull/9176/commits/c9a576ce4221fc6e50dc47cdf64ab62736c9da41)
- Add Azure Data Zone pricing [PR](https://github.com/BerriAI/litellm/pull/9185/files#diff-19ad91c53996e178c1921cbacadf6f3bae20cfe062bd03ee6bfffb72f847ee37)
- LiteLLM Tracks cost for `azure/eu` and `azure/us` models
## LLM Translation
<Image img={require('../../img/release_notes/responses_api.png')} />
1. **New Endpoints**
- [Beta] POST `/responses` API. [Getting Started](https://docs.litellm.ai/docs/response_api)
2. **New LLM Providers**
- Snowflake Cortex [Getting Started](https://docs.litellm.ai/docs/providers/snowflake)
3. **New LLM Features**
- Support OpenRouter `reasoning_content` on streaming [Getting Started](https://docs.litellm.ai/docs/reasoning_content)
4. **Bug Fixes**
- OpenAI: Return `code`, `param` and `type` on bad request error [More information on litellm exceptions](https://docs.litellm.ai/docs/exception_mapping)
- Bedrock: Fix converse chunk parsing to only return empty dict on tool use [PR](https://github.com/BerriAI/litellm/pull/9166)
- Bedrock: Support extra_headers [PR](https://github.com/BerriAI/litellm/pull/9113)
- Azure: Fix Function Calling Bug & Update Default API Version to `2025-02-01-preview` [PR](https://github.com/BerriAI/litellm/pull/9191)
- Azure: Fix AI services URL [PR](https://github.com/BerriAI/litellm/pull/9185)
- Vertex AI: Handle HTTP 201 status code in response [PR](https://github.com/BerriAI/litellm/pull/9193)
- Perplexity: Fix incorrect streaming response [PR](https://github.com/BerriAI/litellm/pull/9081)
- Triton: Fix streaming completions bug [PR](https://github.com/BerriAI/litellm/pull/8386)
- Deepgram: Support bytes.IO when handling audio files for transcription [PR](https://github.com/BerriAI/litellm/pull/9071)
- Ollama: Fix "system" role has become unacceptable [PR](https://github.com/BerriAI/litellm/pull/9261)
- All Providers (Streaming): Fix String `data:` stripped from entire content in streamed responses [PR](https://github.com/BerriAI/litellm/pull/9070)
## Spend Tracking Improvements
1. Support Bedrock converse cache token tracking [Getting Started](https://docs.litellm.ai/docs/completion/prompt_caching)
2. Cost Tracking for Responses API [Getting Started](https://docs.litellm.ai/docs/response_api)
3. Fix Azure Whisper cost tracking [Getting Started](https://docs.litellm.ai/docs/audio_transcription)
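To illustrate item 1, here is a hedged sketch of a Bedrock Converse call that uses prompt caching. The `cache_control` message format, the Bedrock model id, and the usage field names are assumptions drawn from the prompt-caching docs linked above, not from this release note.

```python
import litellm

# Sketch: mark a large system prompt as cacheable on Bedrock Converse,
# then inspect the returned usage block, where cache read/write tokens
# are now tracked for cost purposes.
response = litellm.completion(
    model="bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0",  # assumed model id
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a support agent. <long policy document here>",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        {"role": "user", "content": "How do I rotate my API key?"},
    ],
)

# Cached-token counts surface on the usage object (exact field names
# may differ; see the prompt_caching docs).
print(response.usage)
```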
## UI
### Re-Use Credentials on UI
You can now onboard LLM provider credentials on the LiteLLM UI. Once these credentials are added, you can re-use them when adding new models [Getting Started](https://docs.litellm.ai/docs/proxy/ui_credentials)
<Image img={require('../../img/release_notes/credentials.jpg')} />
### Test Connections before adding models
Before adding a model you can test the connection to the LLM provider to verify you have set up your API Base + API Key correctly
<Image img={require('../../img/release_notes/litellm_test_connection.gif')} />
### General UI Improvements
1. Add Models Page
- Allow adding Cerebras, Sambanova, Perplexity, Fireworks, OpenRouter, TogetherAI, and Text-Completion OpenAI models on the Admin UI
- Allow adding EU OpenAI models
- Fix: Instantly show edit + deletes to models
2. Keys Page
- Fix: Instantly show newly created keys on Admin UI (don't require refresh)
- Fix: Allow clicking into Top Keys when showing users Top API Key
- Fix: Allow filtering keys by Team Alias, Key Alias and Org
- UI Improvements: Show 100 Keys Per Page, Use full height, increase width of key alias
3. Users Page
- Fix: Show correct count of internal user keys on Users Page
- Fix: Metadata not updating in Team UI
4. Logs Page
- UI Improvements: Keep expanded log in focus on LiteLLM UI
- UI Improvements: Minor improvements to logs page
- Fix: Allow internal user to query their own logs
- Allow switching off storing Error Logs in DB [Getting Started](https://docs.litellm.ai/docs/proxy/ui_logs)
5. Sign In/Sign Out
- Fix: Correctly use `PROXY_LOGOUT_URL` when set [Getting Started](https://docs.litellm.ai/docs/proxy/self_serve#setting-custom-logout-urls)
## Security
1. Support for Rotating Master Keys [Getting Started](https://docs.litellm.ai/docs/proxy/master_key_rotations)
2. Fix: Internal User Viewer Permissions, don't allow `internal_user_viewer` role to see `Test Key Page` or `Create Key Button` [More information on role based access controls](https://docs.litellm.ai/docs/proxy/access_control)
3. Emit audit logs on All user + model Create/Update/Delete endpoints [Getting Started](https://docs.litellm.ai/docs/proxy/multiple_admins)
4. JWT
- Support multiple JWT OIDC providers [Getting Started](https://docs.litellm.ai/docs/proxy/token_auth)
- Fix JWT access with Groups not working when team is assigned All Proxy Models access
5. Using K/V pairs in 1 AWS Secret [Getting Started](https://docs.litellm.ai/docs/secret#using-kv-pairs-in-1-aws-secret)
## Logging Integrations
1. Prometheus: Track Azure LLM API latency metric [Getting Started](https://docs.litellm.ai/docs/proxy/prometheus#request-latency-metrics)
2. Athina: Added tags, user_feedback and model_options to additional_keys which can be sent to Athina [Getting Started](https://docs.litellm.ai/docs/observability/athina_integration)
## Performance / Reliability improvements
1. Redis + litellm router - Fix Redis cluster mode for litellm router [PR](https://github.com/BerriAI/litellm/pull/9010)
## General Improvements
1. OpenWebUI Integration - display `thinking` tokens
- Guide on getting started with LiteLLM x OpenWebUI. [Getting Started](https://docs.litellm.ai/docs/tutorials/openweb_ui)
- Display `thinking` tokens on OpenWebUI (Bedrock, Anthropic, Deepseek) [Getting Started](https://docs.litellm.ai/docs/tutorials/openweb_ui#render-thinking-content-on-openweb-ui)
<Image img={require('../../img/litellm_thinking_openweb.gif')} />
## Complete Git Diff
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.63.2-stable...v1.63.11-stable)

View file

@ -0,0 +1,112 @@
---
title: v1.63.2-stable
slug: v1.63.2-stable
date: 2025-03-08T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
These are the changes since `v1.61.20-stable`.
This release is primarily focused on:
- LLM Translation improvements (more `thinking` content improvements)
- UI improvements (Error logs now shown on UI)
:::info
This release will be live on 03/09/2025
:::
<Image img={require('../../img/release_notes/v1632_release.jpg')} />
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
1. Add `supports_pdf_input` for specific Bedrock Claude models (see the sketch after this list) [PR](https://github.com/BerriAI/litellm/commit/f63cf0030679fe1a43d03fb196e815a0f28dae92)
2. Add pricing for Amazon `eu` models [PR](https://github.com/BerriAI/litellm/commits/main/model_prices_and_context_window.json)
3. Fix Azure O1 mini pricing [PR](https://github.com/BerriAI/litellm/commit/52de1949ef2f76b8572df751f9c868a016d4832c)
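As a quick illustration of item 1, the capability can be checked programmatically. `litellm.supports_pdf_input` is assumed to be the relevant helper here, so verify the name against the SDK before using it; the model id is only an example.

```python
import litellm

# Sketch: check whether a given Bedrock Claude model advertises PDF input
# support in the model capability map. Helper name is an assumption.
model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"
print(model, "supports PDF input:", litellm.supports_pdf_input(model=model))
```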
## LLM Translation
<Image img={require('../../img/release_notes/anthropic_thinking.jpg')}/>
1. Support `/openai/` passthrough for Assistant endpoints. [Get Started](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
2. Bedrock Claude - fix tool calling transformation on invoke route. [Get Started](../../docs/providers/bedrock#usage---function-calling--tool-calling)
3. Bedrock Claude - response_format support for claude on invoke route. [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
4. Bedrock - pass `description` if set in response_format. [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
5. Bedrock - Fix passing response_format: {"type": "text"}. [PR](https://github.com/BerriAI/litellm/commit/c84b489d5897755139aa7d4e9e54727ebe0fa540)
6. OpenAI - Handle sending image_url as str to openai. [Get Started](https://docs.litellm.ai/docs/completion/vision)
7. Deepseek - fix missing 'reasoning_content' on streaming (see the sketch after this list). [Get Started](https://docs.litellm.ai/docs/reasoning_content)
8. Caching - Support caching on reasoning content. [Get Started](https://docs.litellm.ai/docs/proxy/caching)
9. Bedrock - handle thinking blocks in assistant message. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
10. Anthropic - Return `signature` on streaming. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
- Note: We've also migrated from `signature_delta` to `signature`. [Read more](https://docs.litellm.ai/release_notes/v1.63.0)
11. Support format param for specifying image type. [Get Started](../../docs/completion/vision.md#explicitly-specify-image-type)
12. Anthropic - `/v1/messages` endpoint - `thinking` param support. [Get Started](../../docs/anthropic_unified.md)
- Note: this refactors the [BETA] unified `/v1/messages` endpoint, to just work for the Anthropic API.
13. Vertex AI - handle `$id` in response schema when calling Vertex AI. [Get Started](https://docs.litellm.ai/docs/providers/vertex#json-schema)
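Several of the items above (7, 9, 10, 12) revolve around `thinking` and `reasoning_content`. The sketch below streams a Claude 3.7 Sonnet call with thinking enabled and prints reasoning deltas separately from regular content; the `thinking` parameter shape and the `reasoning_content` attribute are assumptions based on the linked reasoning_content docs.

```python
import litellm

# Sketch: stream a claude-3-7-sonnet call with extended thinking enabled
# and read reasoning deltas separately from regular content.
stream = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "What is 27 * 14? Think it through."}],
    thinking={"type": "enabled", "budget_tokens": 1024},  # assumed param shape
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    reasoning = getattr(delta, "reasoning_content", None)
    if reasoning:
        print(f"[thinking] {reasoning}")
    if delta.content:
        print(delta.content, end="")
```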
## Spend Tracking Improvements
1. Batches API - Fix cost calculation to run on retrieve_batch (see the sketch after this list). [Get Started](https://docs.litellm.ai/docs/batches)
2. Batches API - Log batch models in spend logs / standard logging payload. [Get Started](../../docs/proxy/logging_spec.md#standardlogginghiddenparams)
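For item 1, cost calculation now runs when a completed batch is retrieved. A minimal sketch of that retrieval path (the batch id is hypothetical):

```python
import asyncio
import litellm

async def main() -> None:
    # Sketch: retrieving a completed batch now triggers cost calculation
    # and logs the batch models in the spend logs / standard logging payload.
    batch = await litellm.aretrieve_batch(
        batch_id="batch_abc123",  # hypothetical batch id
        custom_llm_provider="openai",
    )
    print(batch.id, batch.status)

asyncio.run(main())
```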
## Management Endpoints / UI
<Image img={require('../../img/release_notes/error_logs.jpg')} />
1. Virtual Keys Page
- Allow team/org filters to be searchable on the Create Key Page
- Add created_by and updated_by fields to Keys table
- Show 'user_email' on key table
- Show 100 Keys Per Page, Use full height, increase width of key alias
2. Logs Page
- Show Error Logs on LiteLLM UI
- Allow Internal Users to View their own logs
3. Internal Users Page
- Allow admin to control default model access for internal users
4. Fix session handling with cookies
## Logging / Guardrail Integrations
1. Fix Prometheus metrics with custom metrics when keys containing team_id make requests. [PR](https://github.com/BerriAI/litellm/pull/8935)
## Performance / Loadbalancing / Reliability improvements
1. Cooldowns - Support cooldowns on models called with client side credentials. [Get Started](https://docs.litellm.ai/docs/proxy/clientside_auth#pass-user-llm-api-keys--api-base)
2. Tag-based Routing - ensures tag-based routing across all endpoints (`/embeddings`, `/image_generation`, etc.). [Get Started](https://docs.litellm.ai/docs/proxy/tag_routing)
## General Proxy Improvements
1. Raise BadRequestError when unknown model passed in request
2. Enforce model access restrictions on Azure OpenAI proxy route
3. Reliability fix - Handle emojis in text - fix orjson error
4. Model Access Patch - don't overwrite litellm.anthropic_models when running auth checks
5. Enable setting timezone information in docker image
## Complete Git Diff
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.61.20-stable...v1.63.2-stable)

View file

@ -41,10 +41,12 @@ const sidebars = {
"proxy/deploy", "proxy/deploy",
"proxy/prod", "proxy/prod",
"proxy/cli", "proxy/cli",
"proxy/release_cycle",
"proxy/model_management", "proxy/model_management",
"proxy/health", "proxy/health",
"proxy/debugging", "proxy/debugging",
"proxy/spending_monitoring", "proxy/spending_monitoring",
"proxy/master_key_rotations",
], ],
}, },
"proxy/demo", "proxy/demo",
@ -99,7 +101,9 @@ const sidebars = {
"proxy/admin_ui_sso", "proxy/admin_ui_sso",
"proxy/self_serve", "proxy/self_serve",
"proxy/public_teams", "proxy/public_teams",
"proxy/custom_sso" "proxy/custom_sso",
"proxy/ui_credentials",
"proxy/ui_logs"
], ],
}, },
{ {
@ -229,6 +233,7 @@ const sidebars = {
"providers/sambanova", "providers/sambanova",
"providers/custom_llm_server", "providers/custom_llm_server",
"providers/petals", "providers/petals",
"providers/snowflake"
], ],
}, },
{ {
@ -255,17 +260,23 @@ const sidebars = {
"completion/batching", "completion/batching",
"completion/mock_requests", "completion/mock_requests",
"completion/reliable_completions", "completion/reliable_completions",
'tutorials/litellm_proxy_aporia',
] ]
}, },
{ {
type: "category", type: "category",
label: "Supported Endpoints", label: "Supported Endpoints",
link: {
type: "generated-index",
title: "Supported Endpoints",
description:
"Learn how to deploy + call models from different providers on LiteLLM",
slug: "/supported_endpoints",
},
items: [ items: [
{ {
type: "category", type: "category",
label: "Chat", label: "/chat/completions",
link: { link: {
type: "generated-index", type: "generated-index",
title: "Chat Completions", title: "Chat Completions",
@ -278,11 +289,13 @@ const sidebars = {
"completion/usage", "completion/usage",
], ],
}, },
"response_api",
"text_completion", "text_completion",
"embedding/supported_embedding", "embedding/supported_embedding",
"anthropic_unified",
{ {
type: "category", type: "category",
label: "Image", label: "/images",
items: [ items: [
"image_generation", "image_generation",
"image_variations", "image_variations",
@ -290,7 +303,7 @@ const sidebars = {
}, },
{ {
type: "category", type: "category",
label: "Audio", label: "/audio",
"items": [ "items": [
"audio_transcription", "audio_transcription",
"text_to_speech", "text_to_speech",
@ -349,23 +362,6 @@ const sidebars = {
label: "LangChain, LlamaIndex, Instructor Integration", label: "LangChain, LlamaIndex, Instructor Integration",
items: ["langchain/langchain", "tutorials/instructor"], items: ["langchain/langchain", "tutorials/instructor"],
}, },
{
type: "category",
label: "Tutorials",
items: [
'tutorials/azure_openai',
'tutorials/instructor',
"tutorials/gradio_integration",
"tutorials/huggingface_codellama",
"tutorials/huggingface_tutorial",
"tutorials/TogetherAI_liteLLM",
"tutorials/finetuned_chat_gpt",
"tutorials/text_completion",
"tutorials/first_playground",
"tutorials/model_fallbacks",
],
},
], ],
}, },
{ {
@ -382,13 +378,6 @@ const sidebars = {
"load_test_rpm", "load_test_rpm",
] ]
}, },
{
type: "category",
label: "Adding Providers",
items: [
"adding_provider/directory_structure",
"adding_provider/new_rerank_provider"],
},
{ {
type: "category", type: "category",
label: "Logging & Observability", label: "Logging & Observability",
@ -423,12 +412,51 @@ const sidebars = {
"observability/opik_integration", "observability/opik_integration",
], ],
}, },
{
type: "category",
label: "Tutorials",
items: [
"tutorials/openweb_ui",
'tutorials/litellm_proxy_aporia',
{
type: "category",
label: "LiteLLM Python SDK Tutorials",
items: [
'tutorials/azure_openai',
'tutorials/instructor',
"tutorials/gradio_integration",
"tutorials/huggingface_codellama",
"tutorials/huggingface_tutorial",
"tutorials/TogetherAI_liteLLM",
"tutorials/finetuned_chat_gpt",
"tutorials/text_completion",
"tutorials/first_playground",
"tutorials/model_fallbacks",
],
},
]
},
{
type: "category",
label: "Contributing",
items: [
"extras/contributing_code",
{
type: "category",
label: "Adding Providers",
items: [
"adding_provider/directory_structure",
"adding_provider/new_rerank_provider"],
},
"extras/contributing",
"contributing",
]
},
{ {
type: "category", type: "category",
label: "Extras", label: "Extras",
items: [ items: [
"extras/contributing",
"data_security", "data_security",
"data_retention", "data_retention",
"migration_policy", "migration_policy",
@ -445,6 +473,7 @@ const sidebars = {
items: [ items: [
"projects/smolagents", "projects/smolagents",
"projects/Docq.AI", "projects/Docq.AI",
"projects/PDL",
"projects/OpenInterpreter", "projects/OpenInterpreter",
"projects/Elroy", "projects/Elroy",
"projects/dbally", "projects/dbally",
@ -460,9 +489,9 @@ const sidebars = {
"projects/YiVal", "projects/YiVal",
"projects/LiteLLM Proxy", "projects/LiteLLM Proxy",
"projects/llm_cord", "projects/llm_cord",
"projects/pgai",
], ],
}, },
"contributing",
"proxy/pii_masking", "proxy/pii_masking",
"extras/code_quality", "extras/code_quality",
"rules", "rules",

View file

@ -163,7 +163,7 @@ class AporiaGuardrail(CustomGuardrail):
pass pass
async def async_moderation_hook( ### 👈 KEY CHANGE ### async def async_moderation_hook(
self, self,
data: dict, data: dict,
user_api_key_dict: UserAPIKeyAuth, user_api_key_dict: UserAPIKeyAuth,
@ -173,6 +173,7 @@ class AporiaGuardrail(CustomGuardrail):
"image_generation", "image_generation",
"moderation", "moderation",
"audio_transcription", "audio_transcription",
"responses",
], ],
): ):
from litellm.proxy.common_utils.callback_utils import ( from litellm.proxy.common_utils.callback_utils import (

View file

@ -94,6 +94,7 @@ class _ENTERPRISE_GoogleTextModeration(CustomLogger):
"image_generation", "image_generation",
"moderation", "moderation",
"audio_transcription", "audio_transcription",
"responses",
], ],
): ):
""" """

View file

@ -107,6 +107,7 @@ class _ENTERPRISE_LlamaGuard(CustomLogger):
"image_generation", "image_generation",
"moderation", "moderation",
"audio_transcription", "audio_transcription",
"responses",
], ],
): ):
""" """

View file

@ -126,6 +126,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
"image_generation", "image_generation",
"moderation", "moderation",
"audio_transcription", "audio_transcription",
"responses",
], ],
): ):
""" """

View file

@ -31,7 +31,7 @@ class _ENTERPRISE_OpenAI_Moderation(CustomLogger):
#### CALL HOOKS - proxy only #### #### CALL HOOKS - proxy only ####
async def async_moderation_hook( ### 👈 KEY CHANGE ### async def async_moderation_hook(
self, self,
data: dict, data: dict,
user_api_key_dict: UserAPIKeyAuth, user_api_key_dict: UserAPIKeyAuth,
@ -41,6 +41,7 @@ class _ENTERPRISE_OpenAI_Moderation(CustomLogger):
"image_generation", "image_generation",
"moderation", "moderation",
"audio_transcription", "audio_transcription",
"responses",
], ],
): ):
text = "" text = ""

View file

@ -8,12 +8,14 @@ import os
from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
from litellm.caching.llm_caching_handler import LLMClientCache
from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
from litellm.types.utils import ( from litellm.types.utils import (
ImageObject, ImageObject,
BudgetConfig, BudgetConfig,
all_litellm_params, all_litellm_params,
all_litellm_params as _litellm_completion_params, all_litellm_params as _litellm_completion_params,
CredentialItem,
) # maintain backwards compatibility for root param ) # maintain backwards compatibility for root param
from litellm._logging import ( from litellm._logging import (
set_verbose, set_verbose,
@ -53,6 +55,7 @@ from litellm.constants import (
cohere_embedding_models, cohere_embedding_models,
bedrock_embedding_models, bedrock_embedding_models,
known_tokenizer_config, known_tokenizer_config,
BEDROCK_INVOKE_PROVIDERS_LITERAL,
) )
from litellm.types.guardrails import GuardrailItem from litellm.types.guardrails import GuardrailItem
from litellm.proxy._types import ( from litellm.proxy._types import (
@ -181,6 +184,7 @@ cloudflare_api_key: Optional[str] = None
baseten_key: Optional[str] = None baseten_key: Optional[str] = None
aleph_alpha_key: Optional[str] = None aleph_alpha_key: Optional[str] = None
nlp_cloud_key: Optional[str] = None nlp_cloud_key: Optional[str] = None
snowflake_key: Optional[str] = None
common_cloud_provider_auth_params: dict = { common_cloud_provider_auth_params: dict = {
"params": ["project", "region_name", "token"], "params": ["project", "region_name", "token"],
"providers": ["vertex_ai", "bedrock", "watsonx", "azure", "vertex_ai_beta"], "providers": ["vertex_ai", "bedrock", "watsonx", "azure", "vertex_ai_beta"],
@ -190,15 +194,17 @@ ssl_verify: Union[str, bool] = True
ssl_certificate: Optional[str] = None ssl_certificate: Optional[str] = None
disable_streaming_logging: bool = False disable_streaming_logging: bool = False
disable_add_transform_inline_image_block: bool = False disable_add_transform_inline_image_block: bool = False
in_memory_llm_clients_cache: InMemoryCache = InMemoryCache() in_memory_llm_clients_cache: LLMClientCache = LLMClientCache()
safe_memory_mode: bool = False safe_memory_mode: bool = False
enable_azure_ad_token_refresh: Optional[bool] = False enable_azure_ad_token_refresh: Optional[bool] = False
### DEFAULT AZURE API VERSION ### ### DEFAULT AZURE API VERSION ###
AZURE_DEFAULT_API_VERSION = "2024-08-01-preview" # this is updated to the latest AZURE_DEFAULT_API_VERSION = "2025-02-01-preview" # this is updated to the latest
### DEFAULT WATSONX API VERSION ### ### DEFAULT WATSONX API VERSION ###
WATSONX_DEFAULT_API_VERSION = "2024-03-13" WATSONX_DEFAULT_API_VERSION = "2024-03-13"
### COHERE EMBEDDINGS DEFAULT TYPE ### ### COHERE EMBEDDINGS DEFAULT TYPE ###
COHERE_DEFAULT_EMBEDDING_INPUT_TYPE: COHERE_EMBEDDING_INPUT_TYPES = "search_document" COHERE_DEFAULT_EMBEDDING_INPUT_TYPE: COHERE_EMBEDDING_INPUT_TYPES = "search_document"
### CREDENTIALS ###
credential_list: List[CredentialItem] = []
### GUARDRAILS ### ### GUARDRAILS ###
llamaguard_model_name: Optional[str] = None llamaguard_model_name: Optional[str] = None
openai_moderations_model_name: Optional[str] = None openai_moderations_model_name: Optional[str] = None
@ -278,8 +284,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = [] custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION #### #### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None priority_reservation: Optional[Dict[str, float]] = None
force_ipv4: bool = ( force_ipv4: bool = (
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6. False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
) )
@ -363,17 +367,7 @@ BEDROCK_CONVERSE_MODELS = [
"meta.llama3-2-11b-instruct-v1:0", "meta.llama3-2-11b-instruct-v1:0",
"meta.llama3-2-90b-instruct-v1:0", "meta.llama3-2-90b-instruct-v1:0",
] ]
BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
"cohere",
"anthropic",
"mistral",
"amazon",
"meta",
"llama",
"ai21",
"nova",
"deepseek_r1",
]
####### COMPLETION MODELS ################### ####### COMPLETION MODELS ###################
open_ai_chat_completion_models: List = [] open_ai_chat_completion_models: List = []
open_ai_text_completion_models: List = [] open_ai_text_completion_models: List = []
@ -425,6 +419,7 @@ cerebras_models: List = []
galadriel_models: List = [] galadriel_models: List = []
sambanova_models: List = [] sambanova_models: List = []
assemblyai_models: List = [] assemblyai_models: List = []
snowflake_models: List = []
def is_bedrock_pricing_only_model(key: str) -> bool: def is_bedrock_pricing_only_model(key: str) -> bool:
@ -578,6 +573,8 @@ def add_known_models():
assemblyai_models.append(key) assemblyai_models.append(key)
elif value.get("litellm_provider") == "jina_ai": elif value.get("litellm_provider") == "jina_ai":
jina_ai_models.append(key) jina_ai_models.append(key)
elif value.get("litellm_provider") == "snowflake":
snowflake_models.append(key)
add_known_models() add_known_models()
@ -607,6 +604,7 @@ ollama_models = ["llama2"]
maritalk_models = ["maritalk"] maritalk_models = ["maritalk"]
model_list = ( model_list = (
open_ai_chat_completion_models open_ai_chat_completion_models
+ open_ai_text_completion_models + open_ai_text_completion_models
@ -651,6 +649,7 @@ model_list = (
+ azure_text_models + azure_text_models
+ assemblyai_models + assemblyai_models
+ jina_ai_models + jina_ai_models
+ snowflake_models
) )
model_list_set = set(model_list) model_list_set = set(model_list)
@ -706,6 +705,7 @@ models_by_provider: dict = {
"sambanova": sambanova_models, "sambanova": sambanova_models,
"assemblyai": assemblyai_models, "assemblyai": assemblyai_models,
"jina_ai": jina_ai_models, "jina_ai": jina_ai_models,
"snowflake": snowflake_models,
} }
# mapping for those models which have larger equivalents # mapping for those models which have larger equivalents
@ -811,9 +811,6 @@ from .llms.oobabooga.chat.transformation import OobaboogaConfig
from .llms.maritalk import MaritalkConfig from .llms.maritalk import MaritalkConfig
from .llms.openrouter.chat.transformation import OpenrouterConfig from .llms.openrouter.chat.transformation import OpenrouterConfig
from .llms.anthropic.chat.transformation import AnthropicConfig from .llms.anthropic.chat.transformation import AnthropicConfig
from .llms.anthropic.experimental_pass_through.transformation import (
AnthropicExperimentalPassThroughConfig,
)
from .llms.groq.stt.transformation import GroqSTTConfig from .llms.groq.stt.transformation import GroqSTTConfig
from .llms.anthropic.completion.transformation import AnthropicTextConfig from .llms.anthropic.completion.transformation import AnthropicTextConfig
from .llms.triton.completion.transformation import TritonConfig from .llms.triton.completion.transformation import TritonConfig
@ -825,6 +822,7 @@ from .llms.databricks.embed.transformation import DatabricksEmbeddingConfig
from .llms.predibase.chat.transformation import PredibaseConfig from .llms.predibase.chat.transformation import PredibaseConfig
from .llms.replicate.chat.transformation import ReplicateConfig from .llms.replicate.chat.transformation import ReplicateConfig
from .llms.cohere.completion.transformation import CohereTextConfig as CohereConfig from .llms.cohere.completion.transformation import CohereTextConfig as CohereConfig
from .llms.snowflake.chat.transformation import SnowflakeConfig
from .llms.cohere.rerank.transformation import CohereRerankConfig from .llms.cohere.rerank.transformation import CohereRerankConfig
from .llms.cohere.rerank_v2.transformation import CohereRerankV2Config from .llms.cohere.rerank_v2.transformation import CohereRerankV2Config
from .llms.azure_ai.rerank.transformation import AzureAIRerankConfig from .llms.azure_ai.rerank.transformation import AzureAIRerankConfig
@ -832,6 +830,9 @@ from .llms.infinity.rerank.transformation import InfinityRerankConfig
from .llms.jina_ai.rerank.transformation import JinaAIRerankConfig from .llms.jina_ai.rerank.transformation import JinaAIRerankConfig
from .llms.clarifai.chat.transformation import ClarifaiConfig from .llms.clarifai.chat.transformation import ClarifaiConfig
from .llms.ai21.chat.transformation import AI21ChatConfig, AI21ChatConfig as AI21Config from .llms.ai21.chat.transformation import AI21ChatConfig, AI21ChatConfig as AI21Config
from .llms.anthropic.experimental_pass_through.messages.transformation import (
AnthropicMessagesConfig,
)
from .llms.together_ai.chat import TogetherAIConfig from .llms.together_ai.chat import TogetherAIConfig
from .llms.together_ai.completion.transformation import TogetherAITextCompletionConfig from .llms.together_ai.completion.transformation import TogetherAITextCompletionConfig
from .llms.cloudflare.chat.transformation import CloudflareChatConfig from .llms.cloudflare.chat.transformation import CloudflareChatConfig
@ -912,6 +913,7 @@ from .llms.bedrock.chat.invoke_transformations.base_invoke_transformation import
from .llms.bedrock.image.amazon_stability1_transformation import AmazonStabilityConfig from .llms.bedrock.image.amazon_stability1_transformation import AmazonStabilityConfig
from .llms.bedrock.image.amazon_stability3_transformation import AmazonStability3Config from .llms.bedrock.image.amazon_stability3_transformation import AmazonStability3Config
from .llms.bedrock.image.amazon_nova_canvas_transformation import AmazonNovaCanvasConfig
from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config
from .llms.bedrock.embed.amazon_titan_multimodal_transformation import ( from .llms.bedrock.embed.amazon_titan_multimodal_transformation import (
AmazonTitanMultimodalEmbeddingG1Config, AmazonTitanMultimodalEmbeddingG1Config,
@ -934,11 +936,14 @@ from .llms.groq.chat.transformation import GroqChatConfig
from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig
from .llms.azure_ai.chat.transformation import AzureAIStudioConfig from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
from .llms.mistral.mistral_chat_transformation import MistralConfig from .llms.mistral.mistral_chat_transformation import MistralConfig
from .llms.openai.responses.transformation import OpenAIResponsesAPIConfig
from .llms.openai.chat.o_series_transformation import ( from .llms.openai.chat.o_series_transformation import (
OpenAIOSeriesConfig as OpenAIO1Config, # maintain backwards compatibility OpenAIOSeriesConfig as OpenAIO1Config, # maintain backwards compatibility
OpenAIOSeriesConfig, OpenAIOSeriesConfig,
) )
from .llms.snowflake.chat.transformation import SnowflakeConfig
openaiOSeriesConfig = OpenAIOSeriesConfig() openaiOSeriesConfig = OpenAIOSeriesConfig()
from .llms.openai.chat.gpt_transformation import ( from .llms.openai.chat.gpt_transformation import (
OpenAIGPTConfig, OpenAIGPTConfig,
@ -1022,6 +1027,8 @@ from .assistants.main import *
from .batches.main import * from .batches.main import *
from .batch_completion.main import * # type: ignore from .batch_completion.main import * # type: ignore
from .rerank_api.main import * from .rerank_api.main import *
from .llms.anthropic.experimental_pass_through.messages.handler import *
from .responses.main import *
from .realtime_api.main import _arealtime from .realtime_api.main import _arealtime
from .fine_tuning.main import * from .fine_tuning.main import *
from .files.main import * from .files.main import *

View file

@ -182,9 +182,7 @@ def init_redis_cluster(redis_kwargs) -> redis.RedisCluster:
"REDIS_CLUSTER_NODES environment variable is not valid JSON. Please ensure it's properly formatted." "REDIS_CLUSTER_NODES environment variable is not valid JSON. Please ensure it's properly formatted."
) )
verbose_logger.debug( verbose_logger.debug("init_redis_cluster: startup nodes are being initialized.")
"init_redis_cluster: startup nodes are being initialized."
)
from redis.cluster import ClusterNode from redis.cluster import ClusterNode
args = _get_redis_cluster_kwargs() args = _get_redis_cluster_kwargs()
@ -307,7 +305,6 @@ def get_redis_async_client(
return _init_async_redis_sentinel(redis_kwargs) return _init_async_redis_sentinel(redis_kwargs)
return async_redis.Redis( return async_redis.Redis(
socket_timeout=5,
**redis_kwargs, **redis_kwargs,
) )

View file

@ -1,186 +0,0 @@
# What is this?
## Translates OpenAI call to Anthropic `/v1/messages` format
import traceback
from typing import Any, Optional
import litellm
from litellm import ChatCompletionRequest, verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
from litellm.types.utils import AdapterCompletionStreamWrapper, ModelResponse
class AnthropicAdapter(CustomLogger):
def __init__(self) -> None:
super().__init__()
def translate_completion_input_params(
self, kwargs
) -> Optional[ChatCompletionRequest]:
"""
- translate params, where needed
- pass rest, as is
"""
request_body = AnthropicMessagesRequest(**kwargs) # type: ignore
translated_body = litellm.AnthropicExperimentalPassThroughConfig().translate_anthropic_to_openai(
anthropic_message_request=request_body
)
return translated_body
def translate_completion_output_params(
self, response: ModelResponse
) -> Optional[AnthropicResponse]:
return litellm.AnthropicExperimentalPassThroughConfig().translate_openai_response_to_anthropic(
response=response
)
def translate_completion_output_params_streaming(
self, completion_stream: Any
) -> AdapterCompletionStreamWrapper | None:
return AnthropicStreamWrapper(completion_stream=completion_stream)
anthropic_adapter = AnthropicAdapter()
class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
"""
- first chunk return 'message_start'
- content block must be started and stopped
- finish_reason must map exactly to anthropic reason, else anthropic client won't be able to parse it.
"""
sent_first_chunk: bool = False
sent_content_block_start: bool = False
sent_content_block_finish: bool = False
sent_last_message: bool = False
holding_chunk: Optional[Any] = None
def __next__(self):
try:
if self.sent_first_chunk is False:
self.sent_first_chunk = True
return {
"type": "message_start",
"message": {
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
"type": "message",
"role": "assistant",
"content": [],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": None,
"stop_sequence": None,
"usage": {"input_tokens": 25, "output_tokens": 1},
},
}
if self.sent_content_block_start is False:
self.sent_content_block_start = True
return {
"type": "content_block_start",
"index": 0,
"content_block": {"type": "text", "text": ""},
}
for chunk in self.completion_stream:
if chunk == "None" or chunk is None:
raise Exception
processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
response=chunk
)
if (
processed_chunk["type"] == "message_delta"
and self.sent_content_block_finish is False
):
self.holding_chunk = processed_chunk
self.sent_content_block_finish = True
return {
"type": "content_block_stop",
"index": 0,
}
elif self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = processed_chunk
return return_chunk
else:
return processed_chunk
if self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = None
return return_chunk
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except StopIteration:
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except Exception as e:
verbose_logger.error(
"Anthropic Adapter - {}\n{}".format(e, traceback.format_exc())
)
async def __anext__(self):
try:
if self.sent_first_chunk is False:
self.sent_first_chunk = True
return {
"type": "message_start",
"message": {
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
"type": "message",
"role": "assistant",
"content": [],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": None,
"stop_sequence": None,
"usage": {"input_tokens": 25, "output_tokens": 1},
},
}
if self.sent_content_block_start is False:
self.sent_content_block_start = True
return {
"type": "content_block_start",
"index": 0,
"content_block": {"type": "text", "text": ""},
}
async for chunk in self.completion_stream:
if chunk == "None" or chunk is None:
raise Exception
processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
response=chunk
)
if (
processed_chunk["type"] == "message_delta"
and self.sent_content_block_finish is False
):
self.holding_chunk = processed_chunk
self.sent_content_block_finish = True
return {
"type": "content_block_stop",
"index": 0,
}
elif self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = processed_chunk
return return_chunk
else:
return processed_chunk
if self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = None
return return_chunk
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except StopIteration:
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopAsyncIteration

View file

@ -15,6 +15,7 @@ import litellm
from litellm.types.router import GenericLiteLLMParams from litellm.types.router import GenericLiteLLMParams
from litellm.utils import ( from litellm.utils import (
exception_type, exception_type,
get_litellm_params,
get_llm_provider, get_llm_provider,
get_secret, get_secret,
supports_httpx_timeout, supports_httpx_timeout,
@ -86,6 +87,7 @@ def get_assistants(
optional_params = GenericLiteLLMParams( optional_params = GenericLiteLLMParams(
api_key=api_key, api_base=api_base, api_version=api_version, **kwargs api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
) )
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -169,6 +171,7 @@ def get_assistants(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
aget_assistants=aget_assistants, # type: ignore aget_assistants=aget_assistants, # type: ignore
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -270,6 +273,7 @@ def create_assistants(
optional_params = GenericLiteLLMParams( optional_params = GenericLiteLLMParams(
api_key=api_key, api_base=api_base, api_version=api_version, **kwargs api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
) )
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -371,6 +375,7 @@ def create_assistants(
client=client, client=client,
async_create_assistants=async_create_assistants, async_create_assistants=async_create_assistants,
create_assistant_data=create_assistant_data, create_assistant_data=create_assistant_data,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -445,6 +450,8 @@ def delete_assistant(
api_key=api_key, api_base=api_base, api_version=api_version, **kwargs api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
) )
litellm_params_dict = get_litellm_params(**kwargs)
async_delete_assistants: Optional[bool] = kwargs.pop( async_delete_assistants: Optional[bool] = kwargs.pop(
"async_delete_assistants", None "async_delete_assistants", None
) )
@ -544,6 +551,7 @@ def delete_assistant(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
async_delete_assistants=async_delete_assistants, async_delete_assistants=async_delete_assistants,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -639,6 +647,7 @@ def create_thread(
""" """
acreate_thread = kwargs.get("acreate_thread", None) acreate_thread = kwargs.get("acreate_thread", None)
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -731,6 +740,7 @@ def create_thread(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
acreate_thread=acreate_thread, acreate_thread=acreate_thread,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -795,7 +805,7 @@ def get_thread(
"""Get the thread object, given a thread_id""" """Get the thread object, given a thread_id"""
aget_thread = kwargs.pop("aget_thread", None) aget_thread = kwargs.pop("aget_thread", None)
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default # set timeout for 10 minutes by default
@ -884,6 +894,7 @@ def get_thread(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
aget_thread=aget_thread, aget_thread=aget_thread,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -972,6 +983,7 @@ def add_message(
_message_data = MessageData( _message_data = MessageData(
role=role, content=content, attachments=attachments, metadata=metadata role=role, content=content, attachments=attachments, metadata=metadata
) )
litellm_params_dict = get_litellm_params(**kwargs)
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
message_data = get_optional_params_add_message( message_data = get_optional_params_add_message(
@ -1068,6 +1080,7 @@ def add_message(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
a_add_message=a_add_message, a_add_message=a_add_message,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -1139,6 +1152,7 @@ def get_messages(
) -> SyncCursorPage[OpenAIMessage]: ) -> SyncCursorPage[OpenAIMessage]:
aget_messages = kwargs.pop("aget_messages", None) aget_messages = kwargs.pop("aget_messages", None)
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -1225,6 +1239,7 @@ def get_messages(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
aget_messages=aget_messages, aget_messages=aget_messages,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -1337,6 +1352,7 @@ def run_thread(
"""Run a given thread + assistant.""" """Run a given thread + assistant."""
arun_thread = kwargs.pop("arun_thread", None) arun_thread = kwargs.pop("arun_thread", None)
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -1437,6 +1453,7 @@ def run_thread(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
arun_thread=arun_thread, arun_thread=arun_thread,
litellm_params=litellm_params_dict,
) # type: ignore ) # type: ignore
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(

View file

@ -1,76 +1,16 @@
import asyncio
import datetime
import json import json
import threading from typing import Any, List, Literal, Tuple
from typing import Any, List, Literal, Optional
import litellm import litellm
from litellm._logging import verbose_logger from litellm._logging import verbose_logger
from litellm.constants import (
BATCH_STATUS_POLL_INTERVAL_SECONDS,
BATCH_STATUS_POLL_MAX_ATTEMPTS,
)
from litellm.files.main import afile_content
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.types.llms.openai import Batch from litellm.types.llms.openai import Batch
from litellm.types.utils import StandardLoggingPayload, Usage from litellm.types.utils import CallTypes, Usage
async def batches_async_logging(
batch_id: str,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
logging_obj: Optional[LiteLLMLoggingObj] = None,
**kwargs,
):
"""
Async Job waits for the batch to complete and then logs the completed batch usage - cost, total tokens, prompt tokens, completion tokens
Polls retrieve_batch until it returns a batch with status "completed" or "failed"
"""
from .main import aretrieve_batch
verbose_logger.debug(
".....in _batches_async_logging... polling retrieve to get batch status"
)
if logging_obj is None:
raise ValueError(
"logging_obj is None cannot calculate cost / log batch creation event"
)
for _ in range(BATCH_STATUS_POLL_MAX_ATTEMPTS):
try:
start_time = datetime.datetime.now()
batch: Batch = await aretrieve_batch(batch_id, custom_llm_provider)
verbose_logger.debug(
"in _batches_async_logging... batch status= %s", batch.status
)
if batch.status == "completed":
end_time = datetime.datetime.now()
await _handle_completed_batch(
batch=batch,
custom_llm_provider=custom_llm_provider,
logging_obj=logging_obj,
start_time=start_time,
end_time=end_time,
**kwargs,
)
break
elif batch.status == "failed":
pass
except Exception as e:
verbose_logger.error("error in batches_async_logging", e)
await asyncio.sleep(BATCH_STATUS_POLL_INTERVAL_SECONDS)
async def _handle_completed_batch( async def _handle_completed_batch(
batch: Batch, batch: Batch,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"], custom_llm_provider: Literal["openai", "azure", "vertex_ai"],
logging_obj: LiteLLMLoggingObj, ) -> Tuple[float, Usage, List[str]]:
start_time: datetime.datetime,
end_time: datetime.datetime,
**kwargs,
) -> None:
"""Helper function to process a completed batch and handle logging""" """Helper function to process a completed batch and handle logging"""
# Get batch results # Get batch results
file_content_dictionary = await _get_batch_output_file_content_as_dictionary( file_content_dictionary = await _get_batch_output_file_content_as_dictionary(
@ -87,49 +27,25 @@ async def _handle_completed_batch(
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
) )
# Handle logging batch_models = _get_batch_models_from_file_content(file_content_dictionary)
await _log_completed_batch(
logging_obj=logging_obj, return batch_cost, batch_usage, batch_models
batch_usage=batch_usage,
batch_cost=batch_cost,
start_time=start_time,
end_time=end_time,
**kwargs,
)
async def _log_completed_batch( def _get_batch_models_from_file_content(
logging_obj: LiteLLMLoggingObj, file_content_dictionary: List[dict],
batch_usage: Usage, ) -> List[str]:
batch_cost: float, """
start_time: datetime.datetime, Get the models from the file content
end_time: datetime.datetime, """
**kwargs, batch_models = []
) -> None: for _item in file_content_dictionary:
"""Helper function to handle all logging operations for a completed batch""" if _batch_response_was_successful(_item):
logging_obj.call_type = "batch_success" _response_body = _get_response_from_batch_job_output_file(_item)
_model = _response_body.get("model")
standard_logging_object = _create_standard_logging_object_for_completed_batch( if _model:
kwargs=kwargs, batch_models.append(_model)
start_time=start_time, return batch_models
end_time=end_time,
logging_obj=logging_obj,
batch_usage_object=batch_usage,
response_cost=batch_cost,
)
logging_obj.model_call_details["standard_logging_object"] = standard_logging_object
# Launch async and sync logging handlers
asyncio.create_task(
logging_obj.async_success_handler(
result=None,
start_time=start_time,
end_time=end_time,
cache_hit=None,
)
)
logging_obj.success_handler(None, start_time, end_time)
async def _batch_cost_calculator( async def _batch_cost_calculator(
@ -156,6 +72,8 @@ async def _get_batch_output_file_content_as_dictionary(
""" """
Get the batch output file content as a list of dictionaries Get the batch output file content as a list of dictionaries
""" """
from litellm.files.main import afile_content
if custom_llm_provider == "vertex_ai": if custom_llm_provider == "vertex_ai":
raise ValueError("Vertex AI does not support file content retrieval") raise ValueError("Vertex AI does not support file content retrieval")
@ -205,6 +123,7 @@ def _get_batch_job_cost_from_file_content(
total_cost += litellm.completion_cost( total_cost += litellm.completion_cost(
completion_response=_response_body, completion_response=_response_body,
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
call_type=CallTypes.aretrieve_batch.value,
) )
verbose_logger.debug("total_cost=%s", total_cost) verbose_logger.debug("total_cost=%s", total_cost)
return total_cost return total_cost
@ -261,30 +180,3 @@ def _batch_response_was_successful(batch_job_output_file: dict) -> bool:
""" """
_response: dict = batch_job_output_file.get("response", None) or {} _response: dict = batch_job_output_file.get("response", None) or {}
return _response.get("status_code", None) == 200 return _response.get("status_code", None) == 200
def _create_standard_logging_object_for_completed_batch(
kwargs: dict,
start_time: datetime.datetime,
end_time: datetime.datetime,
logging_obj: LiteLLMLoggingObj,
batch_usage_object: Usage,
response_cost: float,
) -> StandardLoggingPayload:
"""
Create a standard logging object for a completed batch
"""
standard_logging_object = logging_obj.model_call_details.get(
"standard_logging_object", None
)
if standard_logging_object is None:
raise ValueError("unable to create standard logging object for completed batch")
# Add Completed Batch Job Usage and Response Cost
standard_logging_object["call_type"] = "batch_success"
standard_logging_object["response_cost"] = response_cost
standard_logging_object["total_tokens"] = batch_usage_object.total_tokens
standard_logging_object["prompt_tokens"] = batch_usage_object.prompt_tokens
standard_logging_object["completion_tokens"] = batch_usage_object.completion_tokens
return standard_logging_object

View file

@ -31,10 +31,9 @@ from litellm.types.llms.openai import (
RetrieveBatchRequest, RetrieveBatchRequest,
) )
from litellm.types.router import GenericLiteLLMParams from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import LiteLLMBatch
from litellm.utils import client, get_litellm_params, supports_httpx_timeout from litellm.utils import client, get_litellm_params, supports_httpx_timeout
from .batch_utils import batches_async_logging
####### ENVIRONMENT VARIABLES ################### ####### ENVIRONMENT VARIABLES ###################
openai_batches_instance = OpenAIBatchesAPI() openai_batches_instance = OpenAIBatchesAPI()
azure_batches_instance = AzureBatchesAPI() azure_batches_instance = AzureBatchesAPI()
@ -85,17 +84,6 @@ async def acreate_batch(
else: else:
response = init_response response = init_response
# Start async logging job
if response is not None:
asyncio.create_task(
batches_async_logging(
logging_obj=kwargs.get("litellm_logging_obj", None),
batch_id=response.id,
custom_llm_provider=custom_llm_provider,
**kwargs,
)
)
return response return response
except Exception as e: except Exception as e:
raise e raise e
@ -111,7 +99,7 @@ def create_batch(
extra_headers: Optional[Dict[str, str]] = None, extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None, extra_body: Optional[Dict[str, str]] = None,
**kwargs, **kwargs,
) -> Union[Batch, Coroutine[Any, Any, Batch]]: ) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
""" """
Creates and executes a batch from an uploaded file of request Creates and executes a batch from an uploaded file of request
@ -119,21 +107,27 @@ def create_batch(
""" """
try: try:
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_call_id = kwargs.get("litellm_call_id", None)
proxy_server_request = kwargs.get("proxy_server_request", None)
model_info = kwargs.get("model_info", None)
_is_async = kwargs.pop("acreate_batch", False) is True _is_async = kwargs.pop("acreate_batch", False) is True
litellm_params = get_litellm_params(**kwargs)
litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None) litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
litellm_params = get_litellm_params(
custom_llm_provider=custom_llm_provider,
litellm_call_id=kwargs.get("litellm_call_id", None),
litellm_trace_id=kwargs.get("litellm_trace_id"),
litellm_metadata=kwargs.get("litellm_metadata"),
)
litellm_logging_obj.update_environment_variables( litellm_logging_obj.update_environment_variables(
model=None, model=None,
user=None, user=None,
optional_params=optional_params.model_dump(), optional_params=optional_params.model_dump(),
litellm_params=litellm_params, litellm_params={
"litellm_call_id": litellm_call_id,
"proxy_server_request": proxy_server_request,
"model_info": model_info,
"metadata": metadata,
"preset_cache_key": None,
"stream_response": {},
**optional_params.model_dump(exclude_unset=True),
},
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
) )
@ -224,6 +218,7 @@ def create_batch(
timeout=timeout, timeout=timeout,
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
create_batch_data=_create_batch_request, create_batch_data=_create_batch_request,
litellm_params=litellm_params,
) )
elif custom_llm_provider == "vertex_ai": elif custom_llm_provider == "vertex_ai":
api_base = optional_params.api_base or "" api_base = optional_params.api_base or ""
@ -261,7 +256,7 @@ def create_batch(
response=httpx.Response( response=httpx.Response(
status_code=400, status_code=400,
content="Unsupported provider", content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore request=httpx.Request(method="create_batch", url="https://github.com/BerriAI/litellm"), # type: ignore
), ),
) )
return response return response
@ -269,6 +264,7 @@ def create_batch(
raise e raise e
@client
async def aretrieve_batch( async def aretrieve_batch(
batch_id: str, batch_id: str,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai", custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
@ -276,7 +272,7 @@ async def aretrieve_batch(
extra_headers: Optional[Dict[str, str]] = None, extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None, extra_body: Optional[Dict[str, str]] = None,
**kwargs, **kwargs,
) -> Batch: ) -> LiteLLMBatch:
""" """
Async: Retrieves a batch. Async: Retrieves a batch.
@ -310,6 +306,7 @@ async def aretrieve_batch(
raise e raise e
@client
def retrieve_batch( def retrieve_batch(
batch_id: str, batch_id: str,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai", custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
@ -317,7 +314,7 @@ def retrieve_batch(
extra_headers: Optional[Dict[str, str]] = None, extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None, extra_body: Optional[Dict[str, str]] = None,
**kwargs, **kwargs,
) -> Union[Batch, Coroutine[Any, Any, Batch]]: ) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
""" """
Retrieves a batch. Retrieves a batch.
@ -325,9 +322,20 @@ def retrieve_batch(
""" """
try: try:
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default litellm_params = get_litellm_params(
custom_llm_provider=custom_llm_provider,
**kwargs,
)
litellm_logging_obj.update_environment_variables(
model=None,
user=None,
optional_params=optional_params.model_dump(),
litellm_params=litellm_params,
custom_llm_provider=custom_llm_provider,
)
if ( if (
timeout is not None timeout is not None
@ -415,6 +423,7 @@ def retrieve_batch(
timeout=timeout, timeout=timeout,
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
retrieve_batch_data=_retrieve_batch_request, retrieve_batch_data=_retrieve_batch_request,
litellm_params=litellm_params,
) )
elif custom_llm_provider == "vertex_ai": elif custom_llm_provider == "vertex_ai":
api_base = optional_params.api_base or "" api_base = optional_params.api_base or ""
@ -517,6 +526,10 @@ def list_batches(
try: try:
# set API KEY # set API KEY
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params = get_litellm_params(
custom_llm_provider=custom_llm_provider,
**kwargs,
)
api_key = ( api_key = (
optional_params.api_key optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
@ -594,6 +607,7 @@ def list_batches(
api_version=api_version, api_version=api_version,
timeout=timeout, timeout=timeout,
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
litellm_params=litellm_params,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -669,6 +683,10 @@ def cancel_batch(
""" """
try: try:
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params = get_litellm_params(
custom_llm_provider=custom_llm_provider,
**kwargs,
)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default # set timeout for 10 minutes by default
@ -756,6 +774,7 @@ def cancel_batch(
timeout=timeout, timeout=timeout,
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
cancel_batch_data=_cancel_batch_request, cancel_batch_data=_cancel_batch_request,
litellm_params=litellm_params,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
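For context, a minimal usage sketch of the batch entry points whose plumbing is updated above. The top-level `litellm.retrieve_batch` / `litellm.list_batches` / `litellm.cancel_batch` wrappers, the placeholder batch id, and the `limit` argument are assumptions for illustration, not values taken from this diff:

```python
import litellm  # assumes OPENAI_API_KEY is set in the environment

# Retrieve a batch (placeholder id); request_timeout mirrors the 10-minute
# default applied in the timeout logic above.
batch = litellm.retrieve_batch(
    batch_id="batch_abc123",
    custom_llm_provider="openai",
    request_timeout=600,
)
print(batch.status)

# List and cancel follow the same provider-routing pattern shown above.
batches = litellm.list_batches(custom_llm_provider="openai", limit=10)
cancelled = litellm.cancel_batch(batch_id="batch_abc123", custom_llm_provider="openai")
```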
View file
@ -13,26 +13,14 @@ import json
import time import time
import traceback import traceback
from enum import Enum from enum import Enum
from typing import Any, Dict, List, Optional, Set, Union from typing import Any, Dict, List, Optional, Union
from openai.types.audio.transcription_create_params import TranscriptionCreateParams
from openai.types.chat.completion_create_params import (
CompletionCreateParamsNonStreaming,
CompletionCreateParamsStreaming,
)
from openai.types.completion_create_params import (
CompletionCreateParamsNonStreaming as TextCompletionCreateParamsNonStreaming,
)
from openai.types.completion_create_params import (
CompletionCreateParamsStreaming as TextCompletionCreateParamsStreaming,
)
from openai.types.embedding_create_params import EmbeddingCreateParams
from pydantic import BaseModel from pydantic import BaseModel
import litellm import litellm
from litellm._logging import verbose_logger from litellm._logging import verbose_logger
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.types.caching import * from litellm.types.caching import *
from litellm.types.rerank import RerankRequest
from litellm.types.utils import all_litellm_params from litellm.types.utils import all_litellm_params
from .base_cache import BaseCache from .base_cache import BaseCache
@ -257,7 +245,7 @@ class Cache:
verbose_logger.debug("\nReturning preset cache key: %s", preset_cache_key) verbose_logger.debug("\nReturning preset cache key: %s", preset_cache_key)
return preset_cache_key return preset_cache_key
combined_kwargs = self._get_relevant_args_to_use_for_cache_key() combined_kwargs = ModelParamHelper._get_all_llm_api_params()
litellm_param_kwargs = all_litellm_params litellm_param_kwargs = all_litellm_params
for param in kwargs: for param in kwargs:
if param in combined_kwargs: if param in combined_kwargs:
@ -364,76 +352,6 @@ class Cache:
if "litellm_params" in kwargs: if "litellm_params" in kwargs:
kwargs["litellm_params"]["preset_cache_key"] = preset_cache_key kwargs["litellm_params"]["preset_cache_key"] = preset_cache_key
def _get_relevant_args_to_use_for_cache_key(self) -> Set[str]:
"""
Gets the supported kwargs for each call type and combines them
"""
chat_completion_kwargs = self._get_litellm_supported_chat_completion_kwargs()
text_completion_kwargs = self._get_litellm_supported_text_completion_kwargs()
embedding_kwargs = self._get_litellm_supported_embedding_kwargs()
transcription_kwargs = self._get_litellm_supported_transcription_kwargs()
rerank_kwargs = self._get_litellm_supported_rerank_kwargs()
exclude_kwargs = self._get_kwargs_to_exclude_from_cache_key()
combined_kwargs = chat_completion_kwargs.union(
text_completion_kwargs,
embedding_kwargs,
transcription_kwargs,
rerank_kwargs,
)
combined_kwargs = combined_kwargs.difference(exclude_kwargs)
return combined_kwargs
def _get_litellm_supported_chat_completion_kwargs(self) -> Set[str]:
"""
Get the litellm supported chat completion kwargs
This follows the OpenAI API Spec
"""
all_chat_completion_kwargs = set(
CompletionCreateParamsNonStreaming.__annotations__.keys()
).union(set(CompletionCreateParamsStreaming.__annotations__.keys()))
return all_chat_completion_kwargs
def _get_litellm_supported_text_completion_kwargs(self) -> Set[str]:
"""
Get the litellm supported text completion kwargs
This follows the OpenAI API Spec
"""
all_text_completion_kwargs = set(
TextCompletionCreateParamsNonStreaming.__annotations__.keys()
).union(set(TextCompletionCreateParamsStreaming.__annotations__.keys()))
return all_text_completion_kwargs
def _get_litellm_supported_rerank_kwargs(self) -> Set[str]:
"""
Get the litellm supported rerank kwargs
"""
return set(RerankRequest.model_fields.keys())
def _get_litellm_supported_embedding_kwargs(self) -> Set[str]:
"""
Get the litellm supported embedding kwargs
This follows the OpenAI API Spec
"""
return set(EmbeddingCreateParams.__annotations__.keys())
def _get_litellm_supported_transcription_kwargs(self) -> Set[str]:
"""
Get the litellm supported transcription kwargs
This follows the OpenAI API Spec
"""
return set(TranscriptionCreateParams.__annotations__.keys())
def _get_kwargs_to_exclude_from_cache_key(self) -> Set[str]:
"""
Get the kwargs to exclude from the cache key
"""
return set(["metadata"])
@staticmethod @staticmethod
def _get_hashed_cache_key(cache_key: str) -> str: def _get_hashed_cache_key(cache_key: str) -> str:
""" """
View file
@ -247,7 +247,6 @@ class LLMCachingHandler:
pass pass
else: else:
call_type = original_function.__name__ call_type = original_function.__name__
cached_result = self._convert_cached_result_to_model_response( cached_result = self._convert_cached_result_to_model_response(
cached_result=cached_result, cached_result=cached_result,
call_type=call_type, call_type=call_type,
@ -719,6 +718,7 @@ class LLMCachingHandler:
""" """
Sync internal method to add the result to the cache Sync internal method to add the result to the cache
""" """
new_kwargs = kwargs.copy() new_kwargs = kwargs.copy()
new_kwargs.update( new_kwargs.update(
convert_args_to_kwargs( convert_args_to_kwargs(
@ -732,6 +732,7 @@ class LLMCachingHandler:
if self._should_store_result_in_cache( if self._should_store_result_in_cache(
original_function=self.original_function, kwargs=new_kwargs original_function=self.original_function, kwargs=new_kwargs
): ):
litellm.cache.add_cache(result, **new_kwargs) litellm.cache.add_cache(result, **new_kwargs)
return return
@ -783,6 +784,7 @@ class LLMCachingHandler:
- Else append the chunk to self.async_streaming_chunks - Else append the chunk to self.async_streaming_chunks
""" """
complete_streaming_response: Optional[ complete_streaming_response: Optional[
Union[ModelResponse, TextCompletionResponse] Union[ModelResponse, TextCompletionResponse]
] = _assemble_complete_response_from_streaming_chunks( ] = _assemble_complete_response_from_streaming_chunks(
@ -793,7 +795,6 @@ class LLMCachingHandler:
streaming_chunks=self.async_streaming_chunks, streaming_chunks=self.async_streaming_chunks,
is_async=True, is_async=True,
) )
# if a complete_streaming_response is assembled, add it to the cache # if a complete_streaming_response is assembled, add it to the cache
if complete_streaming_response is not None: if complete_streaming_response is not None:
await self.async_set_cache( await self.async_set_cache(
View file
@ -0,0 +1,40 @@
"""
Add the event loop to the cache key, to prevent event loop closed errors.
"""
import asyncio
from .in_memory_cache import InMemoryCache
class LLMClientCache(InMemoryCache):
def update_cache_key_with_event_loop(self, key):
"""
Add the event loop to the cache key, to prevent event loop closed errors.
If none, use the key as is.
"""
try:
event_loop = asyncio.get_event_loop()
stringified_event_loop = str(id(event_loop))
return f"{key}-{stringified_event_loop}"
except Exception: # handle no current event loop
return key
def set_cache(self, key, value, **kwargs):
key = self.update_cache_key_with_event_loop(key)
return super().set_cache(key, value, **kwargs)
async def async_set_cache(self, key, value, **kwargs):
key = self.update_cache_key_with_event_loop(key)
return await super().async_set_cache(key, value, **kwargs)
def get_cache(self, key, **kwargs):
key = self.update_cache_key_with_event_loop(key)
return super().get_cache(key, **kwargs)
async def async_get_cache(self, key, **kwargs):
key = self.update_cache_key_with_event_loop(key)
return await super().async_get_cache(key, **kwargs)
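A short sketch of why keying cached clients by the running event loop avoids "event loop is closed" errors. The import path is an assumption, since the new file's name is not shown in this diff:

```python
import asyncio

# Assumed module path -- the hunk above defines LLMClientCache but the filename
# is not visible in this diff.
from litellm.caching.llm_caching_handler import LLMClientCache

cache = LLMClientCache()

async def get_client():
    client = cache.get_cache("openai-client")
    if client is None:
        client = object()  # stand-in for an async HTTP client bound to this loop
        cache.set_cache("openai-client", client)
    return client

# Each asyncio.run() spins up a fresh loop, so each loop stores its client under
# "openai-client-<id(loop)>" and does not pick up a client created for a
# different, possibly closed, loop.
asyncio.run(get_client())
asyncio.run(get_client())
```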
View file
@ -54,6 +54,7 @@ class RedisCache(BaseCache):
redis_flush_size: Optional[int] = 100, redis_flush_size: Optional[int] = 100,
namespace: Optional[str] = None, namespace: Optional[str] = None,
startup_nodes: Optional[List] = None, # for redis-cluster startup_nodes: Optional[List] = None, # for redis-cluster
socket_timeout: Optional[float] = 5.0, # default 5 second timeout
**kwargs, **kwargs,
): ):
@ -70,6 +71,9 @@ class RedisCache(BaseCache):
redis_kwargs["password"] = password redis_kwargs["password"] = password
if startup_nodes is not None: if startup_nodes is not None:
redis_kwargs["startup_nodes"] = startup_nodes redis_kwargs["startup_nodes"] = startup_nodes
if socket_timeout is not None:
redis_kwargs["socket_timeout"] = socket_timeout
### HEALTH MONITORING OBJECT ### ### HEALTH MONITORING OBJECT ###
if kwargs.get("service_logger_obj", None) is not None and isinstance( if kwargs.get("service_logger_obj", None) is not None and isinstance(
kwargs["service_logger_obj"], ServiceLogging kwargs["service_logger_obj"], ServiceLogging
@ -543,6 +547,7 @@ class RedisCache(BaseCache):
_redis_client: Redis = self.init_async_client() # type: ignore _redis_client: Redis = self.init_async_client() # type: ignore
start_time = time.time() start_time = time.time()
_used_ttl = self.get_ttl(ttl=ttl) _used_ttl = self.get_ttl(ttl=ttl)
key = self.check_and_fix_namespace(key=key)
try: try:
result = await _redis_client.incrbyfloat(name=key, amount=value) result = await _redis_client.incrbyfloat(name=key, amount=value)
if _used_ttl is not None: if _used_ttl is not None:
@ -555,6 +560,7 @@ class RedisCache(BaseCache):
## LOGGING ## ## LOGGING ##
end_time = time.time() end_time = time.time()
_duration = end_time - start_time _duration = end_time - start_time
asyncio.create_task( asyncio.create_task(
self.service_logger_obj.async_service_success_hook( self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS, service=ServiceTypes.REDIS,
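A configuration sketch for the new `socket_timeout` default. Forwarding the parameter through `litellm.Cache(type="redis", ...)` is an assumption here; only the `RedisCache` constructor change is shown above:

```python
import litellm

# Illustrative only: give the Redis-backed cache an explicit socket timeout so a
# hung Redis connection fails fast instead of blocking the request path.
litellm.cache = litellm.Cache(
    type="redis",
    host="localhost",      # placeholder connection details
    port=6379,
    password="my-password",
    socket_timeout=5.0,    # matches the new default added above
)
```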
View file
@ -1,4 +1,4 @@
from typing import List from typing import List, Literal
ROUTER_MAX_FALLBACKS = 5 ROUTER_MAX_FALLBACKS = 5
DEFAULT_BATCH_SIZE = 512 DEFAULT_BATCH_SIZE = 512
@ -18,6 +18,7 @@ SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives. REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
#### Networking settings #### #### Networking settings ####
request_timeout: float = 6000 # time in seconds request_timeout: float = 6000 # time in seconds
STREAM_SSE_DONE_STRING: str = "[DONE]"
LITELLM_CHAT_PROVIDERS = [ LITELLM_CHAT_PROVIDERS = [
"openai", "openai",
@ -320,6 +321,17 @@ baseten_models: List = [
"31dxrj3", "31dxrj3",
] # FALCON 7B # WizardLM # Mosaic ML ] # FALCON 7B # WizardLM # Mosaic ML
BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
"cohere",
"anthropic",
"mistral",
"amazon",
"meta",
"llama",
"ai21",
"nova",
"deepseek_r1",
]
open_ai_embedding_models: List = ["text-embedding-ada-002"] open_ai_embedding_models: List = ["text-embedding-ada-002"]
cohere_embedding_models: List = [ cohere_embedding_models: List = [
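For reference, the new `BEDROCK_INVOKE_PROVIDERS_LITERAL` can back both static type narrowing and a cheap runtime membership check; the helper below is illustrative and not part of this diff:

```python
from typing import Literal, get_args

# Mirrors the constant added above.
BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
    "cohere", "anthropic", "mistral", "amazon", "meta",
    "llama", "ai21", "nova", "deepseek_r1",
]

def is_bedrock_invoke_provider(value: str) -> bool:
    # get_args() yields the tuple of strings allowed by the Literal.
    return value in get_args(BEDROCK_INVOKE_PROVIDERS_LITERAL)

assert is_bedrock_invoke_provider("anthropic")
assert not is_bedrock_invoke_provider("openai")
```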
View file
@ -44,7 +44,12 @@ from litellm.llms.vertex_ai.cost_calculator import cost_router as google_cost_ro
from litellm.llms.vertex_ai.image_generation.cost_calculator import ( from litellm.llms.vertex_ai.image_generation.cost_calculator import (
cost_calculator as vertex_ai_image_cost_calculator, cost_calculator as vertex_ai_image_cost_calculator,
) )
from litellm.types.llms.openai import HttpxBinaryResponseContent from litellm.responses.utils import ResponseAPILoggingUtils
from litellm.types.llms.openai import (
HttpxBinaryResponseContent,
ResponseAPIUsage,
ResponsesAPIResponse,
)
from litellm.types.rerank import RerankBilledUnits, RerankResponse from litellm.types.rerank import RerankBilledUnits, RerankResponse
from litellm.types.utils import ( from litellm.types.utils import (
CallTypesLiteral, CallTypesLiteral,
@ -239,6 +244,15 @@ def cost_per_token( # noqa: PLR0915
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
billed_units=rerank_billed_units, billed_units=rerank_billed_units,
) )
elif (
call_type == "aretrieve_batch"
or call_type == "retrieve_batch"
or call_type == CallTypes.aretrieve_batch
or call_type == CallTypes.retrieve_batch
):
return batch_cost_calculator(
usage=usage_block, model=model, custom_llm_provider=custom_llm_provider
)
elif call_type == "atranscription" or call_type == "transcription": elif call_type == "atranscription" or call_type == "transcription":
return openai_cost_per_second( return openai_cost_per_second(
model=model, model=model,
@ -399,9 +413,12 @@ def _select_model_name_for_cost_calc(
if base_model is not None: if base_model is not None:
return_model = base_model return_model = base_model
completion_response_model: Optional[str] = getattr( completion_response_model: Optional[str] = None
completion_response, "model", None if completion_response is not None:
) if isinstance(completion_response, BaseModel):
completion_response_model = getattr(completion_response, "model", None)
elif isinstance(completion_response, dict):
completion_response_model = completion_response.get("model", None)
hidden_params: Optional[dict] = getattr(completion_response, "_hidden_params", None) hidden_params: Optional[dict] = getattr(completion_response, "_hidden_params", None)
if completion_response_model is None and hidden_params is not None: if completion_response_model is None and hidden_params is not None:
if ( if (
@ -452,6 +469,13 @@ def _get_usage_object(
return usage_obj return usage_obj
def _is_known_usage_objects(usage_obj):
"""Returns True if the usage obj is a known Usage type"""
return isinstance(usage_obj, litellm.Usage) or isinstance(
usage_obj, ResponseAPIUsage
)
def _infer_call_type( def _infer_call_type(
call_type: Optional[CallTypesLiteral], completion_response: Any call_type: Optional[CallTypesLiteral], completion_response: Any
) -> Optional[CallTypesLiteral]: ) -> Optional[CallTypesLiteral]:
@ -561,9 +585,7 @@ def completion_cost( # noqa: PLR0915
base_model=base_model, base_model=base_model,
) )
verbose_logger.debug( verbose_logger.info(f"selected model name for cost calculation: {model}")
f"completion_response _select_model_name_for_cost_calc: {model}"
)
if completion_response is not None and ( if completion_response is not None and (
isinstance(completion_response, BaseModel) isinstance(completion_response, BaseModel)
@ -575,8 +597,8 @@ def completion_cost( # noqa: PLR0915
) )
else: else:
usage_obj = getattr(completion_response, "usage", {}) usage_obj = getattr(completion_response, "usage", {})
if isinstance(usage_obj, BaseModel) and not isinstance( if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects(
usage_obj, litellm.Usage usage_obj=usage_obj
): ):
setattr( setattr(
completion_response, completion_response,
@ -589,6 +611,14 @@ def completion_cost( # noqa: PLR0915
_usage = usage_obj.model_dump() _usage = usage_obj.model_dump()
else: else:
_usage = usage_obj _usage = usage_obj
if ResponseAPILoggingUtils._is_response_api_usage(_usage):
_usage = (
ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
_usage
).model_dump()
)
# get input/output tokens from completion_response # get input/output tokens from completion_response
prompt_tokens = _usage.get("prompt_tokens", 0) prompt_tokens = _usage.get("prompt_tokens", 0)
completion_tokens = _usage.get("completion_tokens", 0) completion_tokens = _usage.get("completion_tokens", 0)
@ -778,6 +808,23 @@ def completion_cost( # noqa: PLR0915
raise e raise e
def get_response_cost_from_hidden_params(
hidden_params: Union[dict, BaseModel]
) -> Optional[float]:
if isinstance(hidden_params, BaseModel):
_hidden_params_dict = hidden_params.model_dump()
else:
_hidden_params_dict = hidden_params
additional_headers = _hidden_params_dict.get("additional_headers", {})
if additional_headers and "x-litellm-response-cost" in additional_headers:
response_cost = additional_headers["x-litellm-response-cost"]
if response_cost is None:
return None
return float(additional_headers["x-litellm-response-cost"])
return None
def response_cost_calculator( def response_cost_calculator(
response_object: Union[ response_object: Union[
ModelResponse, ModelResponse,
@ -787,6 +834,7 @@ def response_cost_calculator(
TextCompletionResponse, TextCompletionResponse,
HttpxBinaryResponseContent, HttpxBinaryResponseContent,
RerankResponse, RerankResponse,
ResponsesAPIResponse,
], ],
model: str, model: str,
custom_llm_provider: Optional[str], custom_llm_provider: Optional[str],
@ -813,7 +861,7 @@ def response_cost_calculator(
base_model: Optional[str] = None, base_model: Optional[str] = None,
custom_pricing: Optional[bool] = None, custom_pricing: Optional[bool] = None,
prompt: str = "", prompt: str = "",
) -> Optional[float]: ) -> float:
""" """
Returns Returns
- float or None: cost of response - float or None: cost of response
@ -825,6 +873,14 @@ def response_cost_calculator(
else: else:
if isinstance(response_object, BaseModel): if isinstance(response_object, BaseModel):
response_object._hidden_params["optional_params"] = optional_params response_object._hidden_params["optional_params"] = optional_params
if hasattr(response_object, "_hidden_params"):
provider_response_cost = get_response_cost_from_hidden_params(
response_object._hidden_params
)
if provider_response_cost is not None:
return provider_response_cost
response_cost = completion_cost( response_cost = completion_cost(
completion_response=response_object, completion_response=response_object,
model=model, model=model,
@ -957,3 +1013,54 @@ def default_image_cost_calculator(
) )
return cost_info["input_cost_per_pixel"] * height * width * n return cost_info["input_cost_per_pixel"] * height * width * n
def batch_cost_calculator(
usage: Usage,
model: str,
custom_llm_provider: Optional[str] = None,
) -> Tuple[float, float]:
"""
Calculate the cost of a batch job
"""
_, custom_llm_provider, _, _ = litellm.get_llm_provider(
model=model, custom_llm_provider=custom_llm_provider
)
verbose_logger.info(
"Calculating batch cost per token. model=%s, custom_llm_provider=%s",
model,
custom_llm_provider,
)
try:
model_info: Optional[ModelInfo] = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
except Exception:
model_info = None
if not model_info:
return 0.0, 0.0
input_cost_per_token_batches = model_info.get("input_cost_per_token_batches")
input_cost_per_token = model_info.get("input_cost_per_token")
output_cost_per_token_batches = model_info.get("output_cost_per_token_batches")
output_cost_per_token = model_info.get("output_cost_per_token")
total_prompt_cost = 0.0
total_completion_cost = 0.0
if input_cost_per_token_batches:
total_prompt_cost = usage.prompt_tokens * input_cost_per_token_batches
elif input_cost_per_token:
total_prompt_cost = (
usage.prompt_tokens * (input_cost_per_token) / 2
) # batch cost is usually half of the regular token cost
if output_cost_per_token_batches:
total_completion_cost = usage.completion_tokens * output_cost_per_token_batches
elif output_cost_per_token:
total_completion_cost = (
usage.completion_tokens * (output_cost_per_token) / 2
) # batch cost is usually half of the regular token cost
return total_prompt_cost, total_completion_cost
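A worked example of the halving fallback in `batch_cost_calculator` above (illustrative prices and token counts; real per-token rates come from `litellm.get_model_info`):

```python
# Model with no dedicated *_batches pricing:
#   input_cost_per_token = 2.0e-06, output_cost_per_token = 6.0e-06
# Batch job usage: 10,000 prompt tokens, 2,000 completion tokens.
prompt_tokens, completion_tokens = 10_000, 2_000
input_cost_per_token, output_cost_per_token = 2.0e-06, 6.0e-06

total_prompt_cost = prompt_tokens * input_cost_per_token / 2            # ~0.01
total_completion_cost = completion_tokens * output_cost_per_token / 2   # ~0.006

print(total_prompt_cost, total_completion_cost)  # roughly (0.01, 0.006)
```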
Some files were not shown because too many files have changed in this diff.