Merge branch 'main' into sync-logging
@@ -49,7 +49,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -71,7 +71,7 @@ jobs:
 pip install "Pillow==10.3.0"
 pip install "jsonschema==4.22.0"
 pip install "pytest-xdist==3.6.1"
-pip install "websockets==10.4"
+pip install "websockets==13.1.0"
 pip uninstall posthog -y
 - save_cache:
 paths:
@@ -168,7 +168,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -189,6 +189,7 @@ jobs:
 pip install "diskcache==5.6.1"
 pip install "Pillow==10.3.0"
 pip install "jsonschema==4.22.0"
+pip install "websockets==13.1.0"
 - save_cache:
 paths:
 - ./venv
@@ -267,7 +268,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -288,6 +289,7 @@ jobs:
 pip install "diskcache==5.6.1"
 pip install "Pillow==10.3.0"
 pip install "jsonschema==4.22.0"
+pip install "websockets==13.1.0"
 - save_cache:
 paths:
 - ./venv
@@ -511,7 +513,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -678,6 +680,48 @@ jobs:
 paths:
 - llm_translation_coverage.xml
 - llm_translation_coverage
+llm_responses_api_testing:
+docker:
+- image: cimg/python:3.11
+auth:
+username: ${DOCKERHUB_USERNAME}
+password: ${DOCKERHUB_PASSWORD}
+working_directory: ~/project
+
+steps:
+- checkout
+- run:
+name: Install Dependencies
+command: |
+python -m pip install --upgrade pip
+python -m pip install -r requirements.txt
+pip install "pytest==7.3.1"
+pip install "pytest-retry==1.6.3"
+pip install "pytest-cov==5.0.0"
+pip install "pytest-asyncio==0.21.1"
+pip install "respx==0.21.1"
+# Run pytest and generate JUnit XML report
+- run:
+name: Run tests
+command: |
+pwd
+ls
+python -m pytest -vv tests/llm_responses_api_testing --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
+no_output_timeout: 120m
+- run:
+name: Rename the coverage files
+command: |
+mv coverage.xml llm_responses_api_coverage.xml
+mv .coverage llm_responses_api_coverage
+
+# Store test results
+- store_test_results:
+path: test-results
+- persist_to_workspace:
+root: .
+paths:
+- llm_responses_api_coverage.xml
+- llm_responses_api_coverage
 litellm_mapped_tests:
 docker:
 - image: cimg/python:3.11
@@ -1234,7 +1278,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 - run:
 name: Install Grype
 command: |
@@ -1309,13 +1353,13 @@ jobs:
 command: |
 pwd
 ls
-python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
+python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
 no_output_timeout: 120m

 # Store test results
 - store_test_results:
 path: test-results
-e2e_openai_misc_endpoints:
+e2e_openai_endpoints:
 machine:
 image: ubuntu-2204:2023.10.1
 resource_class: xlarge
@@ -1370,7 +1414,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 # Run pytest and generate JUnit XML report
 - run:
 name: Build Docker image
@@ -1432,7 +1476,7 @@ jobs:
 command: |
 pwd
 ls
-python -m pytest -s -vv tests/openai_misc_endpoints_tests --junitxml=test-results/junit.xml --durations=5
+python -m pytest -s -vv tests/openai_endpoints_tests --junitxml=test-results/junit.xml --durations=5
 no_output_timeout: 120m

 # Store test results
@@ -1492,7 +1536,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 - run:
 name: Build Docker image
 command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
@@ -1921,7 +1965,7 @@ jobs:
 pip install "pytest-asyncio==0.21.1"
 pip install "google-cloud-aiplatform==1.43.0"
 pip install aiohttp
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 pip install "assemblyai==0.37.0"
 python -m pip install --upgrade pip
 pip install "pydantic==2.7.1"
@@ -1935,12 +1979,12 @@ jobs:
 pip install prisma
 pip install fastapi
 pip install jsonschema
-pip install "httpx==0.24.1"
+pip install "httpx==0.27.0"
 pip install "anyio==3.7.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
 pip install "google-cloud-aiplatform==1.59.0"
-pip install "anthropic==0.21.3"
+pip install "anthropic==0.49.0"
 # Run pytest and generate JUnit XML report
 - run:
 name: Build Docker image
@@ -2068,7 +2112,7 @@ jobs:
 python -m venv venv
 . venv/bin/activate
 pip install coverage
-coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
+coverage combine llm_translation_coverage llm_responses_api_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
 coverage xml
 - codecov/upload:
 file: ./coverage.xml
@@ -2197,7 +2241,7 @@ jobs:
 pip install "pytest-retry==1.6.3"
 pip install "pytest-asyncio==0.21.1"
 pip install aiohttp
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 python -m pip install --upgrade pip
 pip install "pydantic==2.7.1"
 pip install "pytest==7.3.1"
@@ -2387,7 +2431,7 @@ workflows:
 only:
 - main
 - /litellm_.*/
-- e2e_openai_misc_endpoints:
+- e2e_openai_endpoints:
 filters:
 branches:
 only:
@@ -2429,6 +2473,12 @@ workflows:
 only:
 - main
 - /litellm_.*/
+- llm_responses_api_testing:
+filters:
+branches:
+only:
+- main
+- /litellm_.*/
 - litellm_mapped_tests:
 filters:
 branches:
@@ -2468,6 +2518,7 @@ workflows:
 - upload-coverage:
 requires:
 - llm_translation_testing
+- llm_responses_api_testing
 - litellm_mapped_tests
 - batches_testing
 - litellm_utils_testing
@@ -2522,10 +2573,11 @@ workflows:
 requires:
 - local_testing
 - build_and_test
-- e2e_openai_misc_endpoints
+- e2e_openai_endpoints
 - load_testing
 - test_bad_database_url
 - llm_translation_testing
+- llm_responses_api_testing
 - litellm_mapped_tests
 - batches_testing
 - litellm_utils_testing
@@ -1,5 +1,5 @@
 # used by CI/CD testing
-openai==1.54.0
+openai==1.66.1
 python-dotenv
 tiktoken
 importlib_metadata
.github/pull_request_template.md
@@ -6,6 +6,16 @@
 <!-- e.g. "Fixes #000" -->

+## Pre-Submission checklist
+
+**Please complete all items before asking a LiteLLM maintainer to review your PR**
+
+- [ ] I have added testing in the `tests/litellm/` directory, **Adding at least 1 test is a hard requirement** - [see details](https://docs.litellm.ai/docs/extras/contributing_code)
+- [ ] I have added a screenshot of my new test passing locally
+- [ ] My PR passes all unit tests on [`make test-unit`](https://docs.litellm.ai/docs/extras/contributing_code)
+- [ ] My PR's scope is as isolated as possible, it only solves 1 specific problem
+
+
 ## Type

 <!-- Select the type of Pull Request -->
@@ -20,10 +30,4 @@
 ## Changes

 <!-- List of changes -->

-## [REQUIRED] Testing - Attach a screenshot of any new tests passing locally
-If UI changes, send a screenshot/GIF of working UI fixes
-
-<!-- Test procedure -->
-
.github/workflows/ghcr_deploy.yml
@@ -80,7 +80,6 @@ jobs:
 permissions:
 contents: read
 packages: write
-#
 steps:
 - name: Checkout repository
 uses: actions/checkout@v4
@@ -112,7 +111,11 @@
 with:
 context: .
 push: true
-tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }} # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
+tags: |
+${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }}
+${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm:main-stable', env.REGISTRY) || '' }}
 labels: ${{ steps.meta.outputs.labels }}
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
@@ -151,7 +154,11 @@ jobs:
 context: .
 file: ./docker/Dockerfile.database
 push: true
-tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
+tags: |
+${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
+${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-database:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-database:main-stable', env.REGISTRY) || '' }}
 labels: ${{ steps.meta-database.outputs.labels }}
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
@@ -190,7 +197,11 @@ jobs:
 context: .
 file: ./docker/Dockerfile.non_root
 push: true
-tags: ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.release_type }}
+tags: |
+${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.release_type }}
+${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-non_root:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-non_root:main-stable', env.REGISTRY) || '' }}
 labels: ${{ steps.meta-non_root.outputs.labels }}
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
@@ -229,7 +240,11 @@ jobs:
 context: .
 file: ./litellm-js/spend-logs/Dockerfile
 push: true
-tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
+tags: |
+${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
+${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-spend_logs:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-spend_logs:main-stable', env.REGISTRY) || '' }}
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8

 build-and-push-helm-chart:
.github/workflows/helm_unit_test.yml (new file)
@@ -0,0 +1,27 @@
name: Helm unit test

on:
  pull_request:
  push:
    branches:
      - main

jobs:
  unit-test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v2

      - name: Set up Helm 3.11.1
        uses: azure/setup-helm@v1
        with:
          version: '3.11.1'

      - name: Install Helm Unit Test Plugin
        run: |
          helm plugin install https://github.com/helm-unittest/helm-unittest --version v0.4.4

      - name: Run unit tests
        run:
          helm unittest -f 'tests/*.yaml' deploy/charts/litellm-helm
Makefile (new file)
@@ -0,0 +1,32 @@
# LiteLLM Makefile
# Simple Makefile for running tests and basic development tasks

.PHONY: help test test-unit test-integration lint format

# Default target
help:
	@echo "Available commands:"
	@echo "  make test               - Run all tests"
	@echo "  make test-unit          - Run unit tests"
	@echo "  make test-integration   - Run integration tests"
	@echo "  make test-unit-helm     - Run helm unit tests"

install-dev:
	poetry install --with dev

lint: install-dev
	poetry run pip install types-requests types-setuptools types-redis types-PyYAML
	cd litellm && poetry run mypy . --ignore-missing-imports

# Testing
test:
	poetry run pytest tests/

test-unit:
	poetry run pytest tests/litellm/

test-integration:
	poetry run pytest tests/ -k "not litellm"

test-unit-helm:
	helm unittest -f 'tests/*.yaml' deploy/charts/litellm-helm
README.md
@@ -40,7 +40,7 @@ LiteLLM manages:
 [**Jump to LiteLLM Proxy (LLM Gateway) Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
 [**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs)

-🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published.
+🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published. [More information about the release cycle here](https://docs.litellm.ai/docs/proxy/release_cycle)

 Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).
@@ -340,71 +340,7 @@ curl 'http://0.0.0.0:4000/key/generate' \

 ## Contributing

-To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change.
-
-Here's how to modify the repo locally:
-
-Step 1: Clone the repo
-
-```
-git clone https://github.com/BerriAI/litellm.git
-```
-
-Step 2: Install dependencies:
-
-```
-pip install -r requirements.txt
-```
-
-Step 3: Test your change:
-
-a. Add a pytest test within `tests/litellm/`
-
-This folder follows the same directory structure as `litellm/`.
-
-If a corresponding test file does not exist, create one.
-
-b. Run the test
-
-```
-cd tests/litellm # pwd: Documents/litellm/litellm/tests/litellm
-pytest /path/to/test_file.py
-```
-
-Step 4: Submit a PR with your changes! 🚀
-
-- push your fork to your GitHub repo
-- submit a PR from there
-
-### Building LiteLLM Docker Image
-
-Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
-
-Step 1: Clone the repo
-
-```
-git clone https://github.com/BerriAI/litellm.git
-```
-
-Step 2: Build the Docker Image
-
-Build using Dockerfile.non_root
-```
-docker build -f docker/Dockerfile.non_root -t litellm_test_image .
-```
-
-Step 3: Run the Docker Image
-
-Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
-```
-docker run \
-    -v $(pwd)/proxy_config.yaml:/app/config.yaml \
-    -e DATABASE_URL="postgresql://xxxxxxxx" \
-    -e LITELLM_MASTER_KEY="sk-1234" \
-    -p 4000:4000 \
-    litellm_test_image \
-    --config /app/config.yaml --detailed_debug
-```
+Interested in contributing? Contributions to LiteLLM Python SDK, Proxy Server, and contributing LLM integrations are both accepted and highly encouraged! [See our Contribution Guide for more details](https://docs.litellm.ai/docs/extras/contributing_code)

 # Enterprise
 For companies that need better security, user management and professional support
@@ -18,7 +18,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.4.1
+version: 0.4.2

 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
@@ -22,6 +22,8 @@ If `db.useStackgresOperator` is used (not yet implemented):
 | Name | Description | Value |
 | ---- | ----------- | ----- |
 | `replicaCount` | The number of LiteLLM Proxy pods to be deployed | `1` |
+| `masterkeySecretName` | The name of the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use the generated secret name. | N/A |
+| `masterkeySecretKey` | The key within the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use `masterkey` as the key. | N/A |
 | `masterkey` | The Master API Key for LiteLLM. If not specified, a random key is generated. | N/A |
 | `environmentSecrets` | An optional array of Secret object names. The keys and values in these secrets will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` |
 | `environmentConfigMaps` | An optional array of ConfigMap object names. The keys and values in these configmaps will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` |
@@ -78,8 +78,8 @@ spec:
 - name: PROXY_MASTER_KEY
   valueFrom:
     secretKeyRef:
-      name: {{ include "litellm.fullname" . }}-masterkey
-      key: masterkey
+      name: {{ .Values.masterkeySecretName | default (printf "%s-masterkey" (include "litellm.fullname" .)) }}
+      key: {{ .Values.masterkeySecretKey | default "masterkey" }}
 {{- if .Values.redis.enabled }}
 - name: REDIS_HOST
   value: {{ include "litellm.redis.serviceName" . }}
@@ -1,3 +1,4 @@
+{{- if not .Values.masterkeySecretName }}
 {{ $masterkey := (.Values.masterkey | default (randAlphaNum 17)) }}
 apiVersion: v1
 kind: Secret
@@ -6,3 +7,4 @@ metadata:
 data:
   masterkey: {{ $masterkey | b64enc }}
 type: Opaque
+{{- end }}
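For intuition, the `b64enc` call in this template stores the key base64-encoded under `data.masterkey`; a minimal Python sketch of the same encoding (the key value below is a placeholder):

```python
import base64

# Placeholder; in the chart this is user-supplied or generated via randAlphaNum 17.
masterkey = "sk-1234"

# This is the value that ends up in the Secret's data.masterkey field.
print(base64.b64encode(masterkey.encode()).decode())
```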
deploy/charts/litellm-helm/tests/deployment_tests.yaml (new file)
@@ -0,0 +1,82 @@
suite: test deployment
templates:
  - deployment.yaml
  - configmap-litellm.yaml
tests:
  - it: should work
    template: deployment.yaml
    set:
      image.tag: test
    asserts:
      - isKind:
          of: Deployment
      - matchRegex:
          path: metadata.name
          pattern: -litellm$
      - equal:
          path: spec.template.spec.containers[0].image
          value: ghcr.io/berriai/litellm-database:test
  - it: should work with tolerations
    template: deployment.yaml
    set:
      tolerations:
        - key: node-role.kubernetes.io/master
          operator: Exists
          effect: NoSchedule
    asserts:
      - equal:
          path: spec.template.spec.tolerations[0].key
          value: node-role.kubernetes.io/master
      - equal:
          path: spec.template.spec.tolerations[0].operator
          value: Exists
  - it: should work with affinity
    template: deployment.yaml
    set:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: topology.kubernetes.io/zone
                    operator: In
                    values:
                      - antarctica-east1
    asserts:
      - equal:
          path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key
          value: topology.kubernetes.io/zone
      - equal:
          path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator
          value: In
      - equal:
          path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0]
          value: antarctica-east1
  - it: should work without masterkeySecretName or masterkeySecretKey
    template: deployment.yaml
    set:
      masterkeySecretName: ""
      masterkeySecretKey: ""
    asserts:
      - contains:
          path: spec.template.spec.containers[0].env
          content:
            name: PROXY_MASTER_KEY
            valueFrom:
              secretKeyRef:
                name: RELEASE-NAME-litellm-masterkey
                key: masterkey
  - it: should work with masterkeySecretName and masterkeySecretKey
    template: deployment.yaml
    set:
      masterkeySecretName: my-secret
      masterkeySecretKey: my-key
    asserts:
      - contains:
          path: spec.template.spec.containers[0].env
          content:
            name: PROXY_MASTER_KEY
            valueFrom:
              secretKeyRef:
                name: my-secret
                key: my-key
deploy/charts/litellm-helm/tests/masterkey-secret_tests.yaml (new file)
@@ -0,0 +1,18 @@
suite: test masterkey secret
templates:
  - secret-masterkey.yaml
tests:
  - it: should create a secret if masterkeySecretName is not set
    template: secret-masterkey.yaml
    set:
      masterkeySecretName: ""
    asserts:
      - isKind:
          of: Secret
  - it: should not create a secret if masterkeySecretName is set
    template: secret-masterkey.yaml
    set:
      masterkeySecretName: my-secret
    asserts:
      - hasDocuments:
          count: 0
@@ -75,6 +75,12 @@ ingress:
 # masterkey: changeit

+# if set, use this secret for the master key; otherwise, autogenerate a new one
+masterkeySecretName: ""
+
+# if set, use this secret key for the master key; otherwise, use the default key
+masterkeySecretKey: ""
+
 # The elements within proxy_config are rendered as config.yaml for the proxy
 # Examples: https://github.com/BerriAI/litellm/tree/main/litellm/proxy/example_config_yaml
 # Reference: https://docs.litellm.ai/docs/proxy/configs
@@ -20,10 +20,18 @@ services:
 STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
 env_file:
 - .env # Load local .env file
+depends_on:
+- db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
+healthcheck: # Defines the health check configuration for the container
+test: ["CMD-SHELL", "curl -f http://localhost:4000/health/liveliness || exit 1"] # Shell form so the `|| exit 1` fallback actually runs
+interval: 30s # Perform health check every 30 seconds
+timeout: 10s # Health check command times out after 10 seconds
+retries: 3 # Retry up to 3 times if health check fails
+start_period: 40s # Wait 40 seconds after container start before beginning health checks

 db:
-image: postgres
+image: postgres:16
 restart: always
 environment:
 POSTGRES_DB: litellm
@@ -31,6 +39,8 @@ services:
 POSTGRES_PASSWORD: dbpassword9090
 ports:
 - "5432:5432"
+volumes:
+- postgres_data:/var/lib/postgresql/data # Persists Postgres data across container restarts
 healthcheck:
 test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
 interval: 1s
@@ -53,6 +63,8 @@ services:
 volumes:
 prometheus_data:
 driver: local
+postgres_data:
+name: litellm_postgres_data # Named volume for Postgres data persistence

 # ...rest of your docker-compose config if any
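The liveliness endpoint that the healthcheck curls can also be probed by hand; a quick sketch, assuming the stack from this compose file is running and port 4000 is published:

```python
import requests

# Same endpoint the compose healthcheck hits; expect HTTP 200 when the proxy is alive.
r = requests.get("http://localhost:4000/health/liveliness", timeout=10)
print(r.status_code, r.text)
```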
docs/my-website/docs/anthropic_unified.md (new file)
@@ -0,0 +1,92 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# /v1/messages [BETA]

LiteLLM provides a BETA endpoint in the spec of Anthropic's `/v1/messages` endpoint.

This currently just supports the Anthropic API.

| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ✅ | |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ✅ | |
| Streaming | ✅ | |
| Fallbacks | ✅ | between anthropic models |
| Loadbalancing | ✅ | between anthropic models |

Planned improvements:
- Vertex AI Anthropic support
- Bedrock Anthropic support

## Usage

<Tabs>
<TabItem label="PROXY" value="proxy">

1. Setup config.yaml

```yaml
model_list:
  - model_name: anthropic-claude
    litellm_params:
      model: claude-3-7-sonnet-latest
```

2. Start proxy

```bash
litellm --config /path/to/config.yaml
```

3. Test it!

```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
-H 'content-type: application/json' \
-H 'x-api-key: $LITELLM_API_KEY' \
-H 'anthropic-version: 2023-06-01' \
-d '{
  "model": "anthropic-claude",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "List 5 important events in the XIX century"
        }
      ]
    }
  ],
  "max_tokens": 4096
}'
```
</TabItem>
<TabItem value="sdk" label="SDK">

```python
from litellm.llms.anthropic.experimental_pass_through.messages.handler import anthropic_messages
import asyncio
import os

# set env
os.environ["ANTHROPIC_API_KEY"] = "my-api-key"

messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]

# Call the handler
async def call():
    response = await anthropic_messages(
        messages=messages,
        api_key=os.environ["ANTHROPIC_API_KEY"],
        model="claude-3-haiku-20240307",
        max_tokens=100,
    )

asyncio.run(call())
```

</TabItem>
</Tabs>
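For reference, the proxy call from step 3 above, sketched with Python's `requests` instead of curl. This assumes the proxy from step 2 is running on `0.0.0.0:4000` and that `sk-1234` stands in for your LiteLLM key:

```python
import requests

# Header and body shapes are taken directly from the curl example above.
resp = requests.post(
    "http://0.0.0.0:4000/v1/messages",
    headers={
        "content-type": "application/json",
        "x-api-key": "sk-1234",  # placeholder LiteLLM proxy key
        "anthropic-version": "2023-06-01",
    },
    json={
        "model": "anthropic-claude",
        "messages": [{"role": "user", "content": "List 5 important events in the XIX century"}],
        "max_tokens": 4096,
    },
)
print(resp.json())
```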
@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-# Assistants API
+# /assistants

 Covers Threads, Messages, Assistants.
@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-# [BETA] Batches API
+# /batches

 Covers Batches, Files
@@ -3,7 +3,13 @@ import TabItem from '@theme/TabItem';

 # Prompt Caching

-For OpenAI + Anthropic + Deepseek, LiteLLM follows the OpenAI prompt caching usage object format:
+Supported Providers:
+- OpenAI (`openai/`)
+- Anthropic API (`anthropic/`)
+- Bedrock (`bedrock/`, `bedrock/invoke/`, `bedrock/converse`) ([All models bedrock supports prompt caching on](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html))
+- Deepseek API (`deepseek/`)
+
+For the supported providers, LiteLLM follows the OpenAI prompt caching usage object format:

 ```bash
 "usage": {
@@ -190,3 +190,137 @@ Expected Response

</TabItem>
</Tabs>


## Explicitly specify image type

If you have images without a mime-type, or if litellm is incorrectly inferring the mime type of your image (e.g. calling `gs://` URLs with vertex ai), you can set this explicitly via the `format` param.

```python
"image_url": {
    "url": "gs://my-gs-image",
    "format": "image/jpeg"
}
```

LiteLLM will use this for any API endpoint, which supports specifying mime-type (e.g. anthropic/bedrock/vertex ai).

For others (e.g. openai), it will be ignored.

<Tabs>
<TabItem label="SDK" value="sdk">

```python
import os
from litellm import completion

os.environ["ANTHROPIC_API_KEY"] = "your-api-key"

# anthropic call
response = completion(
    model="claude-3-7-sonnet-latest",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What’s in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
                        "format": "image/jpeg"
                    }
                }
            ]
        }
    ],
)
```

</TabItem>
<TabItem label="PROXY" value="proxy">

1. Define vision models on config.yaml

```yaml
model_list:
  - model_name: gpt-4-vision-preview # OpenAI gpt-4-vision-preview
    litellm_params:
      model: openai/gpt-4-vision-preview
      api_key: os.environ/OPENAI_API_KEY
  - model_name: llava-hf # Custom OpenAI compatible model
    litellm_params:
      model: openai/llava-hf/llava-v1.6-vicuna-7b-hf
      api_base: http://localhost:8000
      api_key: fake-key
    model_info:
      supports_vision: True # set supports_vision to True so /model/info returns this attribute as True
```

2. Run proxy server

```bash
litellm --config config.yaml
```

3. Test it using the OpenAI Python SDK

```python
import os
from openai import OpenAI

client = OpenAI(
    api_key="sk-1234", # your litellm proxy api key
)

response = client.chat.completions.create(
    model="gpt-4-vision-preview", # use model="llava-hf" to test your custom OpenAI endpoint
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What’s in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
                        "format": "image/jpeg"
                    }
                }
            ]
        }
    ],
)
```

</TabItem>
</Tabs>

## Spec

```
"image_url": str

OR

"image_url": {
    "url": "url OR base64 encoded str",
    "detail": "openai-only param",
    "format": "specify mime-type of image"
}
```
@@ -46,7 +46,7 @@ For security inquiries, please contact us at support@berri.ai
 |-------------------|-------------------------------------------------------------------------------------------------|
 | SOC 2 Type I | Certified. Report available upon request on Enterprise plan. |
 | SOC 2 Type II | In progress. Certificate available by April 15th, 2025 |
-| ISO27001 | In progress. Certificate available by February 7th, 2025 |
+| ISO 27001 | Certified. Report available upon request on Enterprise |

 ## Supported Data Regions for LiteLLM Cloud
@@ -137,7 +137,7 @@ Point of contact email address for general security-related questions: krrish@berri.ai
 Has the Vendor been audited / certified?
 - SOC 2 Type I. Certified. Report available upon request on Enterprise plan.
 - SOC 2 Type II. In progress. Certificate available by April 15th, 2025.
-- ISO27001. In progress. Certificate available by February 7th, 2025.
+- ISO 27001. Certified. Report available upon request on Enterprise plan.

 Has an information security management system been implemented?
 - Yes - [CodeQL](https://codeql.github.com/) and a comprehensive ISMS covering multiple security domains.
@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-# Embeddings
+# /embeddings

 ## Quick Start
 ```python
@@ -34,9 +34,9 @@ You can use our cloud product where we setup a dedicated instance for you.

 Professional Support can assist with LLM/Provider integrations, deployment, upgrade management, and LLM Provider troubleshooting. We can’t solve your own infrastructure-related issues but we will guide you to fix them.

-- 1 hour for Sev0 issues
-- 6 hours for Sev1
-- 24h for Sev2-Sev3 between 7am – 7pm PT (Monday through Saturday)
+- 1 hour for Sev0 issues - 100% production traffic is failing
+- 6 hours for Sev1 - <100% production traffic is failing
+- 24h for Sev2-Sev3 between 7am – 7pm PT (Monday through Saturday) - setup issues e.g. Redis working on our end, but not on your infrastructure.
+- 72h SLA for patching vulnerabilities in the software.

 **We can offer custom SLAs** based on your needs and the severity of the issue
docs/my-website/docs/extras/contributing_code.md (new file)
@@ -0,0 +1,106 @@
# Contributing Code

## **Checklist before submitting a PR**

Here are the core requirements for any PR submitted to LiteLLM

- [ ] Add testing, **Adding at least 1 test is a hard requirement** - [see details](#2-adding-testing-to-your-pr)
- [ ] Ensure your PR passes the following tests:
    - [ ] [Unit Tests](#3-running-unit-tests)
    - [ ] [Formatting / Linting Tests](#35-running-linting-tests)
- [ ] Keep scope as isolated as possible. As a general rule, your changes should address 1 specific problem at a time

## Quick start

## 1. Setup your local dev environment

Here's how to modify the repo locally:

Step 1: Clone the repo

```shell
git clone https://github.com/BerriAI/litellm.git
```

Step 2: Install dev dependencies:

```shell
poetry install --with dev --extras proxy
```

That's it, your local dev environment is ready!

## 2. Adding Testing to your PR

- Add your test to the [`tests/litellm/` directory](https://github.com/BerriAI/litellm/tree/main/tests/litellm)
- This directory 1:1 maps to the `litellm/` directory, and can only contain mocked tests.
- Do not add real llm api calls to this directory.

### 2.1 File Naming Convention for `tests/litellm/`

The `tests/litellm/` directory follows the same directory structure as `litellm/`.

- `tests/litellm/proxy/test_caching_routes.py` maps to `litellm/proxy/caching_routes.py`
- `test_{filename}.py` maps to `litellm/{filename}.py`

## 3. Running Unit Tests

Run the following command on the root of the litellm directory

```shell
make test-unit
```

## 3.5 Running Linting Tests

Run the following command on the root of the litellm directory

```shell
make lint
```

LiteLLM uses mypy for linting. On ci/cd we also run `black` for formatting.

## 4. Submit a PR with your changes!

- push your fork to your GitHub repo
- submit a PR from there

## Advanced
### Building LiteLLM Docker Image

Some people might want to build the LiteLLM docker image themselves. Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.

Step 1: Clone the repo

```shell
git clone https://github.com/BerriAI/litellm.git
```

Step 2: Build the Docker Image

Build using Dockerfile.non_root

```shell
docker build -f docker/Dockerfile.non_root -t litellm_test_image .
```

Step 3: Run the Docker Image

Make sure config.yaml is present in the root directory. This is your litellm proxy config file.

```shell
docker run \
    -v $(pwd)/proxy_config.yaml:/app/config.yaml \
    -e DATABASE_URL="postgresql://xxxxxxxx" \
    -e LITELLM_MASTER_KEY="sk-1234" \
    -p 4000:4000 \
    litellm_test_image \
    --config /app/config.yaml --detailed_debug
```
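As a concrete (illustrative) instance of the naming convention above, a minimal mocked test might look like this; the file name and the use of `litellm.token_counter` are example choices, picked because the call runs locally and makes no LLM API request:

```python
# tests/litellm/test_utils.py — maps to litellm/utils.py
import litellm


def test_token_counter_counts_something():
    # Local-only call: token counting does not hit any LLM API,
    # which keeps this valid for the mocked tests/litellm/ directory.
    messages = [{"role": "user", "content": "hi"}]
    assert litellm.token_counter(model="gpt-3.5-turbo", messages=messages) > 0
```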
@@ -2,7 +2,7 @@
 import TabItem from '@theme/TabItem';
 import Tabs from '@theme/Tabs';

-# Files API
+# /files

 Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-# [Beta] Fine-tuning API
+# /fine_tuning

 :::info
@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-# Moderation
+# /moderations

 ### Usage
@ -78,6 +78,9 @@ Following are the allowed fields in metadata, their types, and their description
|
|||
* `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
|
||||
* `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
|
||||
* `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
|
||||
* `tags: Optional[list]` - This is a list of tags. This is useful for segmenting inference calls by tags.
|
||||
* `user_feedback: Optional[str]` - The end user’s feedback.
|
||||
* `model_options: Optional[dict]` - This is a dictionary of model options. This is useful for getting insights into how model behavior affects your end users.
|
||||
* `custom_attributes: Optional[dict]` - This is a dictionary of custom attributes. This is useful for additional information about the inference.
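A short sketch of attaching a few of these fields to a request (callback setup as described elsewhere on this page; the model and values are placeholders):

```python
import litellm

litellm.success_callback = ["athina"]  # per this integration's setup

response = litellm.completion(
    model="gpt-4o-mini",  # placeholder model
    messages=[{"role": "user", "content": "What is ATP?"}],
    metadata={
        "user_query": "What is ATP?",                     # allowed field from the list above
        "tags": ["biology", "faq"],                       # allowed field
        "custom_attributes": {"source": "docs-example"},  # allowed field
    },
)
```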

## Using a self hosted deployment of Athina
docs/my-website/docs/projects/PDL.md (new file)
@@ -0,0 +1,5 @@
PDL - A YAML-based approach to prompt programming

Github: https://github.com/IBM/prompt-declaration-language

PDL is a declarative approach to prompt programming, helping users to accumulate messages implicitly, with support for model chaining and tool use.
docs/my-website/docs/projects/pgai.md (new file)
@@ -0,0 +1,9 @@
# pgai

[pgai](https://github.com/timescale/pgai) is a suite of tools to develop RAG, semantic search, and other AI applications more easily with PostgreSQL.

If you don't know what pgai is yet check out the [README](https://github.com/timescale/pgai)!

If you're already familiar with pgai, you can find litellm specific docs here:
- Litellm for [model calling](https://github.com/timescale/pgai/blob/main/docs/model_calling/litellm.md) in pgai
- Use the [litellm provider](https://github.com/timescale/pgai/blob/main/docs/vectorizer/api-reference.md#aiembedding_litellm) to automatically create embeddings for your data via the pgai vectorizer.
@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Infinity

 | Property | Details |
@@ -12,6 +15,9 @@
 ```python
 from litellm import rerank
+import os
+
+os.environ["INFINITY_API_BASE"] = "http://localhost:8080"

 response = rerank(
     model="infinity/rerank",
@@ -65,3 +71,114 @@ curl http://0.0.0.0:4000/rerank \
 ```

## Supported Cohere Rerank API Params

| Param | Type | Description |
|-------|-------|-------|
| `query` | `str` | The query to rerank the documents against |
| `documents` | `list[str]` | The documents to rerank |
| `top_n` | `int` | The number of documents to return |
| `return_documents` | `bool` | Whether to return the documents in the response |

### Usage - Return Documents

<Tabs>
<TabItem value="sdk" label="SDK">

```python
response = rerank(
    model="infinity/rerank",
    query="What is the capital of France?",
    documents=["Paris", "London", "Berlin", "Madrid"],
    return_documents=True,
)
```

</TabItem>

<TabItem value="proxy" label="PROXY">

```bash
curl http://0.0.0.0:4000/rerank \
  -H "Authorization: Bearer sk-1234" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "custom-infinity-rerank",
    "query": "What is the capital of France?",
    "documents": [
      "Paris",
      "London",
      "Berlin",
      "Madrid"
    ],
    "return_documents": true
  }'
```

</TabItem>
</Tabs>

## Pass Provider-specific Params

Any unmapped params will be passed to the provider as-is.

<Tabs>
<TabItem value="sdk" label="SDK">

```python
from litellm import rerank
import os

os.environ["INFINITY_API_BASE"] = "http://localhost:8080"

response = rerank(
    model="infinity/rerank",
    query="What is the capital of France?",
    documents=["Paris", "London", "Berlin", "Madrid"],
    raw_scores=True,  # 👈 PROVIDER-SPECIFIC PARAM
)
```
</TabItem>

<TabItem value="proxy" label="PROXY">

1. Setup config.yaml

```yaml
model_list:
  - model_name: custom-infinity-rerank
    litellm_params:
      model: infinity/rerank
      api_base: https://localhost:8080
      raw_scores: True # 👈 EITHER SET PROVIDER-SPECIFIC PARAMS HERE OR IN REQUEST BODY
```

2. Start litellm

```bash
litellm --config /path/to/config.yaml

# RUNNING on http://0.0.0.0:4000
```

3. Test it!

```bash
curl http://0.0.0.0:4000/rerank \
  -H "Authorization: Bearer sk-1234" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "custom-infinity-rerank",
    "query": "What is the capital of the United States?",
    "documents": [
      "Carson City is the capital city of the American state of Nevada.",
      "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
      "Washington, D.C. is the capital of the United States.",
      "Capital punishment has existed in the United States since before it was a country."
    ],
    "raw_scores": true
  }'
```
</TabItem>

</Tabs>
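To read the reranked order back out of the SDK calls above, a sketch like the following works; field names follow the Cohere-style rerank response LiteLLM returns, so treat them as illustrative:

```python
# `response` is the result of one of the rerank() calls shown above.
for result in response.results:
    # Each result carries the original document index and its relevance score.
    print(result["index"], result["relevance_score"])
```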
docs/my-website/docs/providers/snowflake.md (new file)
@@ -0,0 +1,90 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Snowflake

| Property | Details |
|-------|-------|
| Description | The Snowflake Cortex LLM REST API lets you access the COMPLETE function via HTTP POST requests |
| Provider Route on LiteLLM | `snowflake/` |
| Link to Provider Doc | [Snowflake ↗](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-llm-rest-api) |
| Base URL | `https://{account-id}.snowflakecomputing.com/api/v2/cortex/inference:complete` |
| Supported OpenAI Endpoints | `/chat/completions`, `/completions` |

Currently, Snowflake's REST API does not have an endpoint for `snowflake-arctic-embed` embedding models. If you want to use these embedding models with LiteLLM, you can call them through our Hugging Face provider.

Find the Arctic Embed models [here](https://huggingface.co/collections/Snowflake/arctic-embed-661fd57d50fab5fc314e4c18) on Hugging Face.

## Supported OpenAI Parameters

```
"temperature",
"max_tokens",
"top_p",
"response_format"
```

## API Keys

Snowflake does not have API keys. Instead, you access the Snowflake API with your JWT token and account identifier.

```python
import os
os.environ["SNOWFLAKE_JWT"] = "YOUR JWT"
os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER"
```

## Usage

```python
import os
from litellm import completion

## set ENV variables
os.environ["SNOWFLAKE_JWT"] = "YOUR JWT"
os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER"

# Snowflake call
response = completion(
    model="snowflake/mistral-7b",
    messages=[{"content": "Hello, how are you?", "role": "user"}]
)
```

## Usage with LiteLLM Proxy

#### 1. Required env variables

```bash
export SNOWFLAKE_JWT=""
export SNOWFLAKE_ACCOUNT_ID=""
```

#### 2. Start the proxy

```yaml
model_list:
  - model_name: mistral-7b
    litellm_params:
      model: snowflake/mistral-7b
      api_key: YOUR_API_KEY
      api_base: https://YOUR-ACCOUNT-ID.snowflakecomputing.com/api/v2/cortex/inference:complete
```

```bash
litellm --config /path/to/config.yaml
```

#### 3. Test it

```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
  "model": "snowflake/mistral-7b",
  "messages": [
    {
      "role": "user",
      "content": "Hello, how are you?"
    }
  ]
}'
```
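Since the proxy exposes an OpenAI-compatible endpoint, step 3 can also be driven with the OpenAI Python SDK; a sketch, assuming the proxy above is running locally (`sk-1234` is a stand-in for whatever key your proxy expects):

```python
from openai import OpenAI

client = OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# "mistral-7b" is the model_name from the config.yaml above.
resp = client.chat.completions.create(
    model="mistral-7b",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
print(resp.choices[0].message.content)
```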
@@ -405,13 +405,15 @@ If this was your initial VertexAI Grounding code,

 ```python
 import vertexai
 from vertexai.generative_models import GenerativeModel, GenerationConfig, Tool, grounding

 vertexai.init(project=project_id, location="us-central1")

 model = GenerativeModel("gemini-1.5-flash-001")

 # Use Google Search for grounding
-tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval(disable_attributon=False))
+tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval())

 prompt = "When is the next total solar eclipse in US?"
 response = model.generate_content(
@ -852,6 +854,7 @@ litellm.vertex_location = "us-central1 # Your Location
|
|||
| claude-3-5-sonnet@20240620 | `completion('vertex_ai/claude-3-5-sonnet@20240620', messages)` |
|
||||
| claude-3-sonnet@20240229 | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
|
||||
| claude-3-haiku@20240307 | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |
|
||||
| claude-3-7-sonnet@20250219 | `completion('vertex_ai/claude-3-7-sonnet@20250219', messages)` |
|
||||
|
||||
### Usage
|
||||
|
||||
|
@@ -926,6 +929,119 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 </Tabs>

### Usage - `thinking` / `reasoning_content`

<Tabs>
<TabItem value="sdk" label="SDK">

```python
from litellm import completion

resp = completion(
    model="vertex_ai/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)
```

</TabItem>

<TabItem value="proxy" label="PROXY">

1. Setup config.yaml

```yaml
- model_name: claude-3-7-sonnet-20250219
  litellm_params:
    model: vertex_ai/claude-3-7-sonnet-20250219
    vertex_ai_project: "my-test-project"
    vertex_ai_location: "us-west-1"
```

2. Start proxy

```bash
litellm --config /path/to/config.yaml
```

3. Test it!

```bash
curl http://0.0.0.0:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
  -d '{
    "model": "claude-3-7-sonnet-20250219",
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
    "thinking": {"type": "enabled", "budget_tokens": 1024}
  }'
```

</TabItem>
</Tabs>

**Expected Response**

```python
ModelResponse(
    id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
    created=1740470510,
    model='claude-3-7-sonnet-20250219',
    object='chat.completion',
    system_fingerprint=None,
    choices=[
        Choices(
            finish_reason='stop',
            index=0,
            message=Message(
                content="The capital of France is Paris.",
                role='assistant',
                tool_calls=None,
                function_call=None,
                provider_specific_fields={
                    'citations': None,
                    'thinking_blocks': [
                        {
                            'type': 'thinking',
                            'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
                            'signature': 'EuYBCkQYAiJAy6...'
                        }
                    ]
                }
            ),
            thinking_blocks=[
                {
                    'type': 'thinking',
                    'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
                    'signature': 'EuYBCkQYAiJAy6AGB...'
                }
            ],
            reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
        )
    ],
    usage=Usage(
        completion_tokens=68,
        prompt_tokens=42,
        total_tokens=110,
        completion_tokens_details=None,
        prompt_tokens_details=PromptTokensDetailsWrapper(
            audio_tokens=None,
            cached_tokens=0,
            text_tokens=None,
            image_tokens=None
        ),
        cache_creation_input_tokens=0,
        cache_read_input_tokens=0
    )
)
```
|
||||
|
||||
| Model Name | Function Call |
|
||||
|
@@ -1572,6 +1688,14 @@ assert isinstance(

 Pass any file supported by Vertex AI, through LiteLLM.

+LiteLLM supports the following image types passed in URL
+
+```
+Images with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
+Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
+Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4
+Base64 Encoded Local Images
+```
+
 <Tabs>
 <TabItem value="sdk" label="SDK">
|
@ -157,6 +157,98 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
|
|||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Send Video URL to VLLM
|
||||
|
||||
Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)
|
||||
|
||||
There are two ways to send a video url to VLLM:
|
||||
|
||||
1. Pass the video url directly
|
||||
|
||||
```
|
||||
{"type": "video_url", "video_url": {"url": video_url}},
|
||||
```
|
||||
|
||||
2. Pass the video data as base64
|
||||
|
||||
```
|
||||
{"type": "video_url", "video_url": {"url": f"data:video/mp4;base64,{video_data_base64}"}}
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
response = completion(
|
||||
model="hosted_vllm/qwen", # pass the vllm model name
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Summarize the following video"
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {
|
||||
"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
api_base="https://hosted-vllm-api.co")
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: my-model
|
||||
litellm_params:
|
||||
model: hosted_vllm/qwen # add hosted_vllm/ prefix to route as OpenAI provider
|
||||
api_base: https://hosted-vllm-api.co # add api base for OpenAI compatible provider
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl -X POST http://0.0.0.0:4000/chat/completions \
|
||||
-H "Authorization: Bearer sk-1234" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "my-model",
|
||||
"messages": [
|
||||
{"role": "user", "content":
|
||||
[
|
||||
{"type": "text", "text": "Summarize the following video"},
|
||||
{"type": "video_url", "video_url": {"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## (Deprecated) for `vllm pip package`
|
||||
### Using - `litellm.completion`
|
||||
|
||||
|
|
|
@ -10,17 +10,13 @@ Role-based access control (RBAC) is based on Organizations, Teams and Internal U
|
|||
|
||||
## Roles
|
||||
|
||||
**Admin Roles**
|
||||
- `proxy_admin`: admin over the platform
|
||||
- `proxy_admin_viewer`: can login, view all keys, view all spend. **Cannot** create keys/delete keys/add new users
|
||||
|
||||
**Organization Roles**
|
||||
- `org_admin`: admin over the organization. Can create teams and users within their organization
|
||||
|
||||
**Internal User Roles**
|
||||
- `internal_user`: can login, view/create/delete their own keys, view their spend. **Cannot** add new users.
|
||||
- `internal_user_viewer`: can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users.
|
||||
|
||||
| Role Type | Role Name | Permissions |
|
||||
|-----------|-----------|-------------|
|
||||
| **Admin** | `proxy_admin` | Admin over the platform |
|
||||
| | `proxy_admin_viewer` | Can login, view all keys, view all spend. **Cannot** create keys/delete keys/add new users |
|
||||
| **Organization** | `org_admin` | Admin over the organization. Can create teams and users within their organization |
|
||||
| **Internal User** | `internal_user` | Can login, view/create/delete their own keys, view their spend. **Cannot** add new users |
|
||||
| | `internal_user_viewer` | Can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users |
|
||||
|
||||
## Onboarding Organizations
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@ import TabItem from '@theme/TabItem';
|
|||
- Virtual Key Rate Limit
|
||||
- User Rate Limit
|
||||
- Team Limit
|
||||
- The `_PROXY_track_cost_callback` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172)
|
||||
- The `_ProxyDBLogger` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172)
|
||||
|
||||
## Frequently Asked Questions
|
||||
|
||||
|
|
|
@ -499,6 +499,7 @@ router_settings:
|
|||
| SMTP_USERNAME | Username for SMTP authentication (do not set if SMTP does not require auth)
|
||||
| SPEND_LOGS_URL | URL for retrieving spend logs
|
||||
| SSL_CERTIFICATE | Path to the SSL certificate file
|
||||
| SSL_SECURITY_LEVEL | [BETA] Security level for SSL/TLS connections. E.g. `DEFAULT@SECLEVEL=1`
|
||||
| SSL_VERIFY | Flag to enable or disable SSL certificate verification
|
||||
| SUPABASE_KEY | API key for Supabase service
|
||||
| SUPABASE_URL | Base URL for Supabase instance
|
||||
|
|
|
@ -448,6 +448,34 @@ model_list:
|
|||
|
||||
s/o to [@David Manouchehri](https://www.linkedin.com/in/davidmanouchehri/) for helping with this.
|
||||
|
||||
### Centralized Credential Management
|
||||
|
||||
Define credentials once and reuse them across multiple models. This helps with:
|
||||
- Secret rotation
|
||||
- Reducing config duplication
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-4o
|
||||
litellm_params:
|
||||
model: azure/gpt-4o
|
||||
litellm_credential_name: default_azure_credential # Reference credential below
|
||||
|
||||
credential_list:
|
||||
- credential_name: default_azure_credential
|
||||
credential_values:
|
||||
api_key: os.environ/AZURE_API_KEY # Load from environment
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_version: "2023-05-15"
|
||||
credential_info:
|
||||
description: "Production credentials for EU region"
|
||||
```
|
||||
|
||||
#### Key Parameters
|
||||
- `credential_name`: Unique identifier for the credential set
|
||||
- `credential_values`: Key-value pairs of credentials/secrets (supports `os.environ/` syntax)
|
||||
- `credential_info`: Key-value pairs of user-provided information about the credential. No specific keys are required, but the dictionary must exist.
|
||||
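For example, a single credential can back several deployments (model names below are illustrative):

```yaml
model_list:
  - model_name: gpt-4o
    litellm_params:
      model: azure/gpt-4o
      litellm_credential_name: default_azure_credential
  - model_name: gpt-4o-mini
    litellm_params:
      model: azure/gpt-4o-mini
      litellm_credential_name: default_azure_credential # same credential, no duplicated secrets
```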
|
||||
### Load API Keys from Secret Managers (Azure Vault, etc)
|
||||
|
||||
[**Using Secret Managers with LiteLLM Proxy**](../secret)
|
||||
|
|
|
@ -46,18 +46,17 @@ You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/ma
|
|||
|
||||
| Table Name | Description | Row Insert Frequency |
|
||||
|------------|-------------|---------------------|
|
||||
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request** |
|
||||
| LiteLLM_ErrorLogs | Captures failed requests and errors. Stores exception details and request information. Helps with debugging and monitoring. | **Medium - on errors only** |
|
||||
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request - Success or Failure** |
|
||||
| LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** |
|
||||
|
||||
## Disable `LiteLLM_SpendLogs` & `LiteLLM_ErrorLogs`
|
||||
## Disable `LiteLLM_SpendLogs`
|
||||
|
||||
You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` in the `general_settings` section of your proxy_config.yaml file.
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
disable_spend_logs: True # Disable writing spend logs to DB
|
||||
disable_error_logs: True # Disable writing error logs to DB
|
||||
disable_error_logs: True # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
|
||||
```
|
||||
|
||||
### What is the impact of disabling these logs?
|
||||
|
|
|
@ -37,7 +37,7 @@ guardrails:
|
|||
- guardrail_name: aim-protected-app
|
||||
litellm_params:
|
||||
guardrail: aim
|
||||
mode: pre_call # 'during_call' is also available
|
||||
mode: [pre_call, post_call] # "during_call" is also available
|
||||
api_key: os.environ/AIM_API_KEY
|
||||
api_base: os.environ/AIM_API_BASE # Optional, use only when using a self-hosted Aim Outpost
|
||||
```
|
||||
|
|
|
@ -78,6 +78,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
|
|||
| `api_base` | `Optional[str]` | Optional API base URL |
|
||||
| `response_cost` | `Optional[str]` | Optional response cost |
|
||||
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
|
||||
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
|
||||
|
||||
## StandardLoggingModelInformation
|
||||
|
||||
|
|
53
docs/my-website/docs/proxy/master_key_rotations.md
Normal file
|
@ -0,0 +1,53 @@
|
|||
# Rotating Master Key
|
||||
|
||||
Here are our recommended steps for rotating your master key.
|
||||
|
||||
|
||||
**1. Backup your DB**
|
||||
In case of any errors during the encryption/decryption process, this will allow you to revert to the current state without issues.
|
||||
|
||||
**2. Call `/key/regenerate` with the new master key**
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://localhost:4000/key/regenerate' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"key": "sk-1234",
|
||||
"new_master_key": "sk-PIp1h0RekR"
|
||||
}'
|
||||
```
|
||||
|
||||
This will re-encrypt any models in your Proxy_ModelTable with the new master key.
|
||||
|
||||
Expect to start seeing decryption errors in logs, as your old master key is no longer able to decrypt the new values.
|
||||
|
||||
```bash
|
||||
raise Exception("Unable to decrypt value={}".format(v))
|
||||
Exception: Unable to decrypt value=<new-encrypted-value>
|
||||
```
|
||||
|
||||
**3. Update LITELLM_MASTER_KEY**
|
||||
|
||||
In your environment variables, update the value of `LITELLM_MASTER_KEY` to the `new_master_key` from Step 2.
|
||||
|
||||
This ensures the key used for decryption from the DB is the new key.
|
||||
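For example, if you run the proxy with environment variables (value shown is the illustrative key from Step 2):

```bash
export LITELLM_MASTER_KEY="sk-PIp1h0RekR"
```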
|
||||
**4. Test it**
|
||||
|
||||
Make a test request to a model stored on the proxy with a LiteLLM key (new master key or virtual key) and confirm it works.
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "gpt-4o-mini", # 👈 REPLACE with 'public model name' for any db-model
|
||||
"messages": [
|
||||
{
|
||||
"content": "Hey, how's it going",
|
||||
"role": "user"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
|
@ -107,9 +107,9 @@ general_settings:
|
|||
|
||||
By default, LiteLLM writes several types of logs to the database:
|
||||
- Every LLM API request to the `LiteLLM_SpendLogs` table
|
||||
- LLM Exceptions to the `LiteLLM_LogsErrors` table
|
||||
- LLM Exceptions to the `LiteLLM_SpendLogs` table
|
||||
|
||||
If you're not viewing these logs on the LiteLLM UI (most users use Prometheus for monitoring), you can disable them by setting the following flags to `True`:
|
||||
If you're not viewing these logs on the LiteLLM UI, you can disable them by setting the following flags to `True`:
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
|
|
12
docs/my-website/docs/proxy/release_cycle.md
Normal file
|
@ -0,0 +1,12 @@
|
|||
# Release Cycle
|
||||
|
||||
LiteLLM Proxy has the following release cycle:
|
||||
|
||||
- `v1.x.x-nightly`: These are releases which pass CI/CD.
|
||||
- `v1.x.x.rc`: These are releases which pass CI/CD + [manual review](https://github.com/BerriAI/litellm/discussions/8495#discussioncomment-12180711).
|
||||
- `v1.x.x` OR `v1.x.x-stable`: These are releases which pass CI/CD + manual review + 3 days of production testing.
|
||||
|
||||
In production, we recommend using the latest `v1.x.x` release.
|
||||
|
||||
|
||||
Follow our release notes [here](https://github.com/BerriAI/litellm/releases).
|
|
@ -102,7 +102,19 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
|
|||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Advanced - Set Accepted JWT Scope Names
|
||||
## Advanced
|
||||
|
||||
### Multiple OIDC providers
|
||||
|
||||
Use this if you want LiteLLM to validate your JWT against multiple OIDC providers (e.g. Google Cloud, GitHub Auth).
|
||||
|
||||
Set `JWT_PUBLIC_KEY_URL` in your environment to a comma-separated list of URLs for your OIDC providers.
|
||||
|
||||
```bash
|
||||
export JWT_PUBLIC_KEY_URL="https://demo.duendesoftware.com/.well-known/openid-configuration/jwks,https://accounts.google.com/.well-known/openid-configuration/jwks"
|
||||
```
|
||||
|
||||
### Set Accepted JWT Scope Names
|
||||
|
||||
Change the string in the JWT 'scopes' field that LiteLLM evaluates to determine if a user has admin access.
|
||||
|
||||
|
@ -114,7 +126,7 @@ general_settings:
|
|||
admin_jwt_scope: "litellm-proxy-admin"
|
||||
```
|
||||
|
||||
## Tracking End-Users / Internal Users / Team / Org
|
||||
### Tracking End-Users / Internal Users / Team / Org
|
||||
|
||||
Set the field in the JWT token that corresponds to a LiteLLM user / team / org.
|
||||
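A minimal sketch using LiteLLM's `litellm_jwtauth` settings (the claim names below are illustrative):

```yaml
general_settings:
  enable_jwt_auth: True
  litellm_jwtauth:
    user_id_jwt_field: "sub"       # 👈 claim that maps to a LiteLLM user
    team_id_jwt_field: "client_id" # 👈 claim that maps to a LiteLLM team
    org_id_jwt_field: "org_id"     # 👈 claim that maps to a LiteLLM org
```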
|
||||
|
@ -156,7 +168,7 @@ scope: ["litellm-proxy-admin",...]
|
|||
scope: "litellm-proxy-admin ..."
|
||||
```
|
||||
|
||||
## Control model access with Teams
|
||||
### Control model access with Teams
|
||||
|
||||
|
||||
1. Specify the JWT field that contains the team ids, that the user belongs to.
|
||||
|
@ -207,11 +219,11 @@ OIDC Auth for API: [**See Walkthrough**](https://www.loom.com/share/00fe2deab59a
|
|||
- If all checks pass, allow the request
|
||||
|
||||
|
||||
## Advanced - Custom Validate
|
||||
### Custom JWT Validate
|
||||
|
||||
Validate a JWT token using custom logic, if you need an extra way to verify that tokens are valid for LiteLLM Proxy.
|
||||
|
||||
### 1. Setup custom validate function
|
||||
#### 1. Setup custom validate function
|
||||
|
||||
```python
|
||||
from typing import Literal
|
||||
|
@ -230,7 +242,7 @@ def my_custom_validate(token: str) -> Literal[True]:
|
|||
return True
|
||||
```
|
||||
|
||||
### 2. Setup config.yaml
|
||||
#### 2. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
|
@ -243,7 +255,7 @@ general_settings:
|
|||
custom_validate: custom_validate.my_custom_validate # 👈 custom validate function
|
||||
```
|
||||
|
||||
### 3. Test the flow
|
||||
#### 3. Test the flow
|
||||
|
||||
**Expected JWT**
|
||||
|
||||
|
@ -265,7 +277,7 @@ general_settings:
|
|||
|
||||
|
||||
|
||||
## Advanced - Allowed Routes
|
||||
### Allowed Routes
|
||||
|
||||
Configure which routes a JWT can access via the config.
|
||||
|
||||
|
@ -297,7 +309,7 @@ general_settings:
|
|||
team_allowed_routes: ["/v1/chat/completions"] # 👈 Set accepted routes
|
||||
```
|
||||
|
||||
## Advanced - Caching Public Keys
|
||||
### Caching Public Keys
|
||||
|
||||
Control how long public keys are cached for (in seconds).
|
||||
|
||||
|
@ -311,7 +323,7 @@ general_settings:
|
|||
public_key_ttl: 600 # 👈 KEY CHANGE
|
||||
```
|
||||
|
||||
## Advanced - Custom JWT Field
|
||||
### Custom JWT Field
|
||||
|
||||
Set a custom field in which the team_id exists. By default, the 'client_id' field is checked.
|
||||
|
||||
|
@ -323,14 +335,7 @@ general_settings:
|
|||
team_id_jwt_field: "client_id" # 👈 KEY CHANGE
|
||||
```
|
||||
|
||||
## All Params
|
||||
|
||||
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)
|
||||
|
||||
|
||||
|
||||
|
||||
## Advanced - Block Teams
|
||||
### Block Teams
|
||||
|
||||
To block all requests for a certain team id, use `/team/block`
|
||||
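A minimal sketch (the team id is illustrative):

```bash
curl --location 'http://0.0.0.0:4000/team/block' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "my-team-id"}'
```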
|
||||
|
@ -357,7 +362,7 @@ curl --location 'http://0.0.0.0:4000/team/unblock' \
|
|||
```
|
||||
|
||||
|
||||
## Advanced - Upsert Users + Allowed Email Domains
|
||||
### Upsert Users + Allowed Email Domains
|
||||
|
||||
Give users who belong to a specific email domain automatic access to the proxy.
|
||||
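A minimal sketch, assuming the `litellm_jwtauth` settings below (the email claim and domain are illustrative):

```yaml
general_settings:
  enable_jwt_auth: True
  litellm_jwtauth:
    user_email_jwt_field: "email"         # claim holding the user's email
    user_allowed_email_domain: "berri.ai" # only this domain gets automatic access
    user_id_upsert: true                  # auto-create the user in the DB if missing
```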
|
||||
|
@ -495,3 +500,9 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
|
|||
]
|
||||
}'
|
||||
```
|
||||
|
||||
## All JWT Params
|
||||
|
||||
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)
|
||||
|
||||
|
||||
|
|
55
docs/my-website/docs/proxy/ui_credentials.md
Normal file
|
@ -0,0 +1,55 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Adding LLM Credentials
|
||||
|
||||
You can add LLM provider credentials on the UI. Once you add credentials, you can re-use them when adding new models.
|
||||
|
||||
## Add a credential + model
|
||||
|
||||
### 1. Navigate to LLM Credentials page
|
||||
|
||||
Go to Models -> LLM Credentials -> Add Credential
|
||||
|
||||
<Image img={require('../../img/ui_cred_add.png')} />
|
||||
|
||||
### 2. Add credentials
|
||||
|
||||
Select your LLM provider, enter your API Key and click "Add Credential"
|
||||
|
||||
**Note: Credentials are provider-specific; if you select Vertex AI you will see `Vertex Project`, `Vertex Location` and `Vertex Credentials` fields**
|
||||
|
||||
<Image img={require('../../img/ui_add_cred_2.png')} />
|
||||
|
||||
|
||||
### 3. Use credentials when adding a model
|
||||
|
||||
Go to Add Model -> Existing Credentials -> Select your credential in the dropdown
|
||||
|
||||
<Image img={require('../../img/ui_cred_3.png')} />
|
||||
|
||||
|
||||
## Create a Credential from an existing model
|
||||
|
||||
Use this if you have already created a model and want to store the model credentials for future use.
|
||||
|
||||
### 1. Select model to create a credential from
|
||||
|
||||
Go to Models -> Select your model -> Credential -> Create Credential
|
||||
|
||||
<Image img={require('../../img/ui_cred_4.png')} />
|
||||
|
||||
### 2. Use new credential when adding a model
|
||||
|
||||
Go to Add Model -> Existing Credentials -> Select your credential in the dropdown
|
||||
|
||||
<Image img={require('../../img/use_model_cred.png')} />
|
||||
|
||||
## Frequently Asked Questions
|
||||
|
||||
|
||||
How are credentials stored?
|
||||
Credentials in the DB are encrypted/decrypted using `LITELLM_SALT_KEY`, if set. If not, then they are encrypted using `LITELLM_MASTER_KEY`. These keys should be kept secret and not shared with others.
|
||||
|
||||
|
55
docs/my-website/docs/proxy/ui_logs.md
Normal file
|
@ -0,0 +1,55 @@
|
|||
|
||||
import Image from '@theme/IdealImage';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# UI Logs Page
|
||||
|
||||
View Spend, Token Usage, Key, Team Name for Each Request to LiteLLM
|
||||
|
||||
|
||||
<Image img={require('../../img/ui_request_logs.png')}/>
|
||||
|
||||
|
||||
## Overview
|
||||
|
||||
| Log Type | Tracked by Default |
|
||||
|----------|-------------------|
|
||||
| Success Logs | ✅ Yes |
|
||||
| Error Logs | ✅ Yes |
|
||||
| Request/Response Content Stored | ❌ No by Default, **opt in with `store_prompts_in_spend_logs`** |
|
||||
|
||||
|
||||
|
||||
**By default LiteLLM does not track the request and response content.**
|
||||
|
||||
## Tracking - Request / Response Content in Logs Page
|
||||
|
||||
If you want to view request and response content on LiteLLM Logs, you need to opt in with this setting:
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
store_prompts_in_spend_logs: true
|
||||
```
|
||||
|
||||
<Image img={require('../../img/ui_request_logs_content.png')}/>
|
||||
|
||||
|
||||
## Stop storing Error Logs in DB
|
||||
|
||||
If you do not want to store error logs in the DB, you can opt out with this setting:
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
disable_error_logs: True # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
|
||||
```
|
||||
|
||||
## Stop storing Spend Logs in DB
|
||||
|
||||
If you do not want to store spend logs in the DB, you can opt out with this setting:
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
disable_spend_logs: True # Disable writing spend logs to DB
|
||||
```
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Realtime Endpoints
|
||||
# /realtime
|
||||
|
||||
Use this to load balance across Azure + OpenAI.
|
||||
|
||||
|
|
|
@ -3,11 +3,20 @@ import TabItem from '@theme/TabItem';
|
|||
|
||||
# 'Thinking' / 'Reasoning Content'
|
||||
|
||||
:::info
|
||||
|
||||
Requires LiteLLM v1.63.0+
|
||||
|
||||
:::
|
||||
|
||||
Supported Providers:
|
||||
- Deepseek (`deepseek/`)
|
||||
- Anthropic API (`anthropic/`)
|
||||
- Bedrock (Anthropic) (`bedrock/`)
|
||||
- Bedrock (Anthropic + Deepseek) (`bedrock/`)
|
||||
- Vertex AI (Anthropic) (`vertexai/`)
|
||||
- OpenRouter (`openrouter/`)
|
||||
|
||||
LiteLLM will standardize the `reasoning_content` in the response and `thinking_blocks` in the assistant message.
|
||||
|
||||
```python
|
||||
"message": {
|
||||
|
@ -17,7 +26,7 @@ Supported Providers:
|
|||
{
|
||||
"type": "thinking",
|
||||
"thinking": "The capital of France is Paris.",
|
||||
"signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..."
|
||||
"signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..."
|
||||
}
|
||||
]
|
||||
}
|
||||
|
@ -95,13 +104,263 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
|||
}
|
||||
```
|
||||
|
||||
## Tool Calling with `thinking`
|
||||
|
||||
Here's how to use Anthropic `thinking` blocks with tool calling.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import json

import litellm

# stub tool for this example - swap in your real weather lookup
def get_current_weather(location, unit="fahrenheit"):
    return json.dumps({"location": location, "temperature": "72", "unit": unit or "fahrenheit"})

litellm._turn_on_debug()
|
||||
litellm.modify_params = True
|
||||
model = "anthropic/claude-3-7-sonnet-20250219" # works across Anthropic, Bedrock, Vertex AI
|
||||
# Step 1: send the conversation and available functions to the model
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses",
|
||||
}
|
||||
]
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["location"],
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
response = litellm.completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
tool_choice="auto", # auto is default, but we'll be explicit
|
||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
||||
)
|
||||
print("Response\n", response)
|
||||
response_message = response.choices[0].message
|
||||
tool_calls = response_message.tool_calls
|
||||
|
||||
print("Expecting there to be 3 tool calls")
|
||||
assert (
|
||||
len(tool_calls) > 0
|
||||
) # this has to call the function for SF, Tokyo and Paris
|
||||
|
||||
# Step 2: check if the model wanted to call a function
|
||||
print(f"tool_calls: {tool_calls}")
|
||||
if tool_calls:
|
||||
# Step 3: call the function
|
||||
# Note: the JSON response may not always be valid; be sure to handle errors
|
||||
available_functions = {
|
||||
"get_current_weather": get_current_weather,
|
||||
} # only one function in this example, but you can have multiple
|
||||
messages.append(
|
||||
response_message
|
||||
) # extend conversation with assistant's reply
|
||||
print("Response message\n", response_message)
|
||||
# Step 4: send the info for each function call and function response to the model
|
||||
for tool_call in tool_calls:
|
||||
function_name = tool_call.function.name
|
||||
if function_name not in available_functions:
|
||||
# the model called a function that does not exist in available_functions - don't try calling anything
|
||||
continue
|
||||
function_to_call = available_functions[function_name]
|
||||
function_args = json.loads(tool_call.function.arguments)
|
||||
function_response = function_to_call(
|
||||
location=function_args.get("location"),
|
||||
unit=function_args.get("unit"),
|
||||
)
|
||||
messages.append(
|
||||
{
|
||||
"tool_call_id": tool_call.id,
|
||||
"role": "tool",
|
||||
"name": function_name,
|
||||
"content": function_response,
|
||||
}
|
||||
) # extend conversation with function response
|
||||
print(f"messages: {messages}")
|
||||
second_response = litellm.completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
seed=22,
|
||||
# tools=tools,
|
||||
drop_params=True,
|
||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
||||
) # get a new response from the model where it can see the function response
|
||||
print("second response\n", second_response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: claude-3-7-sonnet-thinking
|
||||
litellm_params:
|
||||
model: anthropic/claude-3-7-sonnet-20250219
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
thinking: {
|
||||
"type": "enabled",
|
||||
"budget_tokens": 1024
|
||||
}
|
||||
```
|
||||
|
||||
2. Run proxy
|
||||
|
||||
```bash
|
||||
litellm --config config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
3. Make 1st call
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $LITELLM_KEY" \
|
||||
-d '{
|
||||
"model": "claude-3-7-sonnet-thinking",
|
||||
"messages": [
|
||||
{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"},
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["location"],
|
||||
},
|
||||
},
|
||||
}
|
||||
],
|
||||
"tool_choice": "auto"
|
||||
}'
|
||||
```
|
||||
|
||||
4. Make 2nd call with tool call results
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $LITELLM_KEY" \
|
||||
-d '{
|
||||
"model": "claude-3-7-sonnet-thinking",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What\'s the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "I\'ll check the current weather for these three cities for you:",
|
||||
"tool_calls": [
|
||||
{
|
||||
"index": 2,
|
||||
"function": {
|
||||
"arguments": "{\"location\": \"San Francisco\"}",
|
||||
"name": "get_current_weather"
|
||||
},
|
||||
"id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
|
||||
"type": "function"
|
||||
}
|
||||
],
|
||||
"function_call": null,
|
||||
"reasoning_content": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
|
||||
"thinking_blocks": [
|
||||
{
|
||||
"type": "thinking",
|
||||
"thinking": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
|
||||
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c="
|
||||
}
|
||||
],
|
||||
"provider_specific_fields": {
|
||||
"reasoningContentBlocks": [
|
||||
{
|
||||
"reasoningText": {
|
||||
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c=",
|
||||
"text": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user."
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"tool_call_id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
|
||||
"role": "tool",
|
||||
"name": "get_current_weather",
|
||||
"content": "{\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": \"fahrenheit\"}"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Switching between Anthropic + Deepseek models
|
||||
|
||||
Set `drop_params=True` to drop the 'thinking' blocks when swapping from Anthropic to Deepseek models. Suggest improvements to this approach [here](https://github.com/BerriAI/litellm/discussions/8927).
|
||||
|
||||
```python
|
||||
litellm.drop_params = True # 👈 EITHER GLOBALLY or per request
|
||||
|
||||
# or per request
|
||||
## Anthropic
|
||||
response = litellm.completion(
|
||||
model="anthropic/claude-3-7-sonnet-20250219",
|
||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
||||
drop_params=True,
|
||||
)
|
||||
|
||||
## Deepseek
|
||||
response = litellm.completion(
|
||||
model="deepseek/deepseek-chat",
|
||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
||||
drop_params=True,
|
||||
)
|
||||
```
|
||||
|
||||
## Spec
|
||||
|
||||
|
||||
These fields can be accessed via `response.choices[0].message.reasoning_content` and `response.choices[0].message.thinking_blocks`.
|
||||
|
||||
- `reasoning_content` - str: The reasoning content from the model. Returned across all providers.
|
||||
- `thinking_blocks` - Optional[List[Dict[str, str]]]: A list of thinking blocks from the model. Only returned for Anthropic models.
|
||||
- `type` - str: The type of thinking block.
|
||||
- `thinking` - str: The thinking from the model.
|
||||
- `signature_delta` - str: The signature delta from the model.
|
||||
- `signature` - str: The signature from the model.
|
||||
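For example, a minimal sketch of reading these fields (the model name is illustrative):

```python
from litellm import completion

response = completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)

msg = response.choices[0].message
print(msg.reasoning_content)   # returned across all providers
if msg.thinking_blocks:        # Anthropic models only
    for block in msg.thinking_blocks:
        print(block["type"], block["signature"])
```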
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Rerank
|
||||
# /rerank
|
||||
|
||||
:::tip
|
||||
|
||||
|
|
117
docs/my-website/docs/response_api.md
Normal file
|
@ -0,0 +1,117 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# /responses [Beta]
|
||||
|
||||
LiteLLM provides a BETA endpoint in the spec of [OpenAI's `/responses` API](https://platform.openai.com/docs/api-reference/responses)
|
||||
|
||||
| Feature | Supported | Notes |
|
||||
|---------|-----------|--------|
|
||||
| Cost Tracking | ✅ | Works with all supported models |
|
||||
| Logging | ✅ | Works across all integrations |
|
||||
| End-user Tracking | ✅ | |
|
||||
| Streaming | ✅ | |
|
||||
| Fallbacks | ✅ | Works between supported models |
|
||||
| Loadbalancing | ✅ | Works between supported models |
|
||||
| Supported LiteLLM Versions | 1.63.8+ | |
|
||||
| Supported LLM providers | `openai` | |
|
||||
|
||||
## Usage
|
||||
|
||||
## Create a model response
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="litellm-sdk" label="LiteLLM SDK">
|
||||
|
||||
#### Non-streaming
|
||||
```python
|
||||
import litellm
|
||||
|
||||
# Non-streaming response
|
||||
response = litellm.responses(
|
||||
model="gpt-4o",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
max_output_tokens=100
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python
|
||||
import litellm
|
||||
|
||||
# Streaming response
|
||||
response = litellm.responses(
|
||||
model="gpt-4o",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
|
||||
|
||||
First, add this to your litellm proxy config.yaml:
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-4o
|
||||
litellm_params:
|
||||
model: openai/gpt-4o
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
Start your LiteLLM proxy:
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
Then use the OpenAI SDK pointed to your proxy:
|
||||
|
||||
#### Non-streaming
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Non-streaming response
|
||||
response = client.responses.create(
|
||||
model="gpt-4o",
|
||||
input="Tell me a three sentence bedtime story about a unicorn."
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Streaming response
|
||||
response = client.responses.create(
|
||||
model="gpt-4o",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
|
@ -830,7 +830,7 @@ asyncio.run(router_acompletion())
|
|||
|
||||
Set `weight` on a deployment to pick one deployment more often than others.
|
||||
|
||||
This works across **ALL** routing strategies.
|
||||
This works with the **simple-shuffle** routing strategy (the default, if no routing strategy is selected).
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
@ -952,8 +952,8 @@ router_settings:
|
|||
```
|
||||
|
||||
Defaults:
|
||||
- allowed_fails: 0
|
||||
- cooldown_time: 60s
|
||||
- allowed_fails: 3
|
||||
- cooldown_time: 5s (`DEFAULT_COOLDOWN_TIME_SECONDS` in constants.py)
|
||||
|
||||
**Set Per Model**
|
||||
|
||||
|
|
|
@ -96,6 +96,33 @@ litellm --config /path/to/config.yaml
|
|||
```
|
||||
|
||||
|
||||
#### Using K/V pairs in 1 AWS Secret
|
||||
|
||||
You can read multiple keys from a single AWS Secret using the `primary_secret_name` parameter:
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
key_management_system: "aws_secret_manager"
|
||||
key_management_settings:
|
||||
hosted_keys: [
|
||||
"OPENAI_API_KEY_MODEL_1",
|
||||
"OPENAI_API_KEY_MODEL_2",
|
||||
]
|
||||
primary_secret_name: "litellm_secrets" # 👈 Read multiple keys from one JSON secret
|
||||
```
|
||||
|
||||
The `primary_secret_name` allows you to read multiple keys from a single AWS Secret as a JSON object. For example, the "litellm_secrets" secret would contain:
|
||||
|
||||
```json
|
||||
{
|
||||
"OPENAI_API_KEY_MODEL_1": "sk-key1...",
|
||||
"OPENAI_API_KEY_MODEL_2": "sk-key2..."
|
||||
}
|
||||
```
|
||||
|
||||
This reduces the number of AWS Secrets you need to manage.
|
||||
|
||||
|
||||
## Hashicorp Vault
|
||||
|
||||
|
||||
|
@ -353,4 +380,7 @@ general_settings:
|
|||
|
||||
# Hosted Keys Settings
|
||||
hosted_keys: ["litellm_master_key"] # OPTIONAL. Specify which env keys you stored on AWS
|
||||
|
||||
# K/V pairs in 1 AWS Secret Settings
|
||||
primary_secret_name: "litellm_secrets" # OPTIONAL. Read multiple keys from one JSON secret on AWS Secret Manager
|
||||
```
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Text Completion
|
||||
# /completions
|
||||
|
||||
### Usage
|
||||
<Tabs>
|
||||
|
|
|
@ -2,9 +2,9 @@ import Image from '@theme/IdealImage';
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Use LiteLLM AI Gateway with Aporia Guardrails
|
||||
# Aporia Guardrails with LiteLLM Gateway
|
||||
|
||||
In this tutorial we will use LiteLLM Proxy with Aporia to detect PII in requests and profanity in responses
|
||||
In this tutorial we will use LiteLLM AI Gateway with Aporia to detect PII in requests and profanity in responses
|
||||
|
||||
## 1. Setup guardrails on Aporia
|
||||
|
||||
|
|
103
docs/my-website/docs/tutorials/openweb_ui.md
Normal file
|
@ -0,0 +1,103 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# OpenWeb UI with LiteLLM
|
||||
|
||||
This guide walks you through connecting OpenWeb UI to LiteLLM. Using LiteLLM with OpenWeb UI allows teams to
|
||||
- Access 100+ LLMs on OpenWeb UI
|
||||
- Track Spend / Usage, Set Budget Limits
|
||||
- Send Request/Response Logs to logging destinations like langfuse, s3, gcs buckets, etc.
|
||||
- Set access controls, e.g. control which models OpenWebUI can access.
|
||||
|
||||
## Quickstart
|
||||
|
||||
- Make sure to set up LiteLLM with the [LiteLLM Getting Started Guide](https://docs.litellm.ai/docs/proxy/docker_quick_start)
|
||||
|
||||
|
||||
## 1. Start LiteLLM & OpenWebUI
|
||||
|
||||
- OpenWebUI starts running on [http://localhost:3000](http://localhost:3000)
|
||||
- LiteLLM starts running on [http://localhost:4000](http://localhost:4000)
|
||||
|
||||
|
||||
## 2. Create a Virtual Key on LiteLLM
|
||||
|
||||
Virtual Keys are API Keys that allow you to authenticate to LiteLLM Proxy. We will create a Virtual Key that will allow OpenWebUI to access LiteLLM.
|
||||
|
||||
### 2.1 LiteLLM User Management Hierarchy
|
||||
|
||||
On LiteLLM, you can create Organizations, Teams, Users and Virtual Keys. For this tutorial, we will create a Team and a Virtual Key.
|
||||
|
||||
- `Organization` - An Organization is a group of Teams. (US Engineering, EU Developer Tools)
|
||||
- `Team` - A Team is a group of Users. (OpenWeb UI Team, Data Science Team, etc.)
|
||||
- `User` - A User is an individual user (employee, developer, e.g. `krrish@litellm.ai`)
|
||||
- `Virtual Key` - A Virtual Key is an API Key that allows you to authenticate to LiteLLM Proxy. A Virtual Key is associated with a User or Team.
|
||||
|
||||
Once the Team is created, you can invite Users to the Team. You can read more about LiteLLM's User Management [here](https://docs.litellm.ai/docs/proxy/user_management_heirarchy).
|
||||
|
||||
### 2.2 Create a Team on LiteLLM
|
||||
|
||||
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new team.
|
||||
|
||||
<Image img={require('../../img/litellm_create_team.gif')} />
|
||||
|
||||
### 2.3 Create a Virtual Key on LiteLLM
|
||||
|
||||
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new Virtual Key.
|
||||
|
||||
LiteLLM allows you to specify what models are available on OpenWeb UI (by specifying the models the key will have access to).
|
||||
|
||||
<Image img={require('../../img/create_key_in_team_oweb.gif')} />
|
||||
|
||||
## 3. Connect OpenWeb UI to LiteLLM
|
||||
|
||||
On OpenWeb UI, navigate to Settings -> Connections and create a new connection to LiteLLM
|
||||
|
||||
Enter the following details:
|
||||
- URL: `http://localhost:4000` (your litellm proxy base url)
|
||||
- Key: `your-virtual-key` (the key you created in the previous step)
|
||||
|
||||
<Image img={require('../../img/litellm_setup_openweb.gif')} />
|
||||
|
||||
### 3.1 Test Request
|
||||
|
||||
In the top left corner, select a model. You should only see the models you gave the key access to in Step 2.
|
||||
|
||||
Once you have selected a model, enter your message content and click `Submit`.
|
||||
|
||||
<Image img={require('../../img/basic_litellm.gif')} />
|
||||
|
||||
### 3.2 Tracking Spend / Usage
|
||||
|
||||
After your request is made, navigate to `Logs` on the LiteLLM UI. You can see Team, Key, Model, Usage and Cost.
|
||||
|
||||
<!-- <Image img={require('../../img/litellm_logs_openweb.gif')} /> -->
|
||||
|
||||
|
||||
|
||||
## Render `thinking` content on OpenWeb UI
|
||||
|
||||
OpenWebUI requires reasoning/thinking content to be rendered with `<think></think>` tags. In order to render this for specific models, you can use the `merge_reasoning_content_in_choices` litellm parameter.
|
||||
|
||||
Example litellm config.yaml:
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: thinking-anthropic-claude-3-7-sonnet
|
||||
litellm_params:
|
||||
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
|
||||
thinking: {"type": "enabled", "budget_tokens": 1024}
|
||||
max_tokens: 1080
|
||||
merge_reasoning_content_in_choices: true
|
||||
```
|
||||
|
||||
### Test it on OpenWeb UI
|
||||
|
||||
In the models dropdown, select `thinking-anthropic-claude-3-7-sonnet`.
|
||||
|
||||
<Image img={require('../../img/litellm_thinking_openweb.gif')} />
|
||||
|
||||
|
||||
|
||||
|
|
@ -44,7 +44,7 @@ const config = {
|
|||
path: './release_notes',
|
||||
routeBasePath: 'release_notes',
|
||||
blogTitle: 'Release Notes',
|
||||
blogSidebarTitle: 'All Releases',
|
||||
blogSidebarTitle: 'Releases',
|
||||
blogSidebarCount: 'ALL',
|
||||
postsPerPage: 'ALL',
|
||||
showReadingTime: false,
|
||||
|
|
BIN
docs/my-website/img/basic_litellm.gif
Normal file
After Width: | Height: | Size: 2.6 MiB |
BIN
docs/my-website/img/create_key_in_team_oweb.gif
Normal file
After Width: | Height: | Size: 13 MiB |
BIN
docs/my-website/img/litellm_create_team.gif
Normal file
After Width: | Height: | Size: 5.4 MiB |
BIN
docs/my-website/img/litellm_setup_openweb.gif
Normal file
After Width: | Height: | Size: 2.7 MiB |
BIN
docs/my-website/img/litellm_thinking_openweb.gif
Normal file
After Width: | Height: | Size: 5.1 MiB |
BIN
docs/my-website/img/release_notes/anthropic_thinking.jpg
Normal file
After Width: | Height: | Size: 470 KiB |
BIN
docs/my-website/img/release_notes/credentials.jpg
Normal file
After Width: | Height: | Size: 371 KiB |
BIN
docs/my-website/img/release_notes/error_logs.jpg
Normal file
After Width: | Height: | Size: 918 KiB |
BIN
docs/my-website/img/release_notes/litellm_test_connection.gif
Normal file
After Width: | Height: | Size: 16 MiB |
BIN
docs/my-website/img/release_notes/responses_api.png
Normal file
After Width: | Height: | Size: 67 KiB |
BIN
docs/my-website/img/release_notes/v1632_release.jpg
Normal file
After Width: | Height: | Size: 386 KiB |
BIN
docs/my-website/img/ui_add_cred_2.png
Normal file
After Width: | Height: | Size: 255 KiB |
BIN
docs/my-website/img/ui_cred_3.png
Normal file
After Width: | Height: | Size: 283 KiB |
BIN
docs/my-website/img/ui_cred_4.png
Normal file
After Width: | Height: | Size: 255 KiB |
BIN
docs/my-website/img/ui_cred_add.png
Normal file
After Width: | Height: | Size: 204 KiB |
BIN
docs/my-website/img/ui_request_logs.png
Normal file
After Width: | Height: | Size: 567 KiB |
BIN
docs/my-website/img/ui_request_logs_content.png
Normal file
After Width: | Height: | Size: 344 KiB |
BIN
docs/my-website/img/use_model_cred.png
Normal file
After Width: | Height: | Size: 282 KiB |
47
docs/my-website/package-lock.json
generated
|
@ -706,12 +706,13 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@babel/helpers": {
|
||||
"version": "7.26.0",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.0.tgz",
|
||||
"integrity": "sha512-tbhNuIxNcVb21pInl3ZSjksLCvgdZy9KwJ8brv993QtIVKJBBkYXz4q4ZbAv31GdnC+R90np23L5FbEBlthAEw==",
|
||||
"version": "7.26.10",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.10.tgz",
|
||||
"integrity": "sha512-UPYc3SauzZ3JGgj87GgZ89JVdC5dj0AoetR5Bw6wj4niittNyFh6+eOGonYvJ1ao6B8lEa3Q3klS7ADZ53bc5g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/template": "^7.25.9",
|
||||
"@babel/types": "^7.26.0"
|
||||
"@babel/template": "^7.26.9",
|
||||
"@babel/types": "^7.26.10"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
|
@ -796,11 +797,12 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@babel/parser": {
|
||||
"version": "7.26.3",
|
||||
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.3.tgz",
|
||||
"integrity": "sha512-WJ/CvmY8Mea8iDXo6a7RK2wbmJITT5fN3BEkRuFlxVyNx8jOKIIhmC4fSkTcPcf8JyavbBwIe6OpiCOBXt/IcA==",
|
||||
"version": "7.26.10",
|
||||
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.10.tgz",
|
||||
"integrity": "sha512-6aQR2zGE/QFi8JpDLjUZEPYOs7+mhKXm86VaKFiLP35JQwQb6bwUE+XbvkH0EptsYhbNBSUGaUBLKqxH1xSgsA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/types": "^7.26.3"
|
||||
"@babel/types": "^7.26.10"
|
||||
},
|
||||
"bin": {
|
||||
"parser": "bin/babel-parser.js"
|
||||
|
@ -2157,9 +2159,10 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@babel/runtime-corejs3": {
|
||||
"version": "7.26.0",
|
||||
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.26.0.tgz",
|
||||
"integrity": "sha512-YXHu5lN8kJCb1LOb9PgV6pvak43X2h4HvRApcN5SdWeaItQOzfn1hgP6jasD6KWQyJDBxrVmA9o9OivlnNJK/w==",
|
||||
"version": "7.26.10",
|
||||
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.26.10.tgz",
|
||||
"integrity": "sha512-uITFQYO68pMEYR46AHgQoyBg7KPPJDAbGn4jUTIRgCFJIp88MIBUianVOplhZDEec07bp9zIyr4Kp0FCyQzmWg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"core-js-pure": "^3.30.2",
|
||||
"regenerator-runtime": "^0.14.0"
|
||||
|
@ -2169,13 +2172,14 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@babel/template": {
|
||||
"version": "7.25.9",
|
||||
"resolved": "https://registry.npmjs.org/@babel/template/-/template-7.25.9.tgz",
|
||||
"integrity": "sha512-9DGttpmPvIxBb/2uwpVo3dqJ+O6RooAFOS+lB+xDqoE2PVCE8nfoHMdZLpfCQRLwvohzXISPZcgxt80xLfsuwg==",
|
||||
"version": "7.26.9",
|
||||
"resolved": "https://registry.npmjs.org/@babel/template/-/template-7.26.9.tgz",
|
||||
"integrity": "sha512-qyRplbeIpNZhmzOysF/wFMuP9sctmh2cFzRAZOn1YapxBsE1i9bJIY586R/WBLfLcmcBlM8ROBiQURnnNy+zfA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/code-frame": "^7.25.9",
|
||||
"@babel/parser": "^7.25.9",
|
||||
"@babel/types": "^7.25.9"
|
||||
"@babel/code-frame": "^7.26.2",
|
||||
"@babel/parser": "^7.26.9",
|
||||
"@babel/types": "^7.26.9"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
|
@ -2199,9 +2203,10 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@babel/types": {
|
||||
"version": "7.26.3",
|
||||
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.3.tgz",
|
||||
"integrity": "sha512-vN5p+1kl59GVKMvTHt55NzzmYVxprfJD+ql7U9NFIfKCBkYE55LYtS+WtPlaYOyzydrKI8Nezd+aZextrd+FMA==",
|
||||
"version": "7.26.10",
|
||||
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.10.tgz",
|
||||
"integrity": "sha512-emqcG3vHrpxUKTrxcblR36dcrcoRDvKmnL/dCL6ZsHaShW80qxCAcNhzQZrpeM765VzEos+xOi4s+r4IXzTwdQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/helper-string-parser": "^7.25.9",
|
||||
"@babel/helper-validator-identifier": "^7.25.9"
|
||||
|
|
|
@ -18,13 +18,6 @@ hide_table_of_contents: false
|
|||
`alerting`, `prometheus`, `secret management`, `management endpoints`, `ui`, `prompt management`, `finetuning`, `batch`
|
||||
|
||||
|
||||
:::note
|
||||
|
||||
v1.57.8-stable, is currently being tested. It will be released on 2025-01-12.
|
||||
|
||||
:::
|
||||
|
||||
|
||||
## New / Updated Models
|
||||
|
||||
1. Mistral large pricing - https://github.com/BerriAI/litellm/pull/7452
|
||||
|
|
103
docs/my-website/release_notes/v1.61.20-stable/index.md
Normal file
|
@ -0,0 +1,103 @@
|
|||
---
|
||||
title: v1.61.20-stable
|
||||
slug: v1.61.20-stable
|
||||
date: 2025-03-01T10:00:00
|
||||
authors:
|
||||
- name: Krrish Dholakia
|
||||
title: CEO, LiteLLM
|
||||
url: https://www.linkedin.com/in/krish-d/
|
||||
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
|
||||
- name: Ishaan Jaffer
|
||||
title: CTO, LiteLLM
|
||||
url: https://www.linkedin.com/in/reffajnaahsi/
|
||||
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
|
||||
tags: [llm translation, rerank, ui, thinking, reasoning_content, claude-3-7-sonnet]
|
||||
hide_table_of_contents: false
|
||||
---
|
||||
|
||||
import Image from '@theme/IdealImage';
|
||||
|
||||
# v1.61.20-stable
|
||||
|
||||
|
||||
These are the changes since `v1.61.13-stable`.
|
||||
|
||||
This release is primarily focused on:
|
||||
- LLM Translation improvements (claude-3-7-sonnet + 'thinking'/'reasoning_content' support)
|
||||
- UI improvements (add model flow, user management, etc)
|
||||
|
||||
## Demo Instance
|
||||
|
||||
Here's a Demo Instance to test changes:
|
||||
- Instance: https://demo.litellm.ai/
|
||||
- Login Credentials:
|
||||
- Username: admin
|
||||
- Password: sk-1234
|
||||
|
||||
## New Models / Updated Models
|
||||
|
||||
1. Anthropic 3-7 sonnet support + cost tracking (Anthropic API + Bedrock + Vertex AI + OpenRouter)
|
||||
1. Anthropic API [Start here](https://docs.litellm.ai/docs/providers/anthropic#usage---thinking--reasoning_content)
|
||||
2. Bedrock API [Start here](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
|
||||
3. Vertex AI API [See here](../../docs/providers/vertex#usage---thinking--reasoning_content)
|
||||
4. OpenRouter [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L5626)
|
||||
2. Gpt-4.5-preview support + cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L79)
|
||||
3. Azure AI - Phi-4 cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L1773)
|
||||
4. Claude-3.5-sonnet - vision support updated on Anthropic API [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2888)
|
||||
5. Bedrock llama vision support [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L7714)
|
||||
6. Cerebras llama3.3-70b pricing [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2697)
|
||||
|
||||
## LLM Translation
|
||||
|
||||
1. Infinity Rerank - support returning documents when return_documents=True [Start here](../../docs/providers/infinity#usage---returning-documents)
|
||||
2. Amazon Deepseek - `<think>` param extraction into ‘reasoning_content’ [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-imported-models-deepseek-deepseek-r1)
|
||||
3. Amazon Titan Embeddings - filter out ‘aws_’ params from request body [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-embedding)
|
||||
4. Anthropic ‘thinking’ + ‘reasoning_content’ translation support (Anthropic API, Bedrock, Vertex AI) [Start here](https://docs.litellm.ai/docs/reasoning_content)
|
||||
5. VLLM - support ‘video_url’ [Start here](../../docs/providers/vllm#send-video-url-to-vllm)
|
||||
6. Call proxy via litellm SDK: Support `litellm_proxy/` for embedding, image_generation, transcription, speech, rerank [Start here](https://docs.litellm.ai/docs/providers/litellm_proxy)
|
||||
7. OpenAI Pass-through - allow using Assistants GET, DELETE on /openai pass through routes [Start here](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
|
||||
8. Message Translation - fix openai message for assistant msg if role is missing - openai allows this
|
||||
9. O1/O3 - support ‘drop_params’ for o3-mini and o1 parallel_tool_calls param (not supported currently) [See here](https://docs.litellm.ai/docs/completion/drop_params)
|
||||
|
||||
## Spend Tracking Improvements
|
||||
|
||||
1. Cost tracking for rerank via Bedrock [See PR](https://github.com/BerriAI/litellm/commit/b682dc4ec8fd07acf2f4c981d2721e36ae2a49c5)
|
||||
2. Anthropic pass-through - fix race condition causing cost to not be tracked [See PR](https://github.com/BerriAI/litellm/pull/8874)
|
||||
3. Anthropic pass-through: Ensure accurate token counting [See PR](https://github.com/BerriAI/litellm/pull/8880)
|
||||
|
||||
## Management Endpoints / UI
|
||||
|
||||
1. Models Page - Allow sorting models by ‘created at’
|
||||
2. Models Page - Edit Model Flow Improvements
|
||||
3. Models Page - Fix Adding Azure, Azure AI Studio models on UI
|
||||
4. Internal Users Page - Allow Bulk Adding Internal Users on UI
|
||||
5. Internal Users Page - Allow sorting users by ‘created at’
|
||||
6. Virtual Keys Page - Allow searching for UserIDs on the dropdown when assigning a user to a team [See PR](https://github.com/BerriAI/litellm/pull/8844)
|
||||
7. Virtual Keys Page - allow creating a user when assigning keys to users [See PR](https://github.com/BerriAI/litellm/pull/8844)
|
||||
8. Model Hub Page - fix text overflow issue [See PR](https://github.com/BerriAI/litellm/pull/8749)
|
||||
9. Admin Settings Page - Allow adding MSFT SSO on UI
|
||||
10. Backend - don't allow creating duplicate internal users in DB
|
||||
|
||||
## Helm
|
||||
|
||||
1. support ttlSecondsAfterFinished on the migration job - [See PR](https://github.com/BerriAI/litellm/pull/8593)
|
||||
2. enhance migrations job with additional configurable properties - [See PR](https://github.com/BerriAI/litellm/pull/8636)
|
||||
|
||||
## Logging / Guardrail Integrations
|
||||
|
||||
1. Arize Phoenix support
|
||||
2. ‘No-log’ - fix ‘no-log’ param support on embedding calls
|
||||
|
||||
## Performance / Loadbalancing / Reliability improvements
|
||||
|
||||
1. Single Deployment Cooldown logic - Use allowed_fails or allowed_fail_policy if set [Start here](https://docs.litellm.ai/docs/routing#advanced-custom-retries-cooldowns-based-on-error-type)
|
||||
|
||||
## General Proxy Improvements
|
||||
|
||||
1. Hypercorn - fix reading / parsing request body
|
||||
2. Windows - fix running proxy in windows
|
||||
3. DD-Trace - fix dd-trace enablement on proxy
|
||||
|
||||
## Complete Git Diff
|
||||
|
||||
View the complete git diff [here](https://github.com/BerriAI/litellm/compare/v1.61.13-stable...v1.61.20-stable).
|
40
docs/my-website/release_notes/v1.63.0/index.md
Normal file
|
@ -0,0 +1,40 @@
---
title: v1.63.0 - Anthropic 'thinking' response update
slug: v1.63.0
date: 2025-03-05T10:00:00
authors:
  - name: Krrish Dholakia
    title: CEO, LiteLLM
    url: https://www.linkedin.com/in/krish-d/
    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
  - name: Ishaan Jaffer
    title: CTO, LiteLLM
    url: https://www.linkedin.com/in/reffajnaahsi/
    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
hide_table_of_contents: false
---

v1.63.0 fixes Anthropic 'thinking' responses on streaming to return the `signature` block. [Github Issue](https://github.com/BerriAI/litellm/issues/8964)

It also renames the field from `signature_delta` to `signature`, matching Anthropic's own response format. [Anthropic Docs](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#implementing-extended-thinking)

## Diff

```bash
"message": {
    ...
    "reasoning_content": "The capital of France is Paris.",
    "thinking_blocks": [
        {
            "type": "thinking",
            "thinking": "The capital of France is Paris.",
-           "signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 OLD FORMAT
+           "signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 KEY CHANGE
        }
    ]
}
```
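A short sketch of consuming the renamed field on streaming; the model name is illustrative, and the `thinking_blocks` attribute on the delta is assumed to match the shape above:

```python
import litellm

# Sketch: stream a thinking-enabled request and read `signature` (formerly
# `signature_delta`) off each streamed thinking block.
stream = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",  # illustrative model
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    # `thinking_blocks` on the delta is assumed per the response shape above
    for block in getattr(delta, "thinking_blocks", None) or []:
        if block.get("signature"):
            print("signature:", block["signature"])
```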
docs/my-website/release_notes/v1.63.11-stable/index.md (new file, 180 lines)
@ -0,0 +1,180 @@
---
title: v1.63.11-stable
slug: v1.63.11-stable
date: 2025-03-15T10:00:00
authors:
  - name: Krrish Dholakia
    title: CEO, LiteLLM
    url: https://www.linkedin.com/in/krish-d/
    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
  - name: Ishaan Jaffer
    title: CTO, LiteLLM
    url: https://www.linkedin.com/in/reffajnaahsi/
    image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: [credential management, thinking content, responses api, snowflake]
hide_table_of_contents: false
---

import Image from '@theme/IdealImage';

These are the changes since `v1.63.2-stable`.

This release is primarily focused on:
- [Beta] Responses API Support
- Snowflake Cortex Support, Amazon Nova Image Generation
- UI - Credential Management, re-use credentials when adding new models
- UI - Test Connection to LLM Provider before adding a model

:::info

This release will be live on 03/16/2025

:::

<!-- <Image img={require('../../img/release_notes/v16311_release.jpg')} /> -->

## Known Issues
- 🚨 Known issue on Azure OpenAI - we don't recommend upgrading if you use Azure OpenAI. This version failed our Azure OpenAI load test.

## Docker Run LiteLLM Proxy

```
docker run \
  -e STORE_MODEL_IN_DB=True \
  -p 4000:4000 \
  ghcr.io/berriai/litellm:main-v1.63.11-stable
```

## Demo Instance

Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
    - Username: admin
    - Password: sk-1234
## New Models / Updated Models

- Image Generation support for Amazon Nova Canvas [Getting Started](https://docs.litellm.ai/docs/providers/bedrock#image-generation)
- Add pricing for Jamba new models [PR](https://github.com/BerriAI/litellm/pull/9032/files)
- Add pricing for Amazon EU models [PR](https://github.com/BerriAI/litellm/pull/9056/files)
- Add Bedrock Deepseek R1 model pricing [PR](https://github.com/BerriAI/litellm/pull/9108/files)
- Update Gemini pricing: Gemma 3, Flash 2 thinking update, LearnLM [PR](https://github.com/BerriAI/litellm/pull/9190/files)
- Mark Cohere Embedding 3 models as Multimodal [PR](https://github.com/BerriAI/litellm/pull/9176/commits/c9a576ce4221fc6e50dc47cdf64ab62736c9da41)
- Add Azure Data Zone pricing [PR](https://github.com/BerriAI/litellm/pull/9185/files#diff-19ad91c53996e178c1921cbacadf6f3bae20cfe062bd03ee6bfffb72f847ee37)
- LiteLLM tracks cost for `azure/eu` and `azure/us` models
## LLM Translation

<Image img={require('../../img/release_notes/responses_api.png')} />

1. **New Endpoints**
- [Beta] POST `/responses` API - see the sketch after this list. [Getting Started](https://docs.litellm.ai/docs/response_api)

2. **New LLM Providers**
- Snowflake Cortex [Getting Started](https://docs.litellm.ai/docs/providers/snowflake)

3. **New LLM Features**
- Support OpenRouter `reasoning_content` on streaming [Getting Started](https://docs.litellm.ai/docs/reasoning_content)

4. **Bug Fixes**
- OpenAI: Return `code`, `param` and `type` on bad request errors [More information on litellm exceptions](https://docs.litellm.ai/docs/exception_mapping)
- Bedrock: Fix converse chunk parsing to only return an empty dict on tool use [PR](https://github.com/BerriAI/litellm/pull/9166)
- Bedrock: Support `extra_headers` [PR](https://github.com/BerriAI/litellm/pull/9113)
- Azure: Fix function calling bug & update default API version to `2025-02-01-preview` [PR](https://github.com/BerriAI/litellm/pull/9191)
- Azure: Fix AI services URL [PR](https://github.com/BerriAI/litellm/pull/9185)
- Vertex AI: Handle HTTP 201 status code in response [PR](https://github.com/BerriAI/litellm/pull/9193)
- Perplexity: Fix incorrect streaming response [PR](https://github.com/BerriAI/litellm/pull/9081)
- Triton: Fix streaming completions bug [PR](https://github.com/BerriAI/litellm/pull/8386)
- Deepgram: Support `bytes.IO` when handling audio files for transcription [PR](https://github.com/BerriAI/litellm/pull/9071)
- Ollama: Fix "system" role being rejected [PR](https://github.com/BerriAI/litellm/pull/9261)
- All Providers (Streaming): Fix the string `data:` being stripped from content in streamed responses [PR](https://github.com/BerriAI/litellm/pull/9070)
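A minimal sketch of the new [Beta] Responses API via the Python SDK, assuming `litellm.responses()` mirrors the OpenAI Responses API shape documented in the Getting Started link; the model name is illustrative:

```python
import litellm

# [Beta] Responses API sketch - assumes litellm.responses() follows the
# OpenAI Responses API request shape (model name illustrative).
response = litellm.responses(
    model="openai/gpt-4o",
    input="Tell me a three sentence bedtime story about a unicorn.",
    max_output_tokens=100,
)
print(response)
```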
## Spend Tracking Improvements

1. Support Bedrock converse cache token tracking [Getting Started](https://docs.litellm.ai/docs/completion/prompt_caching)
2. Cost Tracking for Responses API [Getting Started](https://docs.litellm.ai/docs/response_api)
3. Fix Azure Whisper cost tracking [Getting Started](https://docs.litellm.ai/docs/audio_transcription)
## UI

### Re-Use Credentials on UI

You can now onboard LLM provider credentials on the LiteLLM UI. Once these credentials are added, you can re-use them when adding new models. [Getting Started](https://docs.litellm.ai/docs/proxy/ui_credentials)

<Image img={require('../../img/release_notes/credentials.jpg')} />

### Test Connections before adding models

Before adding a model, you can test the connection to the LLM provider to verify you have set up your API Base + API Key correctly.

<Image img={require('../../img/release_notes/litellm_test_connection.gif')} />

### General UI Improvements

1. Add Models Page
    - Allow adding Cerebras, Sambanova, Perplexity, Fireworks, OpenRouter, TogetherAI models, and Text-Completion OpenAI on the Admin UI
    - Allow adding EU OpenAI models
    - Fix: Instantly show edits + deletes to models
2. Keys Page
    - Fix: Instantly show newly created keys on the Admin UI (don't require refresh)
    - Fix: Allow clicking into Top Keys when showing users' Top API Key
    - Fix: Allow filtering keys by Team Alias, Key Alias and Org
    - UI Improvements: Show 100 keys per page, use full height, increase width of key alias
3. Users Page
    - Fix: Show correct count of internal user keys on the Users Page
    - Fix: Metadata not updating in the Team UI
4. Logs Page
    - UI Improvements: Keep the expanded log in focus on the LiteLLM UI
    - UI Improvements: Minor improvements to the logs page
    - Fix: Allow internal users to query their own logs
    - Allow switching off storing error logs in the DB [Getting Started](https://docs.litellm.ai/docs/proxy/ui_logs)
5. Sign In/Sign Out
    - Fix: Correctly use `PROXY_LOGOUT_URL` when set [Getting Started](https://docs.litellm.ai/docs/proxy/self_serve#setting-custom-logout-urls)
## Security

1. Support for rotating master keys [Getting Started](https://docs.litellm.ai/docs/proxy/master_key_rotations)
2. Fix: Internal User Viewer permissions - don't allow the `internal_user_viewer` role to see the `Test Key Page` or the `Create Key Button` [More information on role based access controls](https://docs.litellm.ai/docs/proxy/access_control)
3. Emit audit logs on all user + model Create/Update/Delete endpoints [Getting Started](https://docs.litellm.ai/docs/proxy/multiple_admins)
4. JWT
    - Support multiple JWT OIDC providers [Getting Started](https://docs.litellm.ai/docs/proxy/token_auth)
    - Fix JWT access with Groups not working when a team is assigned All Proxy Models access
5. Using K/V pairs in 1 AWS Secret [Getting Started](https://docs.litellm.ai/docs/secret#using-kv-pairs-in-1-aws-secret)
## Logging Integrations

1. Prometheus: Track Azure LLM API latency metric [Getting Started](https://docs.litellm.ai/docs/proxy/prometheus#request-latency-metrics)
2. Athina: Added tags, user_feedback and model_options to additional_keys which can be sent to Athina [Getting Started](https://docs.litellm.ai/docs/observability/athina_integration)
## Performance / Reliability improvements

1. Redis + litellm router - Fix Redis cluster mode for the litellm router; a config sketch follows below. [PR](https://github.com/BerriAI/litellm/pull/9010)
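For context, a sketch of pointing litellm at a Redis cluster; it assumes the `REDIS_CLUSTER_NODES` environment variable is parsed as JSON by `init_redis_cluster` (shown later in this diff), and the hostnames are illustrative:

```python
import json
import os

# Sketch: startup nodes for Redis cluster mode, assuming REDIS_CLUSTER_NODES
# takes a JSON list of {"host", "port"} nodes (hosts are illustrative).
os.environ["REDIS_CLUSTER_NODES"] = json.dumps(
    [
        {"host": "redis-node-1", "port": "7001"},
        {"host": "redis-node-2", "port": "7002"},
    ]
)
```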
## General Improvements

1. OpenWebUI Integration - display `thinking` tokens
    - Guide on getting started with LiteLLM x OpenWebUI [Getting Started](https://docs.litellm.ai/docs/tutorials/openweb_ui)
    - Display `thinking` tokens on OpenWebUI (Bedrock, Anthropic, Deepseek) [Getting Started](https://docs.litellm.ai/docs/tutorials/openweb_ui#render-thinking-content-on-openweb-ui)

<Image img={require('../../img/litellm_thinking_openweb.gif')} />

## Complete Git Diff

[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.63.2-stable...v1.63.11-stable)
docs/my-website/release_notes/v1.63.2-stable/index.md (new file, 112 lines)
@ -0,0 +1,112 @@
---
title: v1.63.2-stable
slug: v1.63.2-stable
date: 2025-03-08T10:00:00
authors:
  - name: Krrish Dholakia
    title: CEO, LiteLLM
    url: https://www.linkedin.com/in/krish-d/
    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
  - name: Ishaan Jaffer
    title: CTO, LiteLLM
    url: https://www.linkedin.com/in/reffajnaahsi/
    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
hide_table_of_contents: false
---

import Image from '@theme/IdealImage';

These are the changes since `v1.61.20-stable`.

This release is primarily focused on:
- LLM Translation improvements (more `thinking` content improvements)
- UI improvements (error logs now shown on UI)

:::info

This release will be live on 03/09/2025

:::

<Image img={require('../../img/release_notes/v1632_release.jpg')} />

## Demo Instance

Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
    - Username: admin
    - Password: sk-1234
## New Models / Updated Models

1. Add `supports_pdf_input` for specific Bedrock Claude models [PR](https://github.com/BerriAI/litellm/commit/f63cf0030679fe1a43d03fb196e815a0f28dae92)
2. Add pricing for Amazon `eu` models [PR](https://github.com/BerriAI/litellm/commits/main/model_prices_and_context_window.json)
3. Fix Azure O1 mini pricing [PR](https://github.com/BerriAI/litellm/commit/52de1949ef2f76b8572df751f9c868a016d4832c)
## LLM Translation

<Image img={require('../../img/release_notes/anthropic_thinking.jpg')}/>

1. Support `/openai/` passthrough for Assistant endpoints. [Get Started](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
2. Bedrock Claude - Fix tool calling transformation on the invoke route. [Get Started](../../docs/providers/bedrock#usage---function-calling--tool-calling)
3. Bedrock Claude - `response_format` support for Claude on the invoke route. [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
4. Bedrock - Pass `description` if set in `response_format`. [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
5. Bedrock - Fix passing `response_format: {"type": "text"}`. [PR](https://github.com/BerriAI/litellm/commit/c84b489d5897755139aa7d4e9e54727ebe0fa540)
6. OpenAI - Handle sending `image_url` as a str to OpenAI. [Get Started](https://docs.litellm.ai/docs/completion/vision)
7. Deepseek - Return missing 'reasoning_content' on streaming. [Get Started](https://docs.litellm.ai/docs/reasoning_content)
8. Caching - Support caching on reasoning content. [Get Started](https://docs.litellm.ai/docs/proxy/caching)
9. Bedrock - Handle thinking blocks in the assistant message. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
10. Anthropic - Return `signature` on streaming. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
    - Note: We've also migrated from `signature_delta` to `signature`. [Read more](https://docs.litellm.ai/release_notes/v1.63.0)
11. Support the `format` param for specifying the image type - see the sketch after this list. [Get Started](../../docs/completion/vision.md#explicitly-specify-image-type)
12. Anthropic - `/v1/messages` endpoint - `thinking` param support. [Get Started](../../docs/anthropic_unified.md)
    - Note: this refactors the [BETA] unified `/v1/messages` endpoint to just work for the Anthropic API.
13. Vertex AI - Handle `$id` in the response schema when calling Vertex AI. [Get Started](https://docs.litellm.ai/docs/providers/vertex#json-schema)
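A short sketch of the new `format` param, assuming it sits on the `image_url` content block as in the linked vision docs; the model name and image URL are illustrative:

```python
import litellm

# Sketch: explicitly tag the image content type via the `format` field
# (model name and image URL are illustrative).
response = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://example.com/photo.jpg",
                        "format": "image/jpeg",  # 👈 new param
                    },
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)
```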
## Spend Tracking Improvements

1. Batches API - Fix cost calculation to run on retrieve_batch. [Get Started](https://docs.litellm.ai/docs/batches)
2. Batches API - Log batch models in spend logs / standard logging payload. [Get Started](../../docs/proxy/logging_spec.md#standardlogginghiddenparams)
## Management Endpoints / UI

<Image img={require('../../img/release_notes/error_logs.jpg')} />

1. Virtual Keys Page
    - Allow team/org filters to be searchable on the Create Key page
    - Add `created_by` and `updated_by` fields to the Keys table
    - Show `user_email` on the key table
    - Show 100 keys per page, use full height, increase width of key alias
2. Logs Page
    - Show error logs on the LiteLLM UI
    - Allow internal users to view their own logs
3. Internal Users Page
    - Allow admins to control default model access for internal users
4. Fix session handling with cookies
## Logging / Guardrail Integrations

1. Fix Prometheus metrics w/ custom metrics, when keys containing `team_id` make requests. [PR](https://github.com/BerriAI/litellm/pull/8935)

## Performance / Loadbalancing / Reliability improvements

1. Cooldowns - Support cooldowns on models called with client-side credentials. [Get Started](https://docs.litellm.ai/docs/proxy/clientside_auth#pass-user-llm-api-keys--api-base)
2. Tag-based Routing - Ensure tag-based routing works across all endpoints (`/embeddings`, `/image_generation`, etc.). [Get Started](https://docs.litellm.ai/docs/proxy/tag_routing)
## General Proxy Improvements

1. Raise BadRequestError when an unknown model is passed in a request
2. Enforce model access restrictions on the Azure OpenAI proxy route
3. Reliability fix - Handle emojis in text - fix orjson error
4. Model Access Patch - don't overwrite `litellm.anthropic_models` when running auth checks
5. Enable setting timezone information in the docker image

## Complete Git Diff

[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.61.20-stable...v1.63.2-stable)
@ -41,10 +41,12 @@ const sidebars = {
"proxy/deploy",
"proxy/prod",
"proxy/cli",
"proxy/release_cycle",
"proxy/model_management",
"proxy/health",
"proxy/debugging",
"proxy/spending_monitoring",
"proxy/master_key_rotations",
],
},
"proxy/demo",

@ -99,7 +101,9 @@ const sidebars = {
"proxy/admin_ui_sso",
"proxy/self_serve",
"proxy/public_teams",
"proxy/custom_sso"
"proxy/custom_sso",
"proxy/ui_credentials",
"proxy/ui_logs"
],
},
{

@ -229,6 +233,7 @@ const sidebars = {
"providers/sambanova",
"providers/custom_llm_server",
"providers/petals",
"providers/snowflake"
],
},
{

@ -255,17 +260,23 @@ const sidebars = {
"completion/batching",
"completion/mock_requests",
"completion/reliable_completions",
'tutorials/litellm_proxy_aporia',

]
},
{
type: "category",
label: "Supported Endpoints",
link: {
type: "generated-index",
title: "Supported Endpoints",
description:
"Learn how to deploy + call models from different providers on LiteLLM",
slug: "/supported_endpoints",
},
items: [
{
type: "category",
label: "Chat",
label: "/chat/completions",
link: {
type: "generated-index",
title: "Chat Completions",

@ -278,11 +289,13 @@ const sidebars = {
"completion/usage",
],
},
"response_api",
"text_completion",
"embedding/supported_embedding",
"anthropic_unified",
{
type: "category",
label: "Image",
label: "/images",
items: [
"image_generation",
"image_variations",

@ -290,7 +303,7 @@ const sidebars = {
},
{
type: "category",
label: "Audio",
label: "/audio",
"items": [
"audio_transcription",
"text_to_speech",

@ -349,23 +362,6 @@ const sidebars = {
label: "LangChain, LlamaIndex, Instructor Integration",
items: ["langchain/langchain", "tutorials/instructor"],
},
{
type: "category",
label: "Tutorials",
items: [

'tutorials/azure_openai',
'tutorials/instructor',
"tutorials/gradio_integration",
"tutorials/huggingface_codellama",
"tutorials/huggingface_tutorial",
"tutorials/TogetherAI_liteLLM",
"tutorials/finetuned_chat_gpt",
"tutorials/text_completion",
"tutorials/first_playground",
"tutorials/model_fallbacks",
],
},
],
},
{

@ -382,13 +378,6 @@ const sidebars = {
"load_test_rpm",
]
},
{
type: "category",
label: "Adding Providers",
items: [
"adding_provider/directory_structure",
"adding_provider/new_rerank_provider"],
},
{
type: "category",
label: "Logging & Observability",

@ -423,12 +412,51 @@ const sidebars = {
"observability/opik_integration",
],
},
{
type: "category",
label: "Tutorials",
items: [
"tutorials/openweb_ui",
'tutorials/litellm_proxy_aporia',
{
type: "category",
label: "LiteLLM Python SDK Tutorials",
items: [

'tutorials/azure_openai',
'tutorials/instructor',
"tutorials/gradio_integration",
"tutorials/huggingface_codellama",
"tutorials/huggingface_tutorial",
"tutorials/TogetherAI_liteLLM",
"tutorials/finetuned_chat_gpt",
"tutorials/text_completion",
"tutorials/first_playground",
"tutorials/model_fallbacks",
],
},
]
},
{
type: "category",
label: "Contributing",
items: [
"extras/contributing_code",
{
type: "category",
label: "Adding Providers",
items: [
"adding_provider/directory_structure",
"adding_provider/new_rerank_provider"],
},
"extras/contributing",
"contributing",
]
},
{
type: "category",
label: "Extras",
items: [
"extras/contributing",
"data_security",
"data_retention",
"migration_policy",

@ -445,6 +473,7 @@ const sidebars = {
items: [
"projects/smolagents",
"projects/Docq.AI",
"projects/PDL",
"projects/OpenInterpreter",
"projects/Elroy",
"projects/dbally",

@ -460,9 +489,9 @@ const sidebars = {
"projects/YiVal",
"projects/LiteLLM Proxy",
"projects/llm_cord",
"projects/pgai",
],
},
"contributing",
"proxy/pii_masking",
"extras/code_quality",
"rules",
@ -163,7 +163,7 @@ class AporiaGuardrail(CustomGuardrail):

        pass

    async def async_moderation_hook(  ### 👈 KEY CHANGE ###
    async def async_moderation_hook(
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,

@ -173,6 +173,7 @@ class AporiaGuardrail(CustomGuardrail):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        from litellm.proxy.common_utils.callback_utils import (

@ -94,6 +94,7 @@ class _ENTERPRISE_GoogleTextModeration(CustomLogger):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        """

@ -107,6 +107,7 @@ class _ENTERPRISE_LlamaGuard(CustomLogger):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        """

@ -126,6 +126,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        """

@ -31,7 +31,7 @@ class _ENTERPRISE_OpenAI_Moderation(CustomLogger):

    #### CALL HOOKS - proxy only ####

    async def async_moderation_hook(  ### 👈 KEY CHANGE ###
    async def async_moderation_hook(
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,

@ -41,6 +41,7 @@ class _ENTERPRISE_OpenAI_Moderation(CustomLogger):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        text = ""
@ -8,12 +8,14 @@ import os
from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
from litellm.caching.llm_caching_handler import LLMClientCache
from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
from litellm.types.utils import (
    ImageObject,
    BudgetConfig,
    all_litellm_params,
    all_litellm_params as _litellm_completion_params,
    CredentialItem,
)  # maintain backwards compatibility for root param
from litellm._logging import (
    set_verbose,

@ -53,6 +55,7 @@ from litellm.constants import (
    cohere_embedding_models,
    bedrock_embedding_models,
    known_tokenizer_config,
    BEDROCK_INVOKE_PROVIDERS_LITERAL,
)
from litellm.types.guardrails import GuardrailItem
from litellm.proxy._types import (

@ -181,6 +184,7 @@ cloudflare_api_key: Optional[str] = None
baseten_key: Optional[str] = None
aleph_alpha_key: Optional[str] = None
nlp_cloud_key: Optional[str] = None
snowflake_key: Optional[str] = None
common_cloud_provider_auth_params: dict = {
    "params": ["project", "region_name", "token"],
    "providers": ["vertex_ai", "bedrock", "watsonx", "azure", "vertex_ai_beta"],

@ -190,15 +194,17 @@ ssl_verify: Union[str, bool] = True
ssl_certificate: Optional[str] = None
disable_streaming_logging: bool = False
disable_add_transform_inline_image_block: bool = False
in_memory_llm_clients_cache: InMemoryCache = InMemoryCache()
in_memory_llm_clients_cache: LLMClientCache = LLMClientCache()
safe_memory_mode: bool = False
enable_azure_ad_token_refresh: Optional[bool] = False
### DEFAULT AZURE API VERSION ###
AZURE_DEFAULT_API_VERSION = "2024-08-01-preview"  # this is updated to the latest
AZURE_DEFAULT_API_VERSION = "2025-02-01-preview"  # this is updated to the latest
### DEFAULT WATSONX API VERSION ###
WATSONX_DEFAULT_API_VERSION = "2024-03-13"
### COHERE EMBEDDINGS DEFAULT TYPE ###
COHERE_DEFAULT_EMBEDDING_INPUT_TYPE: COHERE_EMBEDDING_INPUT_TYPES = "search_document"
### CREDENTIALS ###
credential_list: List[CredentialItem] = []
### GUARDRAILS ###
llamaguard_model_name: Optional[str] = None
openai_moderations_model_name: Optional[str] = None

@ -278,8 +284,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None


force_ipv4: bool = (
    False  # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
)

@ -363,17 +367,7 @@ BEDROCK_CONVERSE_MODELS = [
    "meta.llama3-2-11b-instruct-v1:0",
    "meta.llama3-2-90b-instruct-v1:0",
]
BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
    "cohere",
    "anthropic",
    "mistral",
    "amazon",
    "meta",
    "llama",
    "ai21",
    "nova",
    "deepseek_r1",
]

####### COMPLETION MODELS ###################
open_ai_chat_completion_models: List = []
open_ai_text_completion_models: List = []

@ -425,6 +419,7 @@ cerebras_models: List = []
galadriel_models: List = []
sambanova_models: List = []
assemblyai_models: List = []
snowflake_models: List = []


def is_bedrock_pricing_only_model(key: str) -> bool:

@ -578,6 +573,8 @@ def add_known_models():
            assemblyai_models.append(key)
        elif value.get("litellm_provider") == "jina_ai":
            jina_ai_models.append(key)
        elif value.get("litellm_provider") == "snowflake":
            snowflake_models.append(key)


add_known_models()

@ -607,6 +604,7 @@ ollama_models = ["llama2"]

maritalk_models = ["maritalk"]


model_list = (
    open_ai_chat_completion_models
    + open_ai_text_completion_models

@ -651,6 +649,7 @@ model_list = (
    + azure_text_models
    + assemblyai_models
    + jina_ai_models
    + snowflake_models
)

model_list_set = set(model_list)

@ -706,6 +705,7 @@ models_by_provider: dict = {
    "sambanova": sambanova_models,
    "assemblyai": assemblyai_models,
    "jina_ai": jina_ai_models,
    "snowflake": snowflake_models,
}

# mapping for those models which have larger equivalents

@ -811,9 +811,6 @@ from .llms.oobabooga.chat.transformation import OobaboogaConfig
from .llms.maritalk import MaritalkConfig
from .llms.openrouter.chat.transformation import OpenrouterConfig
from .llms.anthropic.chat.transformation import AnthropicConfig
from .llms.anthropic.experimental_pass_through.transformation import (
    AnthropicExperimentalPassThroughConfig,
)
from .llms.groq.stt.transformation import GroqSTTConfig
from .llms.anthropic.completion.transformation import AnthropicTextConfig
from .llms.triton.completion.transformation import TritonConfig

@ -825,6 +822,7 @@ from .llms.databricks.embed.transformation import DatabricksEmbeddingConfig
from .llms.predibase.chat.transformation import PredibaseConfig
from .llms.replicate.chat.transformation import ReplicateConfig
from .llms.cohere.completion.transformation import CohereTextConfig as CohereConfig
from .llms.snowflake.chat.transformation import SnowflakeConfig
from .llms.cohere.rerank.transformation import CohereRerankConfig
from .llms.cohere.rerank_v2.transformation import CohereRerankV2Config
from .llms.azure_ai.rerank.transformation import AzureAIRerankConfig

@ -832,6 +830,9 @@ from .llms.infinity.rerank.transformation import InfinityRerankConfig
from .llms.jina_ai.rerank.transformation import JinaAIRerankConfig
from .llms.clarifai.chat.transformation import ClarifaiConfig
from .llms.ai21.chat.transformation import AI21ChatConfig, AI21ChatConfig as AI21Config
from .llms.anthropic.experimental_pass_through.messages.transformation import (
    AnthropicMessagesConfig,
)
from .llms.together_ai.chat import TogetherAIConfig
from .llms.together_ai.completion.transformation import TogetherAITextCompletionConfig
from .llms.cloudflare.chat.transformation import CloudflareChatConfig

@ -912,6 +913,7 @@ from .llms.bedrock.chat.invoke_transformations.base_invoke_transformation import
from .llms.bedrock.image.amazon_stability1_transformation import AmazonStabilityConfig
from .llms.bedrock.image.amazon_stability3_transformation import AmazonStability3Config
from .llms.bedrock.image.amazon_nova_canvas_transformation import AmazonNovaCanvasConfig
from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config
from .llms.bedrock.embed.amazon_titan_multimodal_transformation import (
    AmazonTitanMultimodalEmbeddingG1Config,

@ -934,11 +936,14 @@ from .llms.groq.chat.transformation import GroqChatConfig
from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig
from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
from .llms.mistral.mistral_chat_transformation import MistralConfig
from .llms.openai.responses.transformation import OpenAIResponsesAPIConfig
from .llms.openai.chat.o_series_transformation import (
    OpenAIOSeriesConfig as OpenAIO1Config,  # maintain backwards compatibility
    OpenAIOSeriesConfig,
)

from .llms.snowflake.chat.transformation import SnowflakeConfig

openaiOSeriesConfig = OpenAIOSeriesConfig()
from .llms.openai.chat.gpt_transformation import (
    OpenAIGPTConfig,

@ -1022,6 +1027,8 @@ from .assistants.main import *
from .batches.main import *
from .batch_completion.main import *  # type: ignore
from .rerank_api.main import *
from .llms.anthropic.experimental_pass_through.messages.handler import *
from .responses.main import *
from .realtime_api.main import _arealtime
from .fine_tuning.main import *
from .files.main import *
@ -182,9 +182,7 @@ def init_redis_cluster(redis_kwargs) -> redis.RedisCluster:
            "REDIS_CLUSTER_NODES environment variable is not valid JSON. Please ensure it's properly formatted."
        )

    verbose_logger.debug(
        "init_redis_cluster: startup nodes are being initialized."
    )
    verbose_logger.debug("init_redis_cluster: startup nodes are being initialized.")
    from redis.cluster import ClusterNode

    args = _get_redis_cluster_kwargs()

@ -307,7 +305,6 @@ def get_redis_async_client(
        return _init_async_redis_sentinel(redis_kwargs)

    return async_redis.Redis(
        socket_timeout=5,
        **redis_kwargs,
    )
|
@ -1,186 +0,0 @@
|
|||
# What is this?
|
||||
## Translates OpenAI call to Anthropic `/v1/messages` format
|
||||
import traceback
|
||||
from typing import Any, Optional
|
||||
|
||||
import litellm
|
||||
from litellm import ChatCompletionRequest, verbose_logger
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
|
||||
from litellm.types.utils import AdapterCompletionStreamWrapper, ModelResponse
|
||||
|
||||
|
||||
class AnthropicAdapter(CustomLogger):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def translate_completion_input_params(
|
||||
self, kwargs
|
||||
) -> Optional[ChatCompletionRequest]:
|
||||
"""
|
||||
- translate params, where needed
|
||||
- pass rest, as is
|
||||
"""
|
||||
request_body = AnthropicMessagesRequest(**kwargs) # type: ignore
|
||||
|
||||
translated_body = litellm.AnthropicExperimentalPassThroughConfig().translate_anthropic_to_openai(
|
||||
anthropic_message_request=request_body
|
||||
)
|
||||
|
||||
return translated_body
|
||||
|
||||
def translate_completion_output_params(
|
||||
self, response: ModelResponse
|
||||
) -> Optional[AnthropicResponse]:
|
||||
|
||||
return litellm.AnthropicExperimentalPassThroughConfig().translate_openai_response_to_anthropic(
|
||||
response=response
|
||||
)
|
||||
|
||||
def translate_completion_output_params_streaming(
|
||||
self, completion_stream: Any
|
||||
) -> AdapterCompletionStreamWrapper | None:
|
||||
return AnthropicStreamWrapper(completion_stream=completion_stream)
|
||||
|
||||
|
||||
anthropic_adapter = AnthropicAdapter()
|
||||
|
||||
|
||||
class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
|
||||
"""
|
||||
- first chunk return 'message_start'
|
||||
- content block must be started and stopped
|
||||
- finish_reason must map exactly to anthropic reason, else anthropic client won't be able to parse it.
|
||||
"""
|
||||
|
||||
sent_first_chunk: bool = False
|
||||
sent_content_block_start: bool = False
|
||||
sent_content_block_finish: bool = False
|
||||
sent_last_message: bool = False
|
||||
holding_chunk: Optional[Any] = None
|
||||
|
||||
def __next__(self):
|
||||
try:
|
||||
if self.sent_first_chunk is False:
|
||||
self.sent_first_chunk = True
|
||||
return {
|
||||
"type": "message_start",
|
||||
"message": {
|
||||
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
|
||||
"type": "message",
|
||||
"role": "assistant",
|
||||
"content": [],
|
||||
"model": "claude-3-5-sonnet-20240620",
|
||||
"stop_reason": None,
|
||||
"stop_sequence": None,
|
||||
"usage": {"input_tokens": 25, "output_tokens": 1},
|
||||
},
|
||||
}
|
||||
if self.sent_content_block_start is False:
|
||||
self.sent_content_block_start = True
|
||||
return {
|
||||
"type": "content_block_start",
|
||||
"index": 0,
|
||||
"content_block": {"type": "text", "text": ""},
|
||||
}
|
||||
|
||||
for chunk in self.completion_stream:
|
||||
if chunk == "None" or chunk is None:
|
||||
raise Exception
|
||||
|
||||
processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
|
||||
response=chunk
|
||||
)
|
||||
if (
|
||||
processed_chunk["type"] == "message_delta"
|
||||
and self.sent_content_block_finish is False
|
||||
):
|
||||
self.holding_chunk = processed_chunk
|
||||
self.sent_content_block_finish = True
|
||||
return {
|
||||
"type": "content_block_stop",
|
||||
"index": 0,
|
||||
}
|
||||
elif self.holding_chunk is not None:
|
||||
return_chunk = self.holding_chunk
|
||||
self.holding_chunk = processed_chunk
|
||||
return return_chunk
|
||||
else:
|
||||
return processed_chunk
|
||||
if self.holding_chunk is not None:
|
||||
return_chunk = self.holding_chunk
|
||||
self.holding_chunk = None
|
||||
return return_chunk
|
||||
if self.sent_last_message is False:
|
||||
self.sent_last_message = True
|
||||
return {"type": "message_stop"}
|
||||
raise StopIteration
|
||||
except StopIteration:
|
||||
if self.sent_last_message is False:
|
||||
self.sent_last_message = True
|
||||
return {"type": "message_stop"}
|
||||
raise StopIteration
|
||||
except Exception as e:
|
||||
verbose_logger.error(
|
||||
"Anthropic Adapter - {}\n{}".format(e, traceback.format_exc())
|
||||
)
|
||||
|
||||
async def __anext__(self):
|
||||
try:
|
||||
if self.sent_first_chunk is False:
|
||||
self.sent_first_chunk = True
|
||||
return {
|
||||
"type": "message_start",
|
||||
"message": {
|
||||
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
|
||||
"type": "message",
|
||||
"role": "assistant",
|
||||
"content": [],
|
||||
"model": "claude-3-5-sonnet-20240620",
|
||||
"stop_reason": None,
|
||||
"stop_sequence": None,
|
||||
"usage": {"input_tokens": 25, "output_tokens": 1},
|
||||
},
|
||||
}
|
||||
if self.sent_content_block_start is False:
|
||||
self.sent_content_block_start = True
|
||||
return {
|
||||
"type": "content_block_start",
|
||||
"index": 0,
|
||||
"content_block": {"type": "text", "text": ""},
|
||||
}
|
||||
async for chunk in self.completion_stream:
|
||||
if chunk == "None" or chunk is None:
|
||||
raise Exception
|
||||
processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
|
||||
response=chunk
|
||||
)
|
||||
if (
|
||||
processed_chunk["type"] == "message_delta"
|
||||
and self.sent_content_block_finish is False
|
||||
):
|
||||
self.holding_chunk = processed_chunk
|
||||
self.sent_content_block_finish = True
|
||||
return {
|
||||
"type": "content_block_stop",
|
||||
"index": 0,
|
||||
}
|
||||
elif self.holding_chunk is not None:
|
||||
return_chunk = self.holding_chunk
|
||||
self.holding_chunk = processed_chunk
|
||||
return return_chunk
|
||||
else:
|
||||
return processed_chunk
|
||||
if self.holding_chunk is not None:
|
||||
return_chunk = self.holding_chunk
|
||||
self.holding_chunk = None
|
||||
return return_chunk
|
||||
if self.sent_last_message is False:
|
||||
self.sent_last_message = True
|
||||
return {"type": "message_stop"}
|
||||
raise StopIteration
|
||||
except StopIteration:
|
||||
if self.sent_last_message is False:
|
||||
self.sent_last_message = True
|
||||
return {"type": "message_stop"}
|
||||
raise StopAsyncIteration
|
|
@ -15,6 +15,7 @@ import litellm
from litellm.types.router import GenericLiteLLMParams
from litellm.utils import (
    exception_type,
    get_litellm_params,
    get_llm_provider,
    get_secret,
    supports_httpx_timeout,

@ -86,6 +87,7 @@ def get_assistants(
    optional_params = GenericLiteLLMParams(
        api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
    )
    litellm_params_dict = get_litellm_params(**kwargs)

    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600

@ -169,6 +171,7 @@ def get_assistants(
            max_retries=optional_params.max_retries,
            client=client,
            aget_assistants=aget_assistants,  # type: ignore
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(

@ -270,6 +273,7 @@ def create_assistants(
    optional_params = GenericLiteLLMParams(
        api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
    )
    litellm_params_dict = get_litellm_params(**kwargs)

    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600

@ -371,6 +375,7 @@ def create_assistants(
            client=client,
            async_create_assistants=async_create_assistants,
            create_assistant_data=create_assistant_data,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(

@ -445,6 +450,8 @@ def delete_assistant(
        api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
    )

    litellm_params_dict = get_litellm_params(**kwargs)

    async_delete_assistants: Optional[bool] = kwargs.pop(
        "async_delete_assistants", None
    )

@ -544,6 +551,7 @@ def delete_assistant(
            max_retries=optional_params.max_retries,
            client=client,
            async_delete_assistants=async_delete_assistants,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(

@ -639,6 +647,7 @@ def create_thread(
    """
    acreate_thread = kwargs.get("acreate_thread", None)
    optional_params = GenericLiteLLMParams(**kwargs)
    litellm_params_dict = get_litellm_params(**kwargs)

    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600

@ -731,6 +740,7 @@ def create_thread(
            max_retries=optional_params.max_retries,
            client=client,
            acreate_thread=acreate_thread,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(

@ -795,7 +805,7 @@ def get_thread(
    """Get the thread object, given a thread_id"""
    aget_thread = kwargs.pop("aget_thread", None)
    optional_params = GenericLiteLLMParams(**kwargs)

    litellm_params_dict = get_litellm_params(**kwargs)
    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
    # set timeout for 10 minutes by default

@ -884,6 +894,7 @@ def get_thread(
            max_retries=optional_params.max_retries,
            client=client,
            aget_thread=aget_thread,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(

@ -972,6 +983,7 @@ def add_message(
    _message_data = MessageData(
        role=role, content=content, attachments=attachments, metadata=metadata
    )
    litellm_params_dict = get_litellm_params(**kwargs)
    optional_params = GenericLiteLLMParams(**kwargs)

    message_data = get_optional_params_add_message(

@ -1068,6 +1080,7 @@ def add_message(
            max_retries=optional_params.max_retries,
            client=client,
            a_add_message=a_add_message,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(

@ -1139,6 +1152,7 @@ def get_messages(
) -> SyncCursorPage[OpenAIMessage]:
    aget_messages = kwargs.pop("aget_messages", None)
    optional_params = GenericLiteLLMParams(**kwargs)
    litellm_params_dict = get_litellm_params(**kwargs)

    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600

@ -1225,6 +1239,7 @@ def get_messages(
            max_retries=optional_params.max_retries,
            client=client,
            aget_messages=aget_messages,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(

@ -1337,6 +1352,7 @@ def run_thread(
    """Run a given thread + assistant."""
    arun_thread = kwargs.pop("arun_thread", None)
    optional_params = GenericLiteLLMParams(**kwargs)
    litellm_params_dict = get_litellm_params(**kwargs)

    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600

@ -1437,6 +1453,7 @@ def run_thread(
            max_retries=optional_params.max_retries,
            client=client,
            arun_thread=arun_thread,
            litellm_params=litellm_params_dict,
        )  # type: ignore
    else:
        raise litellm.exceptions.BadRequestError(
@ -1,76 +1,16 @@
import asyncio
import datetime
import json
import threading
from typing import Any, List, Literal, Optional
from typing import Any, List, Literal, Tuple

import litellm
from litellm._logging import verbose_logger
from litellm.constants import (
    BATCH_STATUS_POLL_INTERVAL_SECONDS,
    BATCH_STATUS_POLL_MAX_ATTEMPTS,
)
from litellm.files.main import afile_content
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.types.llms.openai import Batch
from litellm.types.utils import StandardLoggingPayload, Usage


async def batches_async_logging(
    batch_id: str,
    custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
    logging_obj: Optional[LiteLLMLoggingObj] = None,
    **kwargs,
):
    """
    Async Job waits for the batch to complete and then logs the completed batch usage - cost, total tokens, prompt tokens, completion tokens

    Polls retrieve_batch until it returns a batch with status "completed" or "failed"
    """
    from .main import aretrieve_batch

    verbose_logger.debug(
        ".....in _batches_async_logging... polling retrieve to get batch status"
    )
    if logging_obj is None:
        raise ValueError(
            "logging_obj is None cannot calculate cost / log batch creation event"
        )
    for _ in range(BATCH_STATUS_POLL_MAX_ATTEMPTS):
        try:
            start_time = datetime.datetime.now()
            batch: Batch = await aretrieve_batch(batch_id, custom_llm_provider)
            verbose_logger.debug(
                "in _batches_async_logging... batch status= %s", batch.status
            )

            if batch.status == "completed":
                end_time = datetime.datetime.now()
                await _handle_completed_batch(
                    batch=batch,
                    custom_llm_provider=custom_llm_provider,
                    logging_obj=logging_obj,
                    start_time=start_time,
                    end_time=end_time,
                    **kwargs,
                )
                break
            elif batch.status == "failed":
                pass
        except Exception as e:
            verbose_logger.error("error in batches_async_logging", e)
        await asyncio.sleep(BATCH_STATUS_POLL_INTERVAL_SECONDS)
from litellm.types.utils import CallTypes, Usage


async def _handle_completed_batch(
    batch: Batch,
    custom_llm_provider: Literal["openai", "azure", "vertex_ai"],
    logging_obj: LiteLLMLoggingObj,
    start_time: datetime.datetime,
    end_time: datetime.datetime,
    **kwargs,
) -> None:
) -> Tuple[float, Usage, List[str]]:
    """Helper function to process a completed batch and handle logging"""
    # Get batch results
    file_content_dictionary = await _get_batch_output_file_content_as_dictionary(

@ -87,49 +27,25 @@ async def _handle_completed_batch(
        custom_llm_provider=custom_llm_provider,
    )

    # Handle logging
    await _log_completed_batch(
        logging_obj=logging_obj,
        batch_usage=batch_usage,
        batch_cost=batch_cost,
        start_time=start_time,
        end_time=end_time,
        **kwargs,
    )
    batch_models = _get_batch_models_from_file_content(file_content_dictionary)

    return batch_cost, batch_usage, batch_models


async def _log_completed_batch(
    logging_obj: LiteLLMLoggingObj,
    batch_usage: Usage,
    batch_cost: float,
    start_time: datetime.datetime,
    end_time: datetime.datetime,
    **kwargs,
) -> None:
    """Helper function to handle all logging operations for a completed batch"""
    logging_obj.call_type = "batch_success"

    standard_logging_object = _create_standard_logging_object_for_completed_batch(
        kwargs=kwargs,
        start_time=start_time,
        end_time=end_time,
        logging_obj=logging_obj,
        batch_usage_object=batch_usage,
        response_cost=batch_cost,
    )

    logging_obj.model_call_details["standard_logging_object"] = standard_logging_object

    # Launch async and sync logging handlers
    asyncio.create_task(
        logging_obj.async_success_handler(
            result=None,
            start_time=start_time,
            end_time=end_time,
            cache_hit=None,
        )
    )
    logging_obj.success_handler(None, start_time, end_time)
def _get_batch_models_from_file_content(
    file_content_dictionary: List[dict],
) -> List[str]:
    """
    Get the models from the file content
    """
    batch_models = []
    for _item in file_content_dictionary:
        if _batch_response_was_successful(_item):
            _response_body = _get_response_from_batch_job_output_file(_item)
            _model = _response_body.get("model")
            if _model:
                batch_models.append(_model)
    return batch_models


async def _batch_cost_calculator(

@ -156,6 +72,8 @@ async def _get_batch_output_file_content_as_dictionary(
    """
    Get the batch output file content as a list of dictionaries
    """
    from litellm.files.main import afile_content

    if custom_llm_provider == "vertex_ai":
        raise ValueError("Vertex AI does not support file content retrieval")

@ -205,6 +123,7 @@ def _get_batch_job_cost_from_file_content(
            total_cost += litellm.completion_cost(
                completion_response=_response_body,
                custom_llm_provider=custom_llm_provider,
                call_type=CallTypes.aretrieve_batch.value,
            )
            verbose_logger.debug("total_cost=%s", total_cost)
    return total_cost

@ -261,30 +180,3 @@ def _batch_response_was_successful(batch_job_output_file: dict) -> bool:
    """
    _response: dict = batch_job_output_file.get("response", None) or {}
    return _response.get("status_code", None) == 200


def _create_standard_logging_object_for_completed_batch(
    kwargs: dict,
    start_time: datetime.datetime,
    end_time: datetime.datetime,
    logging_obj: LiteLLMLoggingObj,
    batch_usage_object: Usage,
    response_cost: float,
) -> StandardLoggingPayload:
    """
    Create a standard logging object for a completed batch
    """
    standard_logging_object = logging_obj.model_call_details.get(
        "standard_logging_object", None
    )

    if standard_logging_object is None:
        raise ValueError("unable to create standard logging object for completed batch")

    # Add Completed Batch Job Usage and Response Cost
    standard_logging_object["call_type"] = "batch_success"
    standard_logging_object["response_cost"] = response_cost
    standard_logging_object["total_tokens"] = batch_usage_object.total_tokens
    standard_logging_object["prompt_tokens"] = batch_usage_object.prompt_tokens
    standard_logging_object["completion_tokens"] = batch_usage_object.completion_tokens
    return standard_logging_object
@ -31,10 +31,9 @@ from litellm.types.llms.openai import (
|
|||
RetrieveBatchRequest,
|
||||
)
|
||||
from litellm.types.router import GenericLiteLLMParams
|
||||
from litellm.types.utils import LiteLLMBatch
|
||||
from litellm.utils import client, get_litellm_params, supports_httpx_timeout
|
||||
|
||||
from .batch_utils import batches_async_logging
|
||||
|
||||
####### ENVIRONMENT VARIABLES ###################
|
||||
openai_batches_instance = OpenAIBatchesAPI()
|
||||
azure_batches_instance = AzureBatchesAPI()
|
||||
|
@ -85,17 +84,6 @@ async def acreate_batch(
|
|||
else:
|
||||
response = init_response
|
||||
|
||||
# Start async logging job
|
||||
if response is not None:
|
||||
asyncio.create_task(
|
||||
batches_async_logging(
|
||||
logging_obj=kwargs.get("litellm_logging_obj", None),
|
||||
batch_id=response.id,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
**kwargs,
|
||||
)
|
||||
)
|
||||
|
||||
return response
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
@ -111,7 +99,7 @@ def create_batch(
|
|||
extra_headers: Optional[Dict[str, str]] = None,
|
||||
extra_body: Optional[Dict[str, str]] = None,
|
||||
**kwargs,
|
||||
) -> Union[Batch, Coroutine[Any, Any, Batch]]:
|
||||
) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
|
||||
"""
|
||||
Creates and executes a batch from an uploaded file of request
|
||||
|
||||
|
@ -119,21 +107,27 @@ def create_batch(
|
|||
"""
|
||||
try:
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
litellm_call_id = kwargs.get("litellm_call_id", None)
|
||||
proxy_server_request = kwargs.get("proxy_server_request", None)
|
||||
model_info = kwargs.get("model_info", None)
|
||||
_is_async = kwargs.pop("acreate_batch", False) is True
|
||||
litellm_params = get_litellm_params(**kwargs)
|
||||
litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
litellm_params = get_litellm_params(
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
litellm_call_id=kwargs.get("litellm_call_id", None),
|
||||
litellm_trace_id=kwargs.get("litellm_trace_id"),
|
||||
litellm_metadata=kwargs.get("litellm_metadata"),
|
||||
)
|
||||
litellm_logging_obj.update_environment_variables(
|
||||
model=None,
|
||||
user=None,
|
||||
optional_params=optional_params.model_dump(),
|
||||
litellm_params=litellm_params,
|
||||
litellm_params={
|
||||
"litellm_call_id": litellm_call_id,
|
||||
"proxy_server_request": proxy_server_request,
|
||||
"model_info": model_info,
|
||||
"metadata": metadata,
|
||||
"preset_cache_key": None,
|
||||
"stream_response": {},
|
||||
**optional_params.model_dump(exclude_unset=True),
|
||||
},
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
)
|
||||
|
||||
|
@ -224,6 +218,7 @@ def create_batch(
|
|||
timeout=timeout,
|
||||
max_retries=optional_params.max_retries,
|
||||
create_batch_data=_create_batch_request,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
elif custom_llm_provider == "vertex_ai":
|
||||
api_base = optional_params.api_base or ""
|
||||
|
@ -261,7 +256,7 @@ def create_batch(
|
|||
response=httpx.Response(
|
||||
status_code=400,
|
||||
content="Unsupported provider",
|
||||
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
|
||||
request=httpx.Request(method="create_batch", url="https://github.com/BerriAI/litellm"), # type: ignore
|
||||
),
|
||||
)
|
||||
return response
|
||||
|
@ -269,6 +264,7 @@ def create_batch(
|
|||
raise e
|
||||
|
||||
|
||||
@client
|
||||
async def aretrieve_batch(
|
||||
batch_id: str,
|
||||
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
|
||||
|
@ -276,7 +272,7 @@ async def aretrieve_batch(
|
|||
extra_headers: Optional[Dict[str, str]] = None,
|
||||
extra_body: Optional[Dict[str, str]] = None,
|
||||
**kwargs,
|
||||
) -> Batch:
|
||||
) -> LiteLLMBatch:
|
||||
"""
|
||||
Async: Retrieves a batch.
|
||||
|
||||
|
@ -310,6 +306,7 @@ async def aretrieve_batch(
|
|||
raise e
|
||||
|
||||
|
||||
@client
|
||||
def retrieve_batch(
|
||||
batch_id: str,
|
||||
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
|
||||
|
@ -317,7 +314,7 @@ def retrieve_batch(
|
|||
extra_headers: Optional[Dict[str, str]] = None,
|
||||
extra_body: Optional[Dict[str, str]] = None,
|
||||
**kwargs,
|
||||
) -> Union[Batch, Coroutine[Any, Any, Batch]]:
|
||||
) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
|
||||
"""
|
||||
Retrieves a batch.
|
||||
|
||||
|
@ -325,9 +322,20 @@ def retrieve_batch(
|
|||
"""
|
||||
try:
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
# set timeout for 10 minutes by default
|
||||
litellm_params = get_litellm_params(
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
**kwargs,
|
||||
)
|
||||
litellm_logging_obj.update_environment_variables(
|
||||
model=None,
|
||||
user=None,
|
||||
optional_params=optional_params.model_dump(),
|
||||
litellm_params=litellm_params,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
)
|
||||
|
||||
if (
|
||||
timeout is not None
|
||||
|
@ -415,6 +423,7 @@ def retrieve_batch(
|
|||
timeout=timeout,
|
||||
max_retries=optional_params.max_retries,
|
||||
retrieve_batch_data=_retrieve_batch_request,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
elif custom_llm_provider == "vertex_ai":
|
||||
api_base = optional_params.api_base or ""
|
||||
|
@ -517,6 +526,10 @@ def list_batches(
    try:
        # set API KEY
        optional_params = GenericLiteLLMParams(**kwargs)
        litellm_params = get_litellm_params(
            custom_llm_provider=custom_llm_provider,
            **kwargs,
        )
        api_key = (
            optional_params.api_key
            or litellm.api_key  # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
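Note: API-key resolution is first-non-None: the per-call key, then the module-level litellm.api_key, then (in the full code) provider-specific environment variables. Sketch:

# illustrative sketch, not part of the diff
import os
from typing import Optional

def resolve_api_key(explicit: Optional[str], module_level: Optional[str]) -> Optional[str]:
    # illustrative only; the real chain also checks provider-specific keys
    return explicit or module_level or os.environ.get("OPENAI_API_KEY")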
@ -594,6 +607,7 @@ def list_batches(
                api_version=api_version,
                timeout=timeout,
                max_retries=optional_params.max_retries,
                litellm_params=litellm_params,
            )
        else:
            raise litellm.exceptions.BadRequestError(
@ -669,6 +683,10 @@ def cancel_batch(
    """
    try:
        optional_params = GenericLiteLLMParams(**kwargs)
        litellm_params = get_litellm_params(
            custom_llm_provider=custom_llm_provider,
            **kwargs,
        )
        ### TIMEOUT LOGIC ###
        timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
        # set timeout for 10 minutes by default
@ -756,6 +774,7 @@ def cancel_batch(
                timeout=timeout,
                max_retries=optional_params.max_retries,
                cancel_batch_data=_cancel_batch_request,
                litellm_params=litellm_params,
            )
        else:
            raise litellm.exceptions.BadRequestError(

@ -13,26 +13,14 @@ import json
import time
import traceback
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Union
from typing import Any, Dict, List, Optional, Union

from openai.types.audio.transcription_create_params import TranscriptionCreateParams
from openai.types.chat.completion_create_params import (
    CompletionCreateParamsNonStreaming,
    CompletionCreateParamsStreaming,
)
from openai.types.completion_create_params import (
    CompletionCreateParamsNonStreaming as TextCompletionCreateParamsNonStreaming,
)
from openai.types.completion_create_params import (
    CompletionCreateParamsStreaming as TextCompletionCreateParamsStreaming,
)
from openai.types.embedding_create_params import EmbeddingCreateParams
from pydantic import BaseModel

import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.types.caching import *
from litellm.types.rerank import RerankRequest
from litellm.types.utils import all_litellm_params

from .base_cache import BaseCache
@ -257,7 +245,7 @@ class Cache:
            verbose_logger.debug("\nReturning preset cache key: %s", preset_cache_key)
            return preset_cache_key

        combined_kwargs = self._get_relevant_args_to_use_for_cache_key()
        combined_kwargs = ModelParamHelper._get_all_llm_api_params()
        litellm_param_kwargs = all_litellm_params
        for param in kwargs:
            if param in combined_kwargs:
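Note: cache-key construction now takes its allow-list from ModelParamHelper._get_all_llm_api_params() instead of the Cache instance's own helpers; the filtering loop itself is unchanged. The idea in isolation:

# illustrative sketch, not part of the diff
def build_cache_key_fields(kwargs: dict, allowed: set, exclude: frozenset = frozenset({"metadata"})) -> dict:
    # keep only kwargs that are genuine LLM API params and not excluded from the key
    return {k: kwargs[k] for k in sorted(kwargs) if k in allowed and k not in exclude}

allowed = {"model", "messages", "temperature"}  # illustrative subset
print(build_cache_key_fields({"model": "gpt-4o", "metadata": {"team": "a"}, "internal_id": "x"}, allowed))
# {'model': 'gpt-4o'}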
@ -364,76 +352,6 @@ class Cache:
        if "litellm_params" in kwargs:
            kwargs["litellm_params"]["preset_cache_key"] = preset_cache_key

    def _get_relevant_args_to_use_for_cache_key(self) -> Set[str]:
        """
        Gets the supported kwargs for each call type and combines them
        """
        chat_completion_kwargs = self._get_litellm_supported_chat_completion_kwargs()
        text_completion_kwargs = self._get_litellm_supported_text_completion_kwargs()
        embedding_kwargs = self._get_litellm_supported_embedding_kwargs()
        transcription_kwargs = self._get_litellm_supported_transcription_kwargs()
        rerank_kwargs = self._get_litellm_supported_rerank_kwargs()
        exclude_kwargs = self._get_kwargs_to_exclude_from_cache_key()

        combined_kwargs = chat_completion_kwargs.union(
            text_completion_kwargs,
            embedding_kwargs,
            transcription_kwargs,
            rerank_kwargs,
        )
        combined_kwargs = combined_kwargs.difference(exclude_kwargs)
        return combined_kwargs

    def _get_litellm_supported_chat_completion_kwargs(self) -> Set[str]:
        """
        Get the litellm supported chat completion kwargs

        This follows the OpenAI API Spec
        """
        all_chat_completion_kwargs = set(
            CompletionCreateParamsNonStreaming.__annotations__.keys()
        ).union(set(CompletionCreateParamsStreaming.__annotations__.keys()))
        return all_chat_completion_kwargs

    def _get_litellm_supported_text_completion_kwargs(self) -> Set[str]:
        """
        Get the litellm supported text completion kwargs

        This follows the OpenAI API Spec
        """
        all_text_completion_kwargs = set(
            TextCompletionCreateParamsNonStreaming.__annotations__.keys()
        ).union(set(TextCompletionCreateParamsStreaming.__annotations__.keys()))
        return all_text_completion_kwargs

    def _get_litellm_supported_rerank_kwargs(self) -> Set[str]:
        """
        Get the litellm supported rerank kwargs
        """
        return set(RerankRequest.model_fields.keys())

    def _get_litellm_supported_embedding_kwargs(self) -> Set[str]:
        """
        Get the litellm supported embedding kwargs

        This follows the OpenAI API Spec
        """
        return set(EmbeddingCreateParams.__annotations__.keys())

    def _get_litellm_supported_transcription_kwargs(self) -> Set[str]:
        """
        Get the litellm supported transcription kwargs

        This follows the OpenAI API Spec
        """
        return set(TranscriptionCreateParams.__annotations__.keys())

    def _get_kwargs_to_exclude_from_cache_key(self) -> Set[str]:
        """
        Get the kwargs to exclude from the cache key
        """
        return set(["metadata"])

    @staticmethod
    def _get_hashed_cache_key(cache_key: str) -> str:
        """
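Note: the ~70 removed lines derived the allow-list by unioning the __annotations__ keys of OpenAI's request TypedDicts; that logic now lives behind ModelParamHelper. The underlying trick still works standalone:

# illustrative sketch, not part of the diff
from openai.types.chat.completion_create_params import (
    CompletionCreateParamsNonStreaming,
    CompletionCreateParamsStreaming,
)

# TypedDicts expose their declared fields via __annotations__
chat_params = set(CompletionCreateParamsNonStreaming.__annotations__) | set(
    CompletionCreateParamsStreaming.__annotations__
)
print("messages" in chat_params, "temperature" in chat_params)  # True True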
@ -247,7 +247,6 @@ class LLMCachingHandler:
            pass
        else:
            call_type = original_function.__name__

            cached_result = self._convert_cached_result_to_model_response(
                cached_result=cached_result,
                call_type=call_type,
@ -719,6 +718,7 @@ class LLMCachingHandler:
        """
        Sync internal method to add the result to the cache
        """

        new_kwargs = kwargs.copy()
        new_kwargs.update(
            convert_args_to_kwargs(
@ -732,6 +732,7 @@ class LLMCachingHandler:
        if self._should_store_result_in_cache(
            original_function=self.original_function, kwargs=new_kwargs
        ):

            litellm.cache.add_cache(result, **new_kwargs)

        return
@ -783,6 +784,7 @@ class LLMCachingHandler:
        - Else append the chunk to self.async_streaming_chunks

        """

        complete_streaming_response: Optional[
            Union[ModelResponse, TextCompletionResponse]
        ] = _assemble_complete_response_from_streaming_chunks(
@ -793,7 +795,6 @@ class LLMCachingHandler:
            streaming_chunks=self.async_streaming_chunks,
            is_async=True,
        )

        # if a complete_streaming_response is assembled, add it to the cache
        if complete_streaming_response is not None:
            await self.async_set_cache(

litellm/caching/llm_caching_handler.py (new file, 40 lines)
@ -0,0 +1,40 @@
"""
Add the event loop to the cache key, to prevent event loop closed errors.
"""

import asyncio

from .in_memory_cache import InMemoryCache


class LLMClientCache(InMemoryCache):

    def update_cache_key_with_event_loop(self, key):
        """
        Add the event loop to the cache key, to prevent event loop closed errors.
        If none, use the key as is.
        """
        try:
            event_loop = asyncio.get_event_loop()
            stringified_event_loop = str(id(event_loop))
            return f"{key}-{stringified_event_loop}"
        except Exception:  # handle no current event loop
            return key

    def set_cache(self, key, value, **kwargs):
        key = self.update_cache_key_with_event_loop(key)
        return super().set_cache(key, value, **kwargs)

    async def async_set_cache(self, key, value, **kwargs):
        key = self.update_cache_key_with_event_loop(key)
        return await super().async_set_cache(key, value, **kwargs)

    def get_cache(self, key, **kwargs):
        key = self.update_cache_key_with_event_loop(key)
        return super().get_cache(key, **kwargs)

    async def async_get_cache(self, key, **kwargs):
        key = self.update_cache_key_with_event_loop(key)
        return await super().async_get_cache(key, **kwargs)
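Note: every key is suffixed with id() of the current event loop, so a client cached under a since-closed loop is never handed to a different loop. Quick usage sketch against the class above:

# illustrative sketch, not part of the diff
import asyncio
from litellm.caching.llm_caching_handler import LLMClientCache

cache = LLMClientCache()

async def main():
    cache.set_cache("openai-client", object())
    # same loop -> same derived key -> cache hit
    assert cache.get_cache("openai-client") is not None

asyncio.run(main())
# a later asyncio.run() starts a fresh loop, producing a different key,
# so the entry stored under the closed loop is never returned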
@ -54,6 +54,7 @@ class RedisCache(BaseCache):
        redis_flush_size: Optional[int] = 100,
        namespace: Optional[str] = None,
        startup_nodes: Optional[List] = None,  # for redis-cluster
        socket_timeout: Optional[float] = 5.0,  # default 5 second timeout
        **kwargs,
    ):
@ -70,6 +71,9 @@ class RedisCache(BaseCache):
            redis_kwargs["password"] = password
        if startup_nodes is not None:
            redis_kwargs["startup_nodes"] = startup_nodes
        if socket_timeout is not None:
            redis_kwargs["socket_timeout"] = socket_timeout

        ### HEALTH MONITORING OBJECT ###
        if kwargs.get("service_logger_obj", None) is not None and isinstance(
            kwargs["service_logger_obj"], ServiceLogging
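Note: socket_timeout is forwarded verbatim into the redis client kwargs, so a hung TCP read fails after ~5s instead of blocking a worker indefinitely. Standalone equivalent with redis-py:

# illustrative sketch, not part of the diff
import redis

# a bounded socket timeout turns silent hangs into catchable errors
client = redis.Redis(host="localhost", port=6379, socket_timeout=5.0)
try:
    client.ping()
except redis.exceptions.TimeoutError:
    print("redis read timed out")
except redis.exceptions.ConnectionError:
    print("redis unreachable")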
@ -543,6 +547,7 @@ class RedisCache(BaseCache):
        _redis_client: Redis = self.init_async_client()  # type: ignore
        start_time = time.time()
        _used_ttl = self.get_ttl(ttl=ttl)
        key = self.check_and_fix_namespace(key=key)
        try:
            result = await _redis_client.incrbyfloat(name=key, amount=value)
            if _used_ttl is not None:
@ -555,6 +560,7 @@ class RedisCache(BaseCache):
            ## LOGGING ##
            end_time = time.time()
            _duration = end_time - start_time

            asyncio.create_task(
                self.service_logger_obj.async_service_success_hook(
                    service=ServiceTypes.REDIS,
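Note: the hunks above add namespace fixing and TTL handling around INCRBYFLOAT. The counter-with-expiry pattern, sketched with redis.asyncio:

# illustrative sketch, not part of the diff
from typing import Optional
import redis.asyncio as redis

async def increment_with_ttl(client: redis.Redis, key: str, value: float, ttl: Optional[int]) -> float:
    # INCRBYFLOAT creates the key if missing; the TTL is applied separately
    result = await client.incrbyfloat(name=key, amount=value)
    if ttl is not None:
        await client.expire(key, ttl)
    return result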
@ -1,4 +1,4 @@
from typing import List
from typing import List, Literal

ROUTER_MAX_FALLBACKS = 5
DEFAULT_BATCH_SIZE = 512
@ -18,6 +18,7 @@ SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests
REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
#### Networking settings ####
request_timeout: float = 6000  # time in seconds
STREAM_SSE_DONE_STRING: str = "[DONE]"

LITELLM_CHAT_PROVIDERS = [
    "openai",
@ -320,6 +321,17 @@ baseten_models: List = [
    "31dxrj3",
]  # FALCON 7B  # WizardLM  # Mosaic ML

BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
    "cohere",
    "anthropic",
    "mistral",
    "amazon",
    "meta",
    "llama",
    "ai21",
    "nova",
    "deepseek_r1",
]
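Note: BEDROCK_INVOKE_PROVIDERS_LITERAL gives type checkers a closed set of invoke-route providers; the same values are recoverable at runtime via typing.get_args:

# illustrative sketch, not part of the diff
from typing import Literal, get_args

BedrockInvokeProvider = Literal[  # illustrative copy of the constant above
    "cohere", "anthropic", "mistral", "amazon", "meta", "llama", "ai21", "nova", "deepseek_r1"
]

def is_bedrock_invoke_provider(value: str) -> bool:
    # runtime guard matching the static Literal
    return value in get_args(BedrockInvokeProvider)

assert is_bedrock_invoke_provider("anthropic")
assert not is_bedrock_invoke_provider("gemini")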
open_ai_embedding_models: List = ["text-embedding-ada-002"]
cohere_embedding_models: List = [

@ -44,7 +44,12 @@ from litellm.llms.vertex_ai.cost_calculator import cost_router as google_cost_ro
from litellm.llms.vertex_ai.image_generation.cost_calculator import (
    cost_calculator as vertex_ai_image_cost_calculator,
)
from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.responses.utils import ResponseAPILoggingUtils
from litellm.types.llms.openai import (
    HttpxBinaryResponseContent,
    ResponseAPIUsage,
    ResponsesAPIResponse,
)
from litellm.types.rerank import RerankBilledUnits, RerankResponse
from litellm.types.utils import (
    CallTypesLiteral,
@ -239,6 +244,15 @@ def cost_per_token(  # noqa: PLR0915
            custom_llm_provider=custom_llm_provider,
            billed_units=rerank_billed_units,
        )
    elif (
        call_type == "aretrieve_batch"
        or call_type == "retrieve_batch"
        or call_type == CallTypes.aretrieve_batch
        or call_type == CallTypes.retrieve_batch
    ):
        return batch_cost_calculator(
            usage=usage_block, model=model, custom_llm_provider=custom_llm_provider
        )
    elif call_type == "atranscription" or call_type == "transcription":
        return openai_cost_per_second(
            model=model,
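Note: the four-way comparison exists because call_type may arrive as a plain string or as the CallTypes enum. A compact normalization sketch (CallTypes stubbed as a str-valued Enum, matching litellm.types.utils):

# illustrative sketch, not part of the diff
from enum import Enum
from typing import Union

class CallTypes(str, Enum):  # illustrative stub
    retrieve_batch = "retrieve_batch"
    aretrieve_batch = "aretrieve_batch"

def is_batch_retrieval(call_type: Union[str, CallTypes]) -> bool:
    normalized = call_type.value if isinstance(call_type, CallTypes) else call_type
    return normalized in ("retrieve_batch", "aretrieve_batch")

assert is_batch_retrieval(CallTypes.aretrieve_batch)
assert is_batch_retrieval("retrieve_batch")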
@ -399,9 +413,12 @@ def _select_model_name_for_cost_calc(
    if base_model is not None:
        return_model = base_model

    completion_response_model: Optional[str] = getattr(
        completion_response, "model", None
    )
    completion_response_model: Optional[str] = None
    if completion_response is not None:
        if isinstance(completion_response, BaseModel):
            completion_response_model = getattr(completion_response, "model", None)
        elif isinstance(completion_response, dict):
            completion_response_model = completion_response.get("model", None)
    hidden_params: Optional[dict] = getattr(completion_response, "_hidden_params", None)
    if completion_response_model is None and hidden_params is not None:
        if (
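Note: the replaced getattr() assumed an object response; the new branches also accept plain dicts and None. Equivalent helper:

# illustrative sketch, not part of the diff
from typing import Any, Optional
from pydantic import BaseModel

def extract_model_name(completion_response: Any) -> Optional[str]:
    # handles pydantic responses, plain dicts, and None uniformly
    if completion_response is None:
        return None
    if isinstance(completion_response, BaseModel):
        return getattr(completion_response, "model", None)
    if isinstance(completion_response, dict):
        return completion_response.get("model")
    return None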
@ -452,6 +469,13 @@ def _get_usage_object(
    return usage_obj


def _is_known_usage_objects(usage_obj):
    """Returns True if the usage obj is a known Usage type"""
    return isinstance(usage_obj, litellm.Usage) or isinstance(
        usage_obj, ResponseAPIUsage
    )


def _infer_call_type(
    call_type: Optional[CallTypesLiteral], completion_response: Any
) -> Optional[CallTypesLiteral]:
@ -561,9 +585,7 @@ def completion_cost(  # noqa: PLR0915
            base_model=base_model,
        )

        verbose_logger.debug(
            f"completion_response _select_model_name_for_cost_calc: {model}"
        )
        verbose_logger.info(f"selected model name for cost calculation: {model}")

        if completion_response is not None and (
            isinstance(completion_response, BaseModel)
@ -575,8 +597,8 @@ def completion_cost(  # noqa: PLR0915
            )
        else:
            usage_obj = getattr(completion_response, "usage", {})
            if isinstance(usage_obj, BaseModel) and not isinstance(
                usage_obj, litellm.Usage
            if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects(
                usage_obj=usage_obj
            ):
                setattr(
                    completion_response,
@ -589,6 +611,14 @@ def completion_cost(  # noqa: PLR0915
                _usage = usage_obj.model_dump()
            else:
                _usage = usage_obj

            if ResponseAPILoggingUtils._is_response_api_usage(_usage):
                _usage = (
                    ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
                        _usage
                    ).model_dump()
                )

            # get input/output tokens from completion_response
            prompt_tokens = _usage.get("prompt_tokens", 0)
            completion_tokens = _usage.get("completion_tokens", 0)
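Note: the Responses API reports input_tokens/output_tokens, while chat usage uses prompt_tokens/completion_tokens; the inserted block normalizes before the token math below. A sketch of that mapping (field names per the OpenAI Responses API; the real transform is ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage):

# illustrative sketch, not part of the diff
def responses_usage_to_chat_usage(usage: dict) -> dict:
    # assumed shape: {"input_tokens": ..., "output_tokens": ..., "total_tokens": ...}
    input_tokens = usage.get("input_tokens", 0)
    output_tokens = usage.get("output_tokens", 0)
    return {
        "prompt_tokens": input_tokens,
        "completion_tokens": output_tokens,
        "total_tokens": usage.get("total_tokens", input_tokens + output_tokens),
    }

print(responses_usage_to_chat_usage({"input_tokens": 12, "output_tokens": 34, "total_tokens": 46}))
# {'prompt_tokens': 12, 'completion_tokens': 34, 'total_tokens': 46}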
@ -778,6 +808,23 @@ def completion_cost(  # noqa: PLR0915
        raise e


def get_response_cost_from_hidden_params(
    hidden_params: Union[dict, BaseModel]
) -> Optional[float]:
    if isinstance(hidden_params, BaseModel):
        _hidden_params_dict = hidden_params.model_dump()
    else:
        _hidden_params_dict = hidden_params

    additional_headers = _hidden_params_dict.get("additional_headers", {})
    if additional_headers and "x-litellm-response-cost" in additional_headers:
        response_cost = additional_headers["x-litellm-response-cost"]
        if response_cost is None:
            return None
        return float(additional_headers["x-litellm-response-cost"])
    return None


def response_cost_calculator(
    response_object: Union[
        ModelResponse,
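Note: this lets a provider (or an upstream litellm proxy) short-circuit local cost math by returning an x-litellm-response-cost header. Usage sketch:

# illustrative sketch, not part of the diff
hidden_params = {"additional_headers": {"x-litellm-response-cost": "0.00042"}}
cost = get_response_cost_from_hidden_params(hidden_params)
assert cost == 0.00042  # the header value is coerced to float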
@ -787,6 +834,7 @@ def response_cost_calculator(
        TextCompletionResponse,
        HttpxBinaryResponseContent,
        RerankResponse,
        ResponsesAPIResponse,
    ],
    model: str,
    custom_llm_provider: Optional[str],
@ -813,7 +861,7 @@ def response_cost_calculator(
    base_model: Optional[str] = None,
    custom_pricing: Optional[bool] = None,
    prompt: str = "",
) -> Optional[float]:
) -> float:
    """
    Returns
    - float or None: cost of response
@ -825,6 +873,14 @@ def response_cost_calculator(
        else:
            if isinstance(response_object, BaseModel):
                response_object._hidden_params["optional_params"] = optional_params

                if hasattr(response_object, "_hidden_params"):
                    provider_response_cost = get_response_cost_from_hidden_params(
                        response_object._hidden_params
                    )
                    if provider_response_cost is not None:
                        return provider_response_cost

            response_cost = completion_cost(
                completion_response=response_object,
                model=model,
@ -957,3 +1013,54 @@ def default_image_cost_calculator(
    )

    return cost_info["input_cost_per_pixel"] * height * width * n


def batch_cost_calculator(
    usage: Usage,
    model: str,
    custom_llm_provider: Optional[str] = None,
) -> Tuple[float, float]:
    """
    Calculate the cost of a batch job
    """

    _, custom_llm_provider, _, _ = litellm.get_llm_provider(
        model=model, custom_llm_provider=custom_llm_provider
    )

    verbose_logger.info(
        "Calculating batch cost per token. model=%s, custom_llm_provider=%s",
        model,
        custom_llm_provider,
    )

    try:
        model_info: Optional[ModelInfo] = litellm.get_model_info(
            model=model, custom_llm_provider=custom_llm_provider
        )
    except Exception:
        model_info = None

    if not model_info:
        return 0.0, 0.0

    input_cost_per_token_batches = model_info.get("input_cost_per_token_batches")
    input_cost_per_token = model_info.get("input_cost_per_token")
    output_cost_per_token_batches = model_info.get("output_cost_per_token_batches")
    output_cost_per_token = model_info.get("output_cost_per_token")
    total_prompt_cost = 0.0
    total_completion_cost = 0.0
    if input_cost_per_token_batches:
        total_prompt_cost = usage.prompt_tokens * input_cost_per_token_batches
    elif input_cost_per_token:
        total_prompt_cost = (
            usage.prompt_tokens * (input_cost_per_token) / 2
        )  # batch cost is usually half of the regular token cost
    if output_cost_per_token_batches:
        total_completion_cost = usage.completion_tokens * output_cost_per_token_batches
    elif output_cost_per_token:
        total_completion_cost = (
            usage.completion_tokens * (output_cost_per_token) / 2
        )  # batch cost is usually half of the regular token cost

    return total_prompt_cost, total_completion_cost
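Note: when a model's pricing has no explicit *_batches rates, the fallback halves the regular per-token price (mirroring OpenAI's 50% batch discount). Worked example with assumed, illustrative prices:

# illustrative sketch, not part of the diff
input_cost_per_token = 2.5e-06   # assumed per-token rate, no batch rate published
output_cost_per_token = 1.0e-05  # assumed per-token rate
prompt_tokens, completion_tokens = 10_000, 2_000

total_prompt_cost = prompt_tokens * input_cost_per_token / 2            # 0.0125
total_completion_cost = completion_tokens * output_cost_per_token / 2   # 0.01
print(total_prompt_cost + total_completion_cost)  # 0.0225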