Merge branch 'main' into sync-logging

commit a3fc795927 by Yuki Watanabe, 2025-03-20 01:48:22 +09:00, committed by GitHub
No known key found for this signature in database. GPG key ID: B5690EEEBB952194
484 changed files with 27,932 additions and 7,615 deletions

View file

@@ -49,7 +49,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -71,7 +71,7 @@ jobs:
 pip install "Pillow==10.3.0"
 pip install "jsonschema==4.22.0"
 pip install "pytest-xdist==3.6.1"
-pip install "websockets==10.4"
+pip install "websockets==13.1.0"
 pip uninstall posthog -y
 - save_cache:
     paths:
@@ -168,7 +168,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -189,6 +189,7 @@ jobs:
 pip install "diskcache==5.6.1"
 pip install "Pillow==10.3.0"
 pip install "jsonschema==4.22.0"
+pip install "websockets==13.1.0"
 - save_cache:
     paths:
       - ./venv
@@ -267,7 +268,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -288,6 +289,7 @@ jobs:
 pip install "diskcache==5.6.1"
 pip install "Pillow==10.3.0"
 pip install "jsonschema==4.22.0"
+pip install "websockets==13.1.0"
 - save_cache:
     paths:
       - ./venv
@@ -511,7 +513,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.54.0
+pip install openai==1.66.1
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -678,6 +680,48 @@ jobs:
     paths:
       - llm_translation_coverage.xml
       - llm_translation_coverage
+  llm_responses_api_testing:
+    docker:
+      - image: cimg/python:3.11
+        auth:
+          username: ${DOCKERHUB_USERNAME}
+          password: ${DOCKERHUB_PASSWORD}
+    working_directory: ~/project
+    steps:
+      - checkout
+      - run:
+          name: Install Dependencies
+          command: |
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+            pip install "pytest==7.3.1"
+            pip install "pytest-retry==1.6.3"
+            pip install "pytest-cov==5.0.0"
+            pip install "pytest-asyncio==0.21.1"
+            pip install "respx==0.21.1"
+      # Run pytest and generate JUnit XML report
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest -vv tests/llm_responses_api_testing --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
+          no_output_timeout: 120m
+      - run:
+          name: Rename the coverage files
+          command: |
+            mv coverage.xml llm_responses_api_coverage.xml
+            mv .coverage llm_responses_api_coverage
+      # Store test results
+      - store_test_results:
+          path: test-results
+      - persist_to_workspace:
+          root: .
+          paths:
+            - llm_responses_api_coverage.xml
+            - llm_responses_api_coverage
   litellm_mapped_tests:
     docker:
       - image: cimg/python:3.11
@@ -1234,7 +1278,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 - run:
     name: Install Grype
     command: |
@@ -1309,13 +1353,13 @@ jobs:
 command: |
   pwd
   ls
-  python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
+  python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
 no_output_timeout: 120m
 # Store test results
 - store_test_results:
     path: test-results
-e2e_openai_misc_endpoints:
+e2e_openai_endpoints:
   machine:
     image: ubuntu-2204:2023.10.1
   resource_class: xlarge
@@ -1370,7 +1414,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 # Run pytest and generate JUnit XML report
 - run:
     name: Build Docker image
@@ -1432,7 +1476,7 @@ jobs:
 command: |
   pwd
   ls
-  python -m pytest -s -vv tests/openai_misc_endpoints_tests --junitxml=test-results/junit.xml --durations=5
+  python -m pytest -s -vv tests/openai_endpoints_tests --junitxml=test-results/junit.xml --durations=5
 no_output_timeout: 120m
 # Store test results
@@ -1492,7 +1536,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 - run:
     name: Build Docker image
     command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
@@ -1921,7 +1965,7 @@ jobs:
 pip install "pytest-asyncio==0.21.1"
 pip install "google-cloud-aiplatform==1.43.0"
 pip install aiohttp
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 pip install "assemblyai==0.37.0"
 python -m pip install --upgrade pip
 pip install "pydantic==2.7.1"
@@ -1935,12 +1979,12 @@ jobs:
 pip install prisma
 pip install fastapi
 pip install jsonschema
-pip install "httpx==0.24.1"
+pip install "httpx==0.27.0"
 pip install "anyio==3.7.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
 pip install "google-cloud-aiplatform==1.59.0"
-pip install "anthropic==0.21.3"
+pip install "anthropic==0.49.0"
 # Run pytest and generate JUnit XML report
 - run:
     name: Build Docker image
@@ -2068,7 +2112,7 @@ jobs:
 python -m venv venv
 . venv/bin/activate
 pip install coverage
-coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
+coverage combine llm_translation_coverage llm_responses_api_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
 coverage xml
 - codecov/upload:
     file: ./coverage.xml
@@ -2197,7 +2241,7 @@ jobs:
 pip install "pytest-retry==1.6.3"
 pip install "pytest-asyncio==0.21.1"
 pip install aiohttp
-pip install "openai==1.54.0 "
+pip install "openai==1.66.1"
 python -m pip install --upgrade pip
 pip install "pydantic==2.7.1"
 pip install "pytest==7.3.1"
@@ -2387,7 +2431,7 @@ workflows:
 only:
   - main
   - /litellm_.*/
-- e2e_openai_misc_endpoints:
+- e2e_openai_endpoints:
    filters:
      branches:
        only:
@@ -2429,6 +2473,12 @@ workflows:
 only:
   - main
   - /litellm_.*/
+- llm_responses_api_testing:
+    filters:
+      branches:
+        only:
+          - main
+          - /litellm_.*/
 - litellm_mapped_tests:
    filters:
      branches:
@@ -2468,6 +2518,7 @@ workflows:
 - upload-coverage:
     requires:
       - llm_translation_testing
+      - llm_responses_api_testing
      - litellm_mapped_tests
      - batches_testing
      - litellm_utils_testing
@@ -2522,10 +2573,11 @@ workflows:
 requires:
   - local_testing
   - build_and_test
-  - e2e_openai_misc_endpoints
+  - e2e_openai_endpoints
  - load_testing
  - test_bad_database_url
  - llm_translation_testing
+ - llm_responses_api_testing
  - litellm_mapped_tests
  - batches_testing
  - litellm_utils_testing

View file

@@ -1,5 +1,5 @@
 # used by CI/CD testing
-openai==1.54.0
+openai==1.66.1
 python-dotenv
 tiktoken
 importlib_metadata

View file

@@ -6,6 +6,16 @@
 <!-- e.g. "Fixes #000" -->
+## Pre-Submission checklist
+
+**Please complete all items before asking a LiteLLM maintainer to review your PR**
+
+- [ ] I have added testing in the `tests/litellm/` directory, **Adding at least 1 test is a hard requirement** - [see details](https://docs.litellm.ai/docs/extras/contributing_code)
+- [ ] I have added a screenshot of my new test passing locally
+- [ ] My PR passes all unit tests on [`make test-unit`](https://docs.litellm.ai/docs/extras/contributing_code)
+- [ ] My PR's scope is as isolated as possible, it only solves 1 specific problem
 ## Type
 <!-- Select the type of Pull Request -->
@@ -20,10 +30,4 @@
 ## Changes
 <!-- List of changes -->
-## [REQUIRED] Testing - Attach a screenshot of any new tests passing locally
-If UI changes, send a screenshot/GIF of working UI fixes
-<!-- Test procedure -->

View file

@@ -80,7 +80,6 @@ jobs:
 permissions:
   contents: read
   packages: write
-#
 steps:
   - name: Checkout repository
     uses: actions/checkout@v4
@@ -112,7 +111,11 @@ jobs:
 with:
   context: .
   push: true
-  tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }} # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
+  tags: |
+    ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+    ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }}
+    ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+    ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm:main-stable', env.REGISTRY) || '' }}
  labels: ${{ steps.meta.outputs.labels }}
  platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
@@ -151,8 +154,12 @@ jobs:
 context: .
 file: ./docker/Dockerfile.database
 push: true
-tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
-labels: ${{ steps.meta-database.outputs.labels }}
+tags: |
+  ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+  ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-database:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-database:main-stable', env.REGISTRY) || '' }}
+labels: ${{ steps.meta-database.outputs.labels }}
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
 build-and-push-image-non_root:
@@ -190,7 +197,11 @@ jobs:
 context: .
 file: ./docker/Dockerfile.non_root
 push: true
-tags: ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.release_type }}
+tags: |
+  ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+  ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.release_type }}
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-non_root:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-non_root:main-stable', env.REGISTRY) || '' }}
 labels: ${{ steps.meta-non_root.outputs.labels }}
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
@@ -229,7 +240,11 @@ jobs:
 context: .
 file: ./litellm-js/spend-logs/Dockerfile
 push: true
-tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
+tags: |
+  ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }},
+  ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-spend_logs:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }},
+  ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-spend_logs:main-stable', env.REGISTRY) || '' }}
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
 build-and-push-helm-chart:

.github/workflows/helm_unit_test.yml (new file, 27 lines)
View file

@ -0,0 +1,27 @@
name: Helm unit test
on:
pull_request:
push:
branches:
- main
jobs:
unit-test:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Set up Helm 3.11.1
uses: azure/setup-helm@v1
with:
version: '3.11.1'
- name: Install Helm Unit Test Plugin
run: |
helm plugin install https://github.com/helm-unittest/helm-unittest --version v0.4.4
- name: Run unit tests
run:
helm unittest -f 'tests/*.yaml' deploy/charts/litellm-helm

Makefile (new file, 32 lines)
View file

@ -0,0 +1,32 @@
# LiteLLM Makefile
# Simple Makefile for running tests and basic development tasks
.PHONY: help test test-unit test-integration lint format
# Default target
help:
@echo "Available commands:"
@echo " make test - Run all tests"
@echo " make test-unit - Run unit tests"
@echo " make test-integration - Run integration tests"
@echo " make test-unit-helm - Run helm unit tests"
install-dev:
poetry install --with dev
lint: install-dev
poetry run pip install types-requests types-setuptools types-redis types-PyYAML
cd litellm && poetry run mypy . --ignore-missing-imports
# Testing
test:
poetry run pytest tests/
test-unit:
poetry run pytest tests/litellm/
test-integration:
poetry run pytest tests/ -k "not litellm"
test-unit-helm:
helm unittest -f 'tests/*.yaml' deploy/charts/litellm-helm

View file

@@ -40,7 +40,7 @@ LiteLLM manages:
 [**Jump to LiteLLM Proxy (LLM Gateway) Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
 [**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs)
-🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published.
+🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published. [More information about the release cycle here](https://docs.litellm.ai/docs/proxy/release_cycle)
 Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).
@@ -340,71 +340,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
 ## Contributing
-To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change.
-Here's how to modify the repo locally:
-Step 1: Clone the repo
-```
-git clone https://github.com/BerriAI/litellm.git
-```
-Step 2: Install dependencies:
-```
-pip install -r requirements.txt
-```
-Step 3: Test your change:
-a. Add a pytest test within `tests/litellm/`
-This folder follows the same directory structure as `litellm/`.
-If a corresponding test file does not exist, create one.
-b. Run the test
-```
-cd tests/litellm # pwd: Documents/litellm/litellm/tests/litellm
-pytest /path/to/test_file.py
-```
-Step 4: Submit a PR with your changes! 🚀
-- push your fork to your GitHub repo
-- submit a PR from there
-### Building LiteLLM Docker Image
-Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
-Step 1: Clone the repo
-```
-git clone https://github.com/BerriAI/litellm.git
-```
-Step 2: Build the Docker Image
-Build using Dockerfile.non_root
-```
-docker build -f docker/Dockerfile.non_root -t litellm_test_image .
-```
-Step 3: Run the Docker Image
-Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
-```
-docker run \
-    -v $(pwd)/proxy_config.yaml:/app/config.yaml \
-    -e DATABASE_URL="postgresql://xxxxxxxx" \
-    -e LITELLM_MASTER_KEY="sk-1234" \
-    -p 4000:4000 \
-    litellm_test_image \
-    --config /app/config.yaml --detailed_debug
-```
+Interested in contributing? Contributions to LiteLLM Python SDK, Proxy Server, and contributing LLM integrations are both accepted and highly encouraged! [See our Contribution Guide for more details](https://docs.litellm.ai/docs/extras/contributing_code)
 # Enterprise
 For companies that need better security, user management and professional support

View file

@@ -18,7 +18,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.4.1
+version: 0.4.2
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

View file

@@ -22,6 +22,8 @@ If `db.useStackgresOperator` is used (not yet implemented):
 | Name | Description | Value |
 | ---- | ----------- | ----- |
 | `replicaCount` | The number of LiteLLM Proxy pods to be deployed | `1` |
+| `masterkeySecretName` | The name of the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use the generated secret name. | N/A |
+| `masterkeySecretKey` | The key within the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use `masterkey` as the key. | N/A |
 | `masterkey` | The Master API Key for LiteLLM. If not specified, a random key is generated. | N/A |
 | `environmentSecrets` | An optional array of Secret object names. The keys and values in these secrets will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` |
 | `environmentConfigMaps` | An optional array of ConfigMap object names. The keys and values in these configmaps will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` |

View file

@@ -78,8 +78,8 @@ spec:
 - name: PROXY_MASTER_KEY
   valueFrom:
     secretKeyRef:
-      name: {{ include "litellm.fullname" . }}-masterkey
-      key: masterkey
+      name: {{ .Values.masterkeySecretName | default (printf "%s-masterkey" (include "litellm.fullname" .)) }}
+      key: {{ .Values.masterkeySecretKey | default "masterkey" }}
 {{- if .Values.redis.enabled }}
 - name: REDIS_HOST
   value: {{ include "litellm.redis.serviceName" . }}

View file

@@ -1,3 +1,4 @@
+{{- if not .Values.masterkeySecretName }}
 {{ $masterkey := (.Values.masterkey | default (randAlphaNum 17)) }}
 apiVersion: v1
 kind: Secret
@@ -5,4 +6,5 @@ metadata:
 name: {{ include "litellm.fullname" . }}-masterkey
 data:
   masterkey: {{ $masterkey | b64enc }}
 type: Opaque
+{{- end }}

View file

@ -0,0 +1,82 @@
suite: test deployment
templates:
- deployment.yaml
- configmap-litellm.yaml
tests:
- it: should work
template: deployment.yaml
set:
image.tag: test
asserts:
- isKind:
of: Deployment
- matchRegex:
path: metadata.name
pattern: -litellm$
- equal:
path: spec.template.spec.containers[0].image
value: ghcr.io/berriai/litellm-database:test
- it: should work with tolerations
template: deployment.yaml
set:
tolerations:
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
asserts:
- equal:
path: spec.template.spec.tolerations[0].key
value: node-role.kubernetes.io/master
- equal:
path: spec.template.spec.tolerations[0].operator
value: Exists
- it: should work with affinity
template: deployment.yaml
set:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: topology.kubernetes.io/zone
operator: In
values:
- antarctica-east1
asserts:
- equal:
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key
value: topology.kubernetes.io/zone
- equal:
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator
value: In
- equal:
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0]
value: antarctica-east1
- it: should work without masterkeySecretName or masterkeySecretKey
template: deployment.yaml
set:
masterkeySecretName: ""
masterkeySecretKey: ""
asserts:
- contains:
path: spec.template.spec.containers[0].env
content:
name: PROXY_MASTER_KEY
valueFrom:
secretKeyRef:
name: RELEASE-NAME-litellm-masterkey
key: masterkey
- it: should work with masterkeySecretName and masterkeySecretKey
template: deployment.yaml
set:
masterkeySecretName: my-secret
masterkeySecretKey: my-key
asserts:
- contains:
path: spec.template.spec.containers[0].env
content:
name: PROXY_MASTER_KEY
valueFrom:
secretKeyRef:
name: my-secret
key: my-key

View file

@ -0,0 +1,18 @@
suite: test masterkey secret
templates:
- secret-masterkey.yaml
tests:
- it: should create a secret if masterkeySecretName is not set
template: secret-masterkey.yaml
set:
masterkeySecretName: ""
asserts:
- isKind:
of: Secret
- it: should not create a secret if masterkeySecretName is set
template: secret-masterkey.yaml
set:
masterkeySecretName: my-secret
asserts:
- hasDocuments:
count: 0

View file

@@ -75,6 +75,12 @@ ingress:
 # masterkey: changeit
+# if set, use this secret for the master key; otherwise, autogenerate a new one
+masterkeySecretName: ""
+# if set, use this secret key for the master key; otherwise, use the default key
+masterkeySecretKey: ""
 # The elements within proxy_config are rendered as config.yaml for the proxy
 # Examples: https://github.com/BerriAI/litellm/tree/main/litellm/proxy/example_config_yaml
 # Reference: https://docs.litellm.ai/docs/proxy/configs

View file

@@ -20,10 +20,18 @@ services:
 STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
 env_file:
   - .env # Load local .env file
+depends_on:
+  - db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
+healthcheck: # Defines the health check configuration for the container
+  test: [ "CMD", "curl", "-f", "http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
+  interval: 30s # Perform health check every 30 seconds
+  timeout: 10s # Health check command times out after 10 seconds
+  retries: 3 # Retry up to 3 times if health check fails
+  start_period: 40s # Wait 40 seconds after container start before beginning health checks
 db:
-  image: postgres
+  image: postgres:16
   restart: always
   environment:
     POSTGRES_DB: litellm
@@ -31,6 +39,8 @@ services:
   POSTGRES_PASSWORD: dbpassword9090
   ports:
     - "5432:5432"
+  volumes:
+    - postgres_data:/var/lib/postgresql/data # Persists Postgres data across container restarts
   healthcheck:
     test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
     interval: 1s
@@ -53,6 +63,8 @@ services:
 volumes:
   prometheus_data:
     driver: local
+  postgres_data:
+    name: litellm_postgres_data # Named volume for Postgres data persistence
 # ...rest of your docker-compose config if any

View file

@ -0,0 +1,92 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# /v1/messages [BETA]
LiteLLM provides a BETA endpoint in the spec of Anthropic's `/v1/messages` endpoint.
This currently just supports the Anthropic API.
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ✅ | |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ✅ | |
| Streaming | ✅ | |
| Fallbacks | ✅ | between anthropic models |
| Loadbalancing | ✅ | between anthropic models |
Planned improvements:
- Vertex AI Anthropic support
- Bedrock Anthropic support
## Usage
<Tabs>
<TabItem label="PROXY" value="proxy">
1. Setup config.yaml
```yaml
model_list:
- model_name: anthropic-claude
litellm_params:
model: claude-3-7-sonnet-latest
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
-H 'content-type: application/json' \
-H 'x-api-key: $LITELLM_API_KEY' \
-H 'anthropic-version: 2023-06-01' \
-d '{
"model": "anthropic-claude",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "List 5 important events in the XIX century"
}
]
}
],
"max_tokens": 4096
}'
```
</TabItem>
<TabItem value="sdk" label="SDK">
```python
from litellm.llms.anthropic.experimental_pass_through.messages.handler import anthropic_messages
import asyncio
import os
# set env
os.environ["ANTHROPIC_API_KEY"] = "my-api-key"
messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
# Call the handler
async def call():
    response = await anthropic_messages(
        messages=messages,
        api_key=os.environ["ANTHROPIC_API_KEY"],
        model="claude-3-haiku-20240307",
        max_tokens=100,
    )
    print(response)

asyncio.run(call())
```
</TabItem>
</Tabs>

View file

@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# Assistants API
+# /assistants
 Covers Threads, Messages, Assistants.

View file

@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# [BETA] Batches API
+# /batches
 Covers Batches, Files

View file

@@ -3,7 +3,13 @@ import TabItem from '@theme/TabItem';
 # Prompt Caching
-For OpenAI + Anthropic + Deepseek, LiteLLM follows the OpenAI prompt caching usage object format:
+Supported Providers:
+- OpenAI (`openai/`)
+- Anthropic API (`anthropic/`)
+- Bedrock (`bedrock/`, `bedrock/invoke/`, `bedrock/converse`) ([All models bedrock supports prompt caching on](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html))
+- Deepseek API (`deepseek/`)
+
+For the supported providers, LiteLLM follows the OpenAI prompt caching usage object format:
 ```bash
 "usage": {
@@ -499,4 +505,4 @@ curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \
 </TabItem>
 </Tabs>
 This checks our maintained [model info/cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)

View file

@ -189,4 +189,138 @@ Expected Response
```
</TabItem>
</Tabs>
## Explicitly specify image type
If you have images without a mime-type, or if litellm is incorrectly inferring the mime type of your image (e.g. calling `gs://` urls with vertex ai), you can set this explicitly via the `format` param.
```python
"image_url": {
"url": "gs://my-gs-image",
"format": "image/jpeg"
}
```
LiteLLM will use this for any API endpoint that supports specifying the mime-type (e.g. anthropic/bedrock/vertex ai).
For others (e.g. openai), it will be ignored.
<Tabs>
<TabItem label="SDK" value="sdk">
```python
import os
from litellm import completion
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
# openai call
response = completion(
model = "claude-3-7-sonnet-latest",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"format": "image/jpeg"
}
}
]
}
],
)
```
</TabItem>
<TabItem label="PROXY" value="proxy">
1. Define vision models on config.yaml
```yaml
model_list:
- model_name: gpt-4-vision-preview # OpenAI gpt-4-vision-preview
litellm_params:
model: openai/gpt-4-vision-preview
api_key: os.environ/OPENAI_API_KEY
- model_name: llava-hf # Custom OpenAI compatible model
litellm_params:
model: openai/llava-hf/llava-v1.6-vicuna-7b-hf
api_base: http://localhost:8000
api_key: fake-key
model_info:
supports_vision: True # set supports_vision to True so /model/info returns this attribute as True
```
2. Run proxy server
```bash
litellm --config config.yaml
```
3. Test it using the OpenAI Python SDK
```python
import os
from openai import OpenAI
client = OpenAI(
api_key="sk-1234", # your litellm proxy api key
)
response = client.chat.completions.create(
model = "gpt-4-vision-preview", # use model="llava-hf" to test your custom OpenAI endpoint
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"format": "image/jpeg"
}
}
]
}
],
)
```
</TabItem>
</Tabs>
## Spec
```
"image_url": str
OR
"image_url": {
"url": "url OR base64 encoded str",
"detail": "openai-only param",
"format": "specify mime-type of image"
}
```

View file

@@ -46,7 +46,7 @@ For security inquiries, please contact us at support@berri.ai
 |-------------------|---------------------------------------------------------------|
 | SOC 2 Type I | Certified. Report available upon request on Enterprise plan. |
 | SOC 2 Type II | In progress. Certificate available by April 15th, 2025 |
-| ISO27001 | In progress. Certificate available by February 7th, 2025 |
+| ISO 27001 | Certified. Report available upon request on Enterprise |
 ## Supported Data Regions for LiteLLM Cloud
@@ -137,7 +137,7 @@ Point of contact email address for general security-related questions: krrish@be
 Has the Vendor been audited / certified?
 - SOC 2 Type I. Certified. Report available upon request on Enterprise plan.
 - SOC 2 Type II. In progress. Certificate available by April 15th, 2025.
-- ISO27001. In progress. Certificate available by February 7th, 2025.
+- ISO 27001. Certified. Report available upon request on Enterprise plan.
 Has an information security management system been implemented?
 - Yes - [CodeQL](https://codeql.github.com/) and a comprehensive ISMS covering multiple security domains.

View file

@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# Embeddings
+# /embeddings
 ## Quick Start
 ```python

View file

@@ -34,9 +34,9 @@ You can use our cloud product where we setup a dedicated instance for you.
 Professional Support can assist with LLM/Provider integrations, deployment, upgrade management, and LLM Provider troubleshooting. We can't solve your own infrastructure-related issues but we will guide you to fix them.
-- 1 hour for Sev0 issues
-- 6 hours for Sev1
-- 24h for Sev2-Sev3 between 7am - 7pm PT (Monday through Saturday)
+- 1 hour for Sev0 issues - 100% production traffic is failing
+- 6 hours for Sev1 - <100% production traffic is failing
+- 24h for Sev2-Sev3 between 7am - 7pm PT (Monday through Saturday) - setup issues e.g. Redis working on our end, but not on your infrastructure.
 - 72h SLA for patching vulnerabilities in the software.
 **We can offer custom SLAs** based on your needs and the severity of the issue

View file

@ -0,0 +1,106 @@
# Contributing Code
## **Checklist before submitting a PR**
Here are the core requirements for any PR submitted to LiteLLM
- [ ] Add testing, **Adding at least 1 test is a hard requirement** - [see details](#2-adding-testing-to-your-pr)
- [ ] Ensure your PR passes the following tests:
- [ ] [Unit Tests](#3-running-unit-tests)
- [ ] [Formatting / Linting Tests](#35-running-linting-tests)
- [ ] Keep scope as isolated as possible. As a general rule, your changes should address 1 specific problem at a time
## Quick start
## 1. Setup your local dev environment
Here's how to modify the repo locally:
Step 1: Clone the repo
```shell
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Install dev dependencies:
```shell
poetry install --with dev --extras proxy
```
That's it, your local dev environment is ready!
## 2. Adding Testing to your PR
- Add your test to the [`tests/litellm/` directory](https://github.com/BerriAI/litellm/tree/main/tests/litellm)
- This directory 1:1 maps the `litellm/` directory, and can only contain mocked tests.
- Do not add real llm api calls to this directory.
### 2.1 File Naming Convention for `tests/litellm/`
The `tests/litellm/` directory follows the same directory structure as `litellm/`.
- `tests/litellm/proxy/test_caching_routes.py` maps to `litellm/proxy/caching_routes.py`
- `test_{filename}.py` maps to `litellm/{filename}.py`
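For illustration, here is a minimal sketch of what a test under this convention might look like. The file path is hypothetical and the call to `litellm.token_counter` is just an example of logic that runs locally without a real LLM API call; adapt both to the module you are actually changing.

```python
# hypothetical path: tests/litellm/test_token_counting.py
# (under this convention it would map to a litellm/ module of the same name)
import litellm


def test_token_counter_returns_positive_count():
    # runs entirely locally - no real LLM API call is made
    messages = [{"role": "user", "content": "Hello, world"}]
    count = litellm.token_counter(model="gpt-3.5-turbo", messages=messages)
    assert count > 0
```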
## 3. Running Unit Tests
run the following command on the root of the litellm directory
```shell
make test-unit
```
## 3.5 Running Linting Tests
run the following command on the root of the litellm directory
```shell
make lint
```
LiteLLM uses mypy for linting. On ci/cd we also run `black` for formatting.
## 4. Submit a PR with your changes!
- push your fork to your GitHub repo
- submit a PR from there
## Advanced
### Building LiteLLM Docker Image
Some people might want to build the LiteLLM docker image themselves. Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
Step 1: Clone the repo
```shell
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Build the Docker Image
Build using Dockerfile.non_root
```shell
docker build -f docker/Dockerfile.non_root -t litellm_test_image .
```
Step 3: Run the Docker Image
Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
```shell
docker run \
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
-e DATABASE_URL="postgresql://xxxxxxxx" \
-e LITELLM_MASTER_KEY="sk-1234" \
-p 4000:4000 \
litellm_test_image \
--config /app/config.yaml --detailed_debug
```

View file

@@ -2,7 +2,7 @@
 import TabItem from '@theme/TabItem';
 import Tabs from '@theme/Tabs';
-# Files API
+# /files
 Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.

View file

@@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# [Beta] Fine-tuning API
+# /fine_tuning
 :::info
View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Moderation # /moderations
### Usage ### Usage

View file

@@ -78,6 +78,9 @@ Following are the allowed fields in metadata, their types, and their descriptions:
 * `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
 * `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
 * `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
+* `tags: Optional[list]` - This is a list of tags. This is useful for segmenting inference calls by tags.
+* `user_feedback: Optional[str]` - The end user's feedback.
+* `model_options: Optional[dict]` - This is a dictionary of model options. This is useful for getting insights into how model behavior affects your end users.
 * `custom_attributes: Optional[dict]` - This is a dictionary of custom attributes. This is useful for additional information about the inference.
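As a rough sketch of how these fields could be passed from the SDK with the Athina callback enabled (the model name and all field values below are placeholders, not part of this diff):

```python
import os

import litellm
from litellm import completion

os.environ["ATHINA_API_KEY"] = "your-athina-api-key"
os.environ["OPENAI_API_KEY"] = "your-openai-api-key"

litellm.success_callback = ["athina"]  # log successful calls to Athina

response = completion(
    model="gpt-4o-mini",  # placeholder model
    messages=[{"role": "user", "content": "What is machine learning?"}],
    metadata={
        "user_query": "What is machine learning?",
        "tags": ["docs-example", "faq"],          # segment calls by tags
        "user_feedback": "thumbs_up",             # the end user's feedback
        "model_options": {"temperature": 0.2},    # model options used for this call
        "custom_attributes": {"experiment": "A"},
    },
)
print(response.choices[0].message.content)
```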
## Using a self hosted deployment of Athina

View file

@ -0,0 +1,5 @@
PDL - A YAML-based approach to prompt programming
Github: https://github.com/IBM/prompt-declaration-language
PDL is a declarative approach to prompt programming, helping users to accumulate messages implicitly, with support for model chaining and tool use.

View file

@ -0,0 +1,9 @@
# pgai
[pgai](https://github.com/timescale/pgai) is a suite of tools to develop RAG, semantic search, and other AI applications more easily with PostgreSQL.
If you don't know what pgai is yet check out the [README](https://github.com/timescale/pgai)!
If you're already familiar with pgai, you can find litellm specific docs here:
- Litellm for [model calling](https://github.com/timescale/pgai/blob/main/docs/model_calling/litellm.md) in pgai
- Use the [litellm provider](https://github.com/timescale/pgai/blob/main/docs/vectorizer/api-reference.md#aiembedding_litellm) to automatically create embeddings for your data via the pgai vectorizer.

File diff suppressed because it is too large.

View file

@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
 # Infinity
 | Property | Details |
@@ -12,6 +15,9 @@
 ```python
 from litellm import rerank
+import os
+
+os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
 response = rerank(
     model="infinity/rerank",
@@ -65,3 +71,114 @@ curl http://0.0.0.0:4000/rerank \
 ```
## Supported Cohere Rerank API Params
| Param | Type | Description |
|-------|-------|-------|
| `query` | `str` | The query to rerank the documents against |
| `documents` | `list[str]` | The documents to rerank |
| `top_n` | `int` | The number of documents to return |
| `return_documents` | `bool` | Whether to return the documents in the response |
### Usage - Return Documents
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = rerank(
model="infinity/rerank",
query="What is the capital of France?",
documents=["Paris", "London", "Berlin", "Madrid"],
return_documents=True,
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/rerank \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-rerank",
"query": "What is the capital of France?",
"documents": [
"Paris",
"London",
"Berlin",
"Madrid"
],
"return_documents": true
}'
```
</TabItem>
</Tabs>
## Pass Provider-specific Params
Any unmapped params will be passed to the provider as-is.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import rerank
import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
response = rerank(
model="infinity/rerank",
query="What is the capital of France?",
documents=["Paris", "London", "Berlin", "Madrid"],
raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: custom-infinity-rerank
litellm_params:
model: infinity/rerank
api_base: https://localhost:8080
raw_scores: True # 👈 EITHER SET PROVIDER-SPECIFIC PARAMS HERE OR IN REQUEST BODY
```
2. Start litellm
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl http://0.0.0.0:4000/rerank \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-rerank",
"query": "What is the capital of the United States?",
"documents": [
"Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. is the capital of the United States.",
"Capital punishment has existed in the United States since before it was a country."
],
"raw_scores": True # 👈 PROVIDER-SPECIFIC PARAM
}'
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,90 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Snowflake
| Property | Details |
|-------|-------|
| Description | The Snowflake Cortex LLM REST API lets you access the COMPLETE function via HTTP POST requests|
| Provider Route on LiteLLM | `snowflake/` |
| Link to Provider Doc | [Snowflake ↗](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-llm-rest-api) |
| Base URL | [https://{account-id}.snowflakecomputing.com/api/v2/cortex/inference:complete/](https://{account-id}.snowflakecomputing.com/api/v2/cortex/inference:complete) |
| Supported OpenAI Endpoints | `/chat/completions`, `/completions` |
Currently, Snowflake's REST API does not have an endpoint for `snowflake-arctic-embed` embedding models. If you want to use these embedding models with Litellm, you can call them through our Hugging Face provider.
Find the Arctic Embed models [here](https://huggingface.co/collections/Snowflake/arctic-embed-661fd57d50fab5fc314e4c18) on Hugging Face.
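As a rough sketch of that workaround (not part of this diff), an Arctic Embed model might be called through the Hugging Face provider route like below; the exact model id, the `huggingface/` routing prefix behavior, and the credential env var are assumptions to verify against the Hugging Face provider docs.

```python
import os

from litellm import embedding

os.environ["HUGGINGFACE_API_KEY"] = "hf_..."  # assumption: Hugging Face inference credentials

# assumption: the Arctic Embed model is served via Hugging Face inference and
# routed with the `huggingface/` prefix; swap in the variant you actually deploy
response = embedding(
    model="huggingface/Snowflake/snowflake-arctic-embed-m",
    input=["Snowflake Cortex does not expose an embeddings endpoint yet"],
)
print(response)
```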
## Supported OpenAI Parameters
```
"temperature",
"max_tokens",
"top_p",
"response_format"
```
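To make the list above concrete, here is a minimal sketch of passing those parameters through the SDK; the JWT and account identifier values are placeholders.

```python
import os

from litellm import completion

os.environ["SNOWFLAKE_JWT"] = "YOUR JWT"
os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER"

# temperature, max_tokens and top_p are passed through as standard OpenAI params
response = completion(
    model="snowflake/mistral-7b",
    messages=[{"role": "user", "content": "Summarize what Snowflake Cortex is in one sentence."}],
    temperature=0.2,
    max_tokens=128,
    top_p=0.9,
)
print(response.choices[0].message.content)
```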
## API KEYS
Snowflake does not have API keys. Instead, you access the Snowflake API with your JWT token and account identifier.
```python
import os
os.environ["SNOWFLAKE_JWT"] = "YOUR JWT"
os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER"
```
## Usage
```python
import os
from litellm import completion
## set ENV variables
os.environ["SNOWFLAKE_JWT"] = "YOUR JWT"
os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER"
# Snowflake call
response = completion(
model="snowflake/mistral-7b",
messages = [{ "content": "Hello, how are you?","role": "user"}]
)
```
## Usage with LiteLLM Proxy
#### 1. Required env variables
```bash
export SNOWFLAKE_JWT=""
export SNOWFLAKE_ACCOUNT_ID=""
```
#### 2. Start the proxy
```yaml
model_list:
- model_name: mistral-7b
litellm_params:
model: snowflake/mistral-7b
api_key: YOUR_API_KEY
api_base: https://YOUR-ACCOUNT-ID.snowflakecomputing.com/api/v2/cortex/inference:complete
```
```bash
litellm --config /path/to/config.yaml
```
#### 3. Test it
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "snowflake/mistral-7b",
"messages": [
{
"role": "user",
"content": "Hello, how are you?"
}
]
}
'
```

View file

@@ -404,14 +404,16 @@ curl http://localhost:4000/v1/chat/completions \
 If this was your initial VertexAI Grounding code,
 ```python
 import vertexai
+from vertexai.generative_models import GenerativeModel, GenerationConfig, Tool, grounding
 vertexai.init(project=project_id, location="us-central1")
 model = GenerativeModel("gemini-1.5-flash-001")
 # Use Google Search for grounding
-tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval(disable_attributon=False))
+tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval())
 prompt = "When is the next total solar eclipse in US?"
 response = model.generate_content(
@@ -428,7 +430,7 @@ print(response)
 then, this is what it looks like now
 ```python
 from litellm import completion
+# !gcloud auth application-default login - run this to add vertex credentials to your env
@@ -852,6 +854,7 @@ litellm.vertex_location = "us-central1 # Your Location
 | claude-3-5-sonnet@20240620 | `completion('vertex_ai/claude-3-5-sonnet@20240620', messages)` |
 | claude-3-sonnet@20240229 | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
 | claude-3-haiku@20240307 | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |
+| claude-3-7-sonnet@20250219 | `completion('vertex_ai/claude-3-7-sonnet@20250219', messages)` |
 ### Usage
@@ -926,6 +929,119 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 </Tabs>
### Usage - `thinking` / `reasoning_content`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
resp = completion(
model="vertex_ai/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
- model_name: claude-3-7-sonnet-20250219
litellm_params:
model: vertex_ai/claude-3-7-sonnet-20250219
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "claude-3-7-sonnet-20250219",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='claude-3-7-sonnet-20250219',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
provider_specific_fields={
'citations': None,
'thinking_blocks': [
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6...'
}
]
}
),
thinking_blocks=[
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6AGB...'
}
],
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
 ## Llama 3 API
 | Model Name | Function Call |
@@ -1572,6 +1688,14 @@ assert isinstance(
 Pass any file supported by Vertex AI, through LiteLLM.
+LiteLLM supports the following image types passed in url:
+```
+Images with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
+Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
+Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4
+Base64 Encoded Local Images
+```
 <Tabs>
 <TabItem value="sdk" label="SDK">

View file

@ -157,6 +157,98 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
</TabItem>
</Tabs>
## Send Video URL to VLLM
Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)
There are two ways to send a video url to VLLM:
1. Pass the video url directly
```
{"type": "video_url", "video_url": {"url": video_url}},
```
2. Pass the video data as base64
```
{"type": "video_url", "video_url": {"url": f"data:video/mp4;base64,{video_data_base64}"}}
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
response = completion(
model="hosted_vllm/qwen", # pass the vllm model name
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Summarize the following video"
},
{
"type": "video_url",
"video_url": {
"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
}
}
]
}
],
api_base="https://hosted-vllm-api.co")
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: my-model
litellm_params:
model: hosted_vllm/qwen # add hosted_vllm/ prefix to route as OpenAI provider
api_base: https://hosted-vllm-api.co # add api base for OpenAI compatible provider
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl -X POST http://0.0.0.0:4000/chat/completions \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "my-model",
"messages": [
{"role": "user", "content":
[
{"type": "text", "text": "Summarize the following video"},
{"type": "video_url", "video_url": {"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
]
}
]
}'
```
</TabItem>
</Tabs>
## (Deprecated) for `vllm pip package` ## (Deprecated) for `vllm pip package`
### Using - `litellm.completion` ### Using - `litellm.completion`

View file

@ -10,17 +10,13 @@ Role-based access control (RBAC) is based on Organizations, Teams and Internal U
## Roles ## Roles
**Admin Roles** | Role Type | Role Name | Permissions |
- `proxy_admin`: admin over the platform |-----------|-----------|-------------|
- `proxy_admin_viewer`: can login, view all keys, view all spend. **Cannot** create keys/delete keys/add new users | **Admin** | `proxy_admin` | Admin over the platform |
| | `proxy_admin_viewer` | Can login, view all keys, view all spend. **Cannot** create keys/delete keys/add new users |
**Organization Roles** | **Organization** | `org_admin` | Admin over the organization. Can create teams and users within their organization |
- `org_admin`: admin over the organization. Can create teams and users within their organization | **Internal User** | `internal_user` | Can login, view/create/delete their own keys, view their spend. **Cannot** add new users |
| | `internal_user_viewer` | Can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users |
**Internal User Roles**
- `internal_user`: can login, view/create/delete their own keys, view their spend. **Cannot** add new users.
- `internal_user_viewer`: can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users.
## Onboarding Organizations ## Onboarding Organizations

View file

@ -36,7 +36,7 @@ import TabItem from '@theme/TabItem';
- Virtual Key Rate Limit - Virtual Key Rate Limit
- User Rate Limit - User Rate Limit
- Team Limit - Team Limit
- The `_PROXY_track_cost_callback` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172) - The `_ProxyDBLogger` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172)
## Frequently Asked Questions ## Frequently Asked Questions

View file

@ -499,6 +499,7 @@ router_settings:
| SMTP_USERNAME | Username for SMTP authentication (do not set if SMTP does not require auth) | SMTP_USERNAME | Username for SMTP authentication (do not set if SMTP does not require auth)
| SPEND_LOGS_URL | URL for retrieving spend logs | SPEND_LOGS_URL | URL for retrieving spend logs
| SSL_CERTIFICATE | Path to the SSL certificate file | SSL_CERTIFICATE | Path to the SSL certificate file
| SSL_SECURITY_LEVEL | [BETA] Security level for SSL/TLS connections. E.g. `DEFAULT@SECLEVEL=1`
| SSL_VERIFY | Flag to enable or disable SSL certificate verification | SSL_VERIFY | Flag to enable or disable SSL certificate verification
| SUPABASE_KEY | API key for Supabase service | SUPABASE_KEY | API key for Supabase service
| SUPABASE_URL | Base URL for Supabase instance | SUPABASE_URL | Base URL for Supabase instance

View file

@ -448,6 +448,34 @@ model_list:
s/o to [@David Manouchehri](https://www.linkedin.com/in/davidmanouchehri/) for helping with this. s/o to [@David Manouchehri](https://www.linkedin.com/in/davidmanouchehri/) for helping with this.
### Centralized Credential Management
Define credentials once and reuse them across multiple models. This helps with:
- Secret rotation
- Reducing config duplication
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: azure/gpt-4o
litellm_credential_name: default_azure_credential # Reference credential below
credential_list:
- credential_name: default_azure_credential
credential_values:
api_key: os.environ/AZURE_API_KEY # Load from environment
api_base: os.environ/AZURE_API_BASE
api_version: "2023-05-15"
credential_info:
description: "Production credentials for EU region"
```
#### Key Parameters
- `credential_name`: Unique identifier for the credential set
- `credential_values`: Key-value pairs of credentials/secrets (supports `os.environ/` syntax)
- `credential_info`: Optional key-value pairs describing the credential (e.g. a description). No specific keys are required, but the dictionary must exist.
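Because credentials are referenced by name, multiple deployments can share one credential set. A sketch (the second model name is illustrative):
```yaml
model_list:
  - model_name: gpt-4o
    litellm_params:
      model: azure/gpt-4o
      litellm_credential_name: default_azure_credential
  - model_name: gpt-4o-mini   # illustrative second deployment re-using the same credential
    litellm_params:
      model: azure/gpt-4o-mini
      litellm_credential_name: default_azure_credential

credential_list:
  - credential_name: default_azure_credential
    credential_values:
      api_key: os.environ/AZURE_API_KEY
      api_base: os.environ/AZURE_API_BASE
      api_version: "2023-05-15"
    credential_info:
      description: "Production credentials for EU region"
```
Rotating the secret now only requires updating `AZURE_API_KEY` in one place.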
### Load API Keys from Secret Managers (Azure Vault, etc) ### Load API Keys from Secret Managers (Azure Vault, etc)
[**Using Secret Managers with LiteLLM Proxy**](../secret) [**Using Secret Managers with LiteLLM Proxy**](../secret)
@ -641,4 +669,4 @@ docker run --name litellm-proxy \
ghcr.io/berriai/litellm-database:main-latest ghcr.io/berriai/litellm-database:main-latest
``` ```
</TabItem> </TabItem>
</Tabs> </Tabs>

View file

@ -46,18 +46,17 @@ You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/ma
| Table Name | Description | Row Insert Frequency | | Table Name | Description | Row Insert Frequency |
|------------|-------------|---------------------| |------------|-------------|---------------------|
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request** | | LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request - Success or Failure** |
| LiteLLM_ErrorLogs | Captures failed requests and errors. Stores exception details and request information. Helps with debugging and monitoring. | **Medium - on errors only** |
| LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** | | LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** |
## Disable `LiteLLM_SpendLogs` & `LiteLLM_ErrorLogs` ## Disable `LiteLLM_SpendLogs`
You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` on the `general_settings` section of your proxy_config.yaml file. You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` on the `general_settings` section of your proxy_config.yaml file.
```yaml ```yaml
general_settings: general_settings:
disable_spend_logs: True # Disable writing spend logs to DB disable_spend_logs: True # Disable writing spend logs to DB
disable_error_logs: True # Disable writing error logs to DB disable_error_logs: True # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
``` ```
### What is the impact of disabling these logs? ### What is the impact of disabling these logs?

View file

@ -37,7 +37,7 @@ guardrails:
- guardrail_name: aim-protected-app - guardrail_name: aim-protected-app
litellm_params: litellm_params:
guardrail: aim guardrail: aim
mode: pre_call # 'during_call' is also available mode: [pre_call, post_call] # "During_call" is also available
api_key: os.environ/AIM_API_KEY api_key: os.environ/AIM_API_KEY
api_base: os.environ/AIM_API_BASE # Optional, use only when using a self-hosted Aim Outpost api_base: os.environ/AIM_API_BASE # Optional, use only when using a self-hosted Aim Outpost
``` ```

View file

@ -78,6 +78,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
| `api_base` | `Optional[str]` | Optional API base URL | | `api_base` | `Optional[str]` | Optional API base URL |
| `response_cost` | `Optional[str]` | Optional response cost | | `response_cost` | `Optional[str]` | Optional response cost |
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers | | `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
## StandardLoggingModelInformation ## StandardLoggingModelInformation

View file

@ -0,0 +1,53 @@
# Rotating Master Key
Here are our recommended steps for rotating your master key.
**1. Backup your DB**
If anything goes wrong during the encryption/decryption process, a backup lets you revert to the current state without issues.
**2. Call `/key/regenerate` with the new master key**
```bash
curl -L -X POST 'http://localhost:4000/key/regenerate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"key": "sk-1234",
"new_master_key": "sk-PIp1h0RekR"
}'
```
This will re-encrypt the credentials for any models in your Proxy_ModelTable with the new master key.
Until you complete Step 3, expect to see decryption errors in the logs, since the old master key can no longer decrypt the re-encrypted values.
```bash
raise Exception("Unable to decrypt value={}".format(v))
Exception: Unable to decrypt value=<new-encrypted-value>
```
**3. Update LITELLM_MASTER_KEY**
In your environment variables, update the value of `LITELLM_MASTER_KEY` to the `new_master_key` from Step 2.
This ensures the proxy uses the new key when decrypting values from the DB.
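For example, if you pass the key via environment variables, this is just (using the `new_master_key` from the Step 2 example):
```bash
export LITELLM_MASTER_KEY="sk-PIp1h0RekR"
```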
**4. Test it**
Make a test request to a model stored on the proxy, using a LiteLLM key (the new master key or a virtual key), and confirm it works.
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4o-mini", # 👈 REPLACE with 'public model name' for any db-model
"messages": [
{
"content": "Hey, how's it going",
"role": "user"
}
  ]
}'
```

View file

@ -107,9 +107,9 @@ general_settings:
By default, LiteLLM writes several types of logs to the database: By default, LiteLLM writes several types of logs to the database:
- Every LLM API request to the `LiteLLM_SpendLogs` table - Every LLM API request to the `LiteLLM_SpendLogs` table
- LLM Exceptions to the `LiteLLM_LogsErrors` table - LLM Exceptions to the `LiteLLM_SpendLogs` table
If you're not viewing these logs on the LiteLLM UI (most users use Prometheus for monitoring), you can disable them by setting the following flags to `True`: If you're not viewing these logs on the LiteLLM UI, you can disable them by setting the following flags to `True`:
```yaml ```yaml
general_settings: general_settings:

View file

@ -0,0 +1,12 @@
# Release Cycle
LiteLLM Proxy has the following release cycle:
- `v1.x.x-nightly`: Releases that pass CI/CD.
- `v1.x.x.rc`: Releases that pass CI/CD + [manual review](https://github.com/BerriAI/litellm/discussions/8495#discussioncomment-12180711).
- `v1.x.x` OR `v1.x.x-stable`: Releases that pass CI/CD + manual review + 3 days of production testing.
In production, we recommend using the latest `v1.x.x` release.
Follow our release notes [here](https://github.com/BerriAI/litellm/releases).
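For example, a production deployment would typically pin a specific stable image tag (the tag shown is illustrative):
```bash
docker pull ghcr.io/berriai/litellm:main-v1.63.11-stable
```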

View file

@ -102,7 +102,19 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
</TabItem> </TabItem>
</Tabs> </Tabs>
## Advanced - Set Accepted JWT Scope Names ## Advanced
### Multiple OIDC providers
Use this if you want LiteLLM to validate your JWT against multiple OIDC providers (e.g. Google Cloud, GitHub Auth)
Set `JWT_PUBLIC_KEY_URL` in your environment to a comma-separated list of URLs for your OIDC providers.
```bash
export JWT_PUBLIC_KEY_URL="https://demo.duendesoftware.com/.well-known/openid-configuration/jwks,https://accounts.google.com/.well-known/openid-configuration/jwks"
```
### Set Accepted JWT Scope Names
Change the string in JWT 'scopes', that litellm evaluates to see if a user has admin access. Change the string in JWT 'scopes', that litellm evaluates to see if a user has admin access.
@ -114,7 +126,7 @@ general_settings:
admin_jwt_scope: "litellm-proxy-admin" admin_jwt_scope: "litellm-proxy-admin"
``` ```
## Tracking End-Users / Internal Users / Team / Org ### Tracking End-Users / Internal Users / Team / Org
Set the field in the jwt token, which corresponds to a litellm user / team / org. Set the field in the jwt token, which corresponds to a litellm user / team / org.
@ -156,7 +168,7 @@ scope: ["litellm-proxy-admin",...]
scope: "litellm-proxy-admin ..." scope: "litellm-proxy-admin ..."
``` ```
## Control model access with Teams ### Control model access with Teams
1. Specify the JWT field that contains the team ids, that the user belongs to. 1. Specify the JWT field that contains the team ids, that the user belongs to.
@ -207,11 +219,11 @@ OIDC Auth for API: [**See Walkthrough**](https://www.loom.com/share/00fe2deab59a
- If all checks pass, allow the request - If all checks pass, allow the request
## Advanced - Custom Validate ### Custom JWT Validate
Validate a JWT Token using custom logic, if you need an extra way to verify if tokens are valid for LiteLLM Proxy. Validate a JWT Token using custom logic, if you need an extra way to verify if tokens are valid for LiteLLM Proxy.
### 1. Setup custom validate function #### 1. Setup custom validate function
```python ```python
from typing import Literal from typing import Literal
@ -230,7 +242,7 @@ def my_custom_validate(token: str) -> Literal[True]:
return True return True
``` ```
### 2. Setup config.yaml #### 2. Setup config.yaml
```yaml ```yaml
general_settings: general_settings:
@ -243,7 +255,7 @@ general_settings:
custom_validate: custom_validate.my_custom_validate # 👈 custom validate function custom_validate: custom_validate.my_custom_validate # 👈 custom validate function
``` ```
### 3. Test the flow #### 3. Test the flow
**Expected JWT** **Expected JWT**
@ -265,7 +277,7 @@ general_settings:
## Advanced - Allowed Routes ### Allowed Routes
Configure which routes a JWT can access via the config. Configure which routes a JWT can access via the config.
@ -297,7 +309,7 @@ general_settings:
team_allowed_routes: ["/v1/chat/completions"] # 👈 Set accepted routes team_allowed_routes: ["/v1/chat/completions"] # 👈 Set accepted routes
``` ```
## Advanced - Caching Public Keys ### Caching Public Keys
Control how long public keys are cached for (in seconds). Control how long public keys are cached for (in seconds).
@ -311,7 +323,7 @@ general_settings:
public_key_ttl: 600 # 👈 KEY CHANGE public_key_ttl: 600 # 👈 KEY CHANGE
``` ```
## Advanced - Custom JWT Field ### Custom JWT Field
Set a custom field in which the team_id exists. By default, the 'client_id' field is checked. Set a custom field in which the team_id exists. By default, the 'client_id' field is checked.
@ -323,14 +335,7 @@ general_settings:
team_id_jwt_field: "client_id" # 👈 KEY CHANGE team_id_jwt_field: "client_id" # 👈 KEY CHANGE
``` ```
## All Params ### Block Teams
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)
## Advanced - Block Teams
To block all requests for a certain team id, use `/team/block` To block all requests for a certain team id, use `/team/block`
@ -357,7 +362,7 @@ curl --location 'http://0.0.0.0:4000/team/unblock' \
``` ```
## Advanced - Upsert Users + Allowed Email Domains ### Upsert Users + Allowed Email Domains
Allow users who belong to a specific email domain, automatic access to the proxy. Allow users who belong to a specific email domain, automatic access to the proxy.
@ -494,4 +499,10 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
} }
] ]
}' }'
``` ```
## All JWT Params
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)

View file

@ -0,0 +1,55 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Adding LLM Credentials
You can add LLM provider credentials on the UI. Once you add credentials, you can re-use them when adding new models.
## Add a credential + model
### 1. Navigate to LLM Credentials page
Go to Models -> LLM Credentials -> Add Credential
<Image img={require('../../img/ui_cred_add.png')} />
### 2. Add credentials
Select your LLM provider, enter your API Key and click "Add Credential"
**Note: Credential fields depend on the provider. If you select Vertex AI, you will see `Vertex Project`, `Vertex Location` and `Vertex Credentials` fields.**
<Image img={require('../../img/ui_add_cred_2.png')} />
### 3. Use credentials when adding a model
Go to Add Model -> Existing Credentials -> Select your credential in the dropdown
<Image img={require('../../img/ui_cred_3.png')} />
## Create a Credential from an existing model
Use this if you have already created a model and want to store the model credentials for future use
### 1. Select model to create a credential from
Go to Models -> Select your model -> Credential -> Create Credential
<Image img={require('../../img/ui_cred_4.png')} />
### 2. Use new credential when adding a model
Go to Add Model -> Existing Credentials -> Select your credential in the dropdown
<Image img={require('../../img/use_model_cred.png')} />
## Frequently Asked Questions
How are credentials stored?
Credentials in the DB are encrypted/decrypted using `LITELLM_SALT_KEY`, if set. If not, then they are encrypted using `LITELLM_MASTER_KEY`. These keys should be kept secret and not shared with others.
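For example, a deployment might set both keys via environment variables (the values below are placeholders):
```bash
# LITELLM_SALT_KEY encrypts/decrypts provider credentials stored in the DB.
# If it is not set, LITELLM_MASTER_KEY is used for encryption instead.
export LITELLM_SALT_KEY="sk-salt-<random-string>"
export LITELLM_MASTER_KEY="sk-1234"
```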

View file

@ -0,0 +1,55 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# UI Logs Page
View Spend, Token Usage, Key, Team Name for Each Request to LiteLLM
<Image img={require('../../img/ui_request_logs.png')}/>
## Overview
| Log Type | Tracked by Default |
|----------|-------------------|
| Success Logs | ✅ Yes |
| Error Logs | ✅ Yes |
| Request/Response Content Stored | ❌ No by Default, **opt in with `store_prompts_in_spend_logs`** |
**By default LiteLLM does not track the request and response content.**
## Tracking - Request / Response Content in Logs Page
If you want to view request and response content on LiteLLM Logs, you need to opt in with this setting
```yaml
general_settings:
store_prompts_in_spend_logs: true
```
<Image img={require('../../img/ui_request_logs_content.png')}/>
## Stop storing Error Logs in DB
If you do not want to store error logs in DB, you can opt out with this setting
```yaml
general_settings:
disable_error_logs: True # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
```
## Stop storing Spend Logs in DB
If you do not want to store spend logs in DB, you can opt out with this setting
```yaml
general_settings:
disable_spend_logs: True # Disable writing spend logs to DB
```

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Realtime Endpoints # /realtime
Use this to loadbalance across Azure + OpenAI. Use this to loadbalance across Azure + OpenAI.

View file

@ -3,11 +3,20 @@ import TabItem from '@theme/TabItem';
# 'Thinking' / 'Reasoning Content' # 'Thinking' / 'Reasoning Content'
:::info
Requires LiteLLM v1.63.0+
:::
Supported Providers: Supported Providers:
- Deepseek (`deepseek/`) - Deepseek (`deepseek/`)
- Anthropic API (`anthropic/`) - Anthropic API (`anthropic/`)
- Bedrock (Anthropic) (`bedrock/`) - Bedrock (Anthropic + Deepseek) (`bedrock/`)
- Vertex AI (Anthropic) (`vertexai/`) - Vertex AI (Anthropic) (`vertexai/`)
- OpenRouter (`openrouter/`)
LiteLLM will standardize the `reasoning_content` in the response and `thinking_blocks` in the assistant message.
```python ```python
"message": { "message": {
@ -17,7 +26,7 @@ Supported Providers:
{ {
"type": "thinking", "type": "thinking",
"thinking": "The capital of France is Paris.", "thinking": "The capital of France is Paris.",
"signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." "signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..."
} }
] ]
} }
@ -95,13 +104,263 @@ curl http://0.0.0.0:4000/v1/chat/completions \
} }
``` ```
## Tool Calling with `thinking`
Here's how to use `thinking` blocks by Anthropic with tool calling.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import json
import litellm

def get_current_weather(location, unit="fahrenheit"):
    # stub for this example - return a JSON string the way a real weather helper would
    return json.dumps({"location": location, "temperature": "72", "unit": unit or "fahrenheit"})

litellm._turn_on_debug()
litellm.modify_params = True
model = "anthropic/claude-3-7-sonnet-20250219" # works across Anthropic, Bedrock, Vertex AI
# Step 1: send the conversation and available functions to the model
messages = [
{
"role": "user",
"content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses",
}
]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
]
response = litellm.completion(
model=model,
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
thinking={"type": "enabled", "budget_tokens": 1024},
)
print("Response\n", response)
response_message = response.choices[0].message
tool_calls = response_message.tool_calls
print("Expecting there to be 3 tool calls")
assert (
len(tool_calls) > 0
) # this has to call the function for SF, Tokyo and paris
# Step 2: check if the model wanted to call a function
print(f"tool_calls: {tool_calls}")
if tool_calls:
# Step 3: call the function
# Note: the JSON response may not always be valid; be sure to handle errors
available_functions = {
"get_current_weather": get_current_weather,
} # only one function in this example, but you can have multiple
messages.append(
response_message
) # extend conversation with assistant's reply
print("Response message\n", response_message)
# Step 4: send the info for each function call and function response to the model
for tool_call in tool_calls:
function_name = tool_call.function.name
if function_name not in available_functions:
            # the model called a function that does not exist in available_functions - skip it
            continue
function_to_call = available_functions[function_name]
function_args = json.loads(tool_call.function.arguments)
function_response = function_to_call(
location=function_args.get("location"),
unit=function_args.get("unit"),
)
messages.append(
{
"tool_call_id": tool_call.id,
"role": "tool",
"name": function_name,
"content": function_response,
}
) # extend conversation with function response
print(f"messages: {messages}")
second_response = litellm.completion(
model=model,
messages=messages,
seed=22,
# tools=tools,
drop_params=True,
thinking={"type": "enabled", "budget_tokens": 1024},
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: claude-3-7-sonnet-thinking
litellm_params:
model: anthropic/claude-3-7-sonnet-20250219
api_key: os.environ/ANTHROPIC_API_KEY
thinking: {
"type": "enabled",
"budget_tokens": 1024
}
```
2. Run proxy
```bash
litellm --config config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Make 1st call
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "claude-3-7-sonnet-thinking",
"messages": [
{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"},
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
],
"tool_choice": "auto"
}'
```
4. Make 2nd call with tool call results
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "claude-3-7-sonnet-thinking",
"messages": [
{
"role": "user",
"content": "What\'s the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"
},
{
"role": "assistant",
"content": "I\'ll check the current weather for these three cities for you:",
"tool_calls": [
{
"index": 2,
"function": {
"arguments": "{\"location\": \"San Francisco\"}",
"name": "get_current_weather"
},
"id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
"type": "function"
}
],
"function_call": null,
"reasoning_content": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c="
}
],
"provider_specific_fields": {
"reasoningContentBlocks": [
{
"reasoningText": {
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c=",
"text": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user."
}
}
]
}
},
{
"tool_call_id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
"role": "tool",
"name": "get_current_weather",
"content": "{\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": \"fahrenheit\"}"
}
]
}'
```
</TabItem>
</Tabs>
## Switching between Anthropic + Deepseek models
Set `drop_params=True` to drop the 'thinking' blocks when swapping from Anthropic to Deepseek models. Suggest improvements to this approach [here](https://github.com/BerriAI/litellm/discussions/8927).
```python
litellm.drop_params = True # 👈 EITHER GLOBALLY or per request
# or per request
## Anthropic
response = litellm.completion(
model="anthropic/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
drop_params=True,
)
## Deepseek
response = litellm.completion(
model="deepseek/deepseek-chat",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
drop_params=True,
)
```
## Spec ## Spec
These fields can be accessed via `response.choices[0].message.reasoning_content` and `response.choices[0].message.thinking_blocks`. These fields can be accessed via `response.choices[0].message.reasoning_content` and `response.choices[0].message.thinking_blocks`.
- `reasoning_content` - str: The reasoning content from the model. Returned across all providers. - `reasoning_content` - str: The reasoning content from the model. Returned across all providers.
- `thinking_blocks` - Optional[List[Dict[str, str]]]: A list of thinking blocks from the model. Only returned for Anthropic models. - `thinking_blocks` - Optional[List[Dict[str, str]]]: A list of thinking blocks from the model. Only returned for Anthropic models.
- `type` - str: The type of thinking block. - `type` - str: The type of thinking block.
- `thinking` - str: The thinking from the model. - `thinking` - str: The thinking from the model.
- `signature_delta` - str: The signature delta from the model. - `signature` - str: The signature from the model.
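A minimal sketch of reading these fields with the LiteLLM SDK (assumes `ANTHROPIC_API_KEY` is set; the model and `thinking` param match the examples above):
```python
import litellm

response = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)

message = response.choices[0].message
print(message.reasoning_content)              # standardized reasoning string
for block in message.thinking_blocks or []:   # only returned for Anthropic models
    print(block["type"], block["signature"])
```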

View file

@ -1,4 +1,4 @@
# Rerank # /rerank
:::tip :::tip

View file

@ -0,0 +1,117 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# /responses [Beta]
LiteLLM provides a BETA endpoint in the spec of [OpenAI's `/responses` API](https://platform.openai.com/docs/api-reference/responses)
| Feature | Supported | Notes |
|---------|-----------|--------|
| Cost Tracking | ✅ | Works with all supported models |
| Logging | ✅ | Works across all integrations |
| End-user Tracking | ✅ | |
| Streaming | ✅ | |
| Fallbacks | ✅ | Works between supported models |
| Loadbalancing | ✅ | Works between supported models |
| Supported LiteLLM Versions | 1.63.8+ | |
| Supported LLM providers | `openai` | |
## Usage
## Create a model response
<Tabs>
<TabItem value="litellm-sdk" label="LiteLLM SDK">
#### Non-streaming
```python
import litellm
# Non-streaming response
response = litellm.responses(
model="gpt-4o",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python
import litellm
# Streaming response
response = litellm.responses(
model="gpt-4o",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
First, add this to your litellm proxy config.yaml:
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
Start your LiteLLM proxy:
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
Then use the OpenAI SDK pointed to your proxy:
#### Non-streaming
```python
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="gpt-4o",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="gpt-4o",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
</Tabs>

View file

@ -830,7 +830,7 @@ asyncio.run(router_acompletion())
Set `weight` on a deployment to pick one deployment more often than others. Set `weight` on a deployment to pick one deployment more often than others.
This works across **ALL** routing strategies. This works across **simple-shuffle** routing strategy (this is the default, if no routing strategy is selected).
<Tabs> <Tabs>
<TabItem value="sdk" label="SDK"> <TabItem value="sdk" label="SDK">
@ -952,8 +952,8 @@ router_settings:
``` ```
Defaults: Defaults:
- allowed_fails: 0 - allowed_fails: 3
- cooldown_time: 60s - cooldown_time: 5s (`DEFAULT_COOLDOWN_TIME_SECONDS` in constants.py)
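For reference, the defaults above correspond to the following `router_settings` (a sketch; set these explicitly to override them):
```yaml
router_settings:
  allowed_fails: 3   # failures tolerated before a deployment is cooled down
  cooldown_time: 5   # seconds the deployment stays cooled down
```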
**Set Per Model** **Set Per Model**

View file

@ -96,6 +96,33 @@ litellm --config /path/to/config.yaml
``` ```
#### Using K/V pairs in 1 AWS Secret
You can read multiple keys from a single AWS Secret using the `primary_secret_name` parameter:
```yaml
general_settings:
key_management_system: "aws_secret_manager"
key_management_settings:
hosted_keys: [
"OPENAI_API_KEY_MODEL_1",
"OPENAI_API_KEY_MODEL_2",
]
primary_secret_name: "litellm_secrets" # 👈 Read multiple keys from one JSON secret
```
The `primary_secret_name` allows you to read multiple keys from a single AWS Secret as a JSON object. For example, the "litellm_secrets" would contain:
```json
{
"OPENAI_API_KEY_MODEL_1": "sk-key1...",
"OPENAI_API_KEY_MODEL_2": "sk-key2..."
}
```
This reduces the number of AWS Secrets you need to manage.
## Hashicorp Vault ## Hashicorp Vault
@ -353,4 +380,7 @@ general_settings:
# Hosted Keys Settings # Hosted Keys Settings
hosted_keys: ["litellm_master_key"] # OPTIONAL. Specify which env keys you stored on AWS hosted_keys: ["litellm_master_key"] # OPTIONAL. Specify which env keys you stored on AWS
# K/V pairs in 1 AWS Secret Settings
primary_secret_name: "litellm_secrets" # OPTIONAL. Read multiple keys from one JSON secret on AWS Secret Manager
``` ```

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Text Completion # /completions
### Usage ### Usage
<Tabs> <Tabs>

View file

@ -2,9 +2,9 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Use LiteLLM AI Gateway with Aporia Guardrails # Aporia Guardrails with LiteLLM Gateway
In this tutorial we will use LiteLLM Proxy with Aporia to detect PII in requests and profanity in responses In this tutorial we will use LiteLLM AI Gateway with Aporia to detect PII in requests and profanity in responses
## 1. Setup guardrails on Aporia ## 1. Setup guardrails on Aporia

View file

@ -0,0 +1,103 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# OpenWeb UI with LiteLLM
This guide walks you through connecting OpenWeb UI to LiteLLM. Using LiteLLM with OpenWeb UI allows teams to
- Access 100+ LLMs on OpenWeb UI
- Track Spend / Usage, Set Budget Limits
- Send Request/Response Logs to logging destinations like langfuse, s3, gcs buckets, etc.
- Set access controls, e.g. control which models OpenWebUI can access.
## Quickstart
- Make sure to setup LiteLLM with the [LiteLLM Getting Started Guide](https://docs.litellm.ai/docs/proxy/docker_quick_start)
## 1. Start LiteLLM & OpenWebUI
- OpenWebUI starts running on [http://localhost:3000](http://localhost:3000)
- LiteLLM starts running on [http://localhost:4000](http://localhost:4000)
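If you haven't started either service yet, a minimal local setup looks roughly like this (image tags, the master key and the config path are illustrative; see the getting started guides above for full options):
```bash
# LiteLLM Proxy on port 4000
docker run -d -p 4000:4000 \
  -e LITELLM_MASTER_KEY="sk-1234" \
  -v $(pwd)/config.yaml:/app/config.yaml \
  ghcr.io/berriai/litellm:main-latest --config /app/config.yaml

# OpenWebUI on port 3000
docker run -d -p 3000:8080 ghcr.io/open-webui/open-webui:main
```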
## 2. Create a Virtual Key on LiteLLM
Virtual Keys are API Keys that allow you to authenticate to LiteLLM Proxy. We will create a Virtual Key that will allow OpenWebUI to access LiteLLM.
### 2.1 LiteLLM User Management Hierarchy
On LiteLLM, you can create Organizations, Teams, Users and Virtual Keys. For this tutorial, we will create a Team and a Virtual Key.
- `Organization` - An Organization is a group of Teams. (US Engineering, EU Developer Tools)
- `Team` - A Team is a group of Users. (OpenWeb UI Team, Data Science Team, etc.)
- `User` - A User is an individual user (employee, developer, eg. `krrish@litellm.ai`)
- `Virtual Key` - A Virtual Key is an API Key that allows you to authenticate to LiteLLM Proxy. A Virtual Key is associated with a User or Team.
Once the Team is created, you can invite Users to the Team. You can read more about LiteLLM's User Management [here](https://docs.litellm.ai/docs/proxy/user_management_heirarchy).
### 2.2 Create a Team on LiteLLM
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new team.
<Image img={require('../../img/litellm_create_team.gif')} />
### 2.3 Create a Virtual Key on LiteLLM
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new virtual Key.
LiteLLM allows you to specify what models are available on OpenWeb UI (by specifying the models the key will have access to).
<Image img={require('../../img/create_key_in_team_oweb.gif')} />
## 3. Connect OpenWeb UI to LiteLLM
On OpenWeb UI, navigate to Settings -> Connections and create a new connection to LiteLLM
Enter the following details:
- URL: `http://localhost:4000` (your litellm proxy base url)
- Key: `your-virtual-key` (the key you created in the previous step)
<Image img={require('../../img/litellm_setup_openweb.gif')} />
### 3.1 Test Request
In the top left corner, select a model. You should only see the models you gave the key access to in Step 2.
Once you have selected a model, enter your message content and click `Submit`.
<Image img={require('../../img/basic_litellm.gif')} />
### 3.2 Tracking Spend / Usage
After your request is made, navigate to `Logs` on the LiteLLM UI. There you can see the Team, Key, Model, Usage and Cost for each request.
<!-- <Image img={require('../../img/litellm_logs_openweb.gif')} /> -->
## Render `thinking` content on OpenWeb UI
OpenWebUI requires reasoning/thinking content to be rendered with `<think></think>` tags. In order to render this for specific models, you can use the `merge_reasoning_content_in_choices` litellm parameter.
Example litellm config.yaml:
```yaml
model_list:
- model_name: thinking-anthropic-claude-3-7-sonnet
litellm_params:
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
thinking: {"type": "enabled", "budget_tokens": 1024}
max_tokens: 1080
merge_reasoning_content_in_choices: true
```
### Test it on OpenWeb UI
On the models dropdown select `thinking-anthropic-claude-3-7-sonnet`
<Image img={require('../../img/litellm_thinking_openweb.gif')} />

View file

@ -44,7 +44,7 @@ const config = {
path: './release_notes', path: './release_notes',
routeBasePath: 'release_notes', routeBasePath: 'release_notes',
blogTitle: 'Release Notes', blogTitle: 'Release Notes',
blogSidebarTitle: 'All Releases', blogSidebarTitle: 'Releases',
blogSidebarCount: 'ALL', blogSidebarCount: 'ALL',
postsPerPage: 'ALL', postsPerPage: 'ALL',
showReadingTime: false, showReadingTime: false,

[18 binary image/GIF assets added (UI screenshots and release-note images referenced above); contents not shown]
View file

@ -706,12 +706,13 @@
} }
}, },
"node_modules/@babel/helpers": { "node_modules/@babel/helpers": {
"version": "7.26.0", "version": "7.26.10",
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.0.tgz", "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.10.tgz",
"integrity": "sha512-tbhNuIxNcVb21pInl3ZSjksLCvgdZy9KwJ8brv993QtIVKJBBkYXz4q4ZbAv31GdnC+R90np23L5FbEBlthAEw==", "integrity": "sha512-UPYc3SauzZ3JGgj87GgZ89JVdC5dj0AoetR5Bw6wj4niittNyFh6+eOGonYvJ1ao6B8lEa3Q3klS7ADZ53bc5g==",
"license": "MIT",
"dependencies": { "dependencies": {
"@babel/template": "^7.25.9", "@babel/template": "^7.26.9",
"@babel/types": "^7.26.0" "@babel/types": "^7.26.10"
}, },
"engines": { "engines": {
"node": ">=6.9.0" "node": ">=6.9.0"
@ -796,11 +797,12 @@
} }
}, },
"node_modules/@babel/parser": { "node_modules/@babel/parser": {
"version": "7.26.3", "version": "7.26.10",
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.3.tgz", "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.10.tgz",
"integrity": "sha512-WJ/CvmY8Mea8iDXo6a7RK2wbmJITT5fN3BEkRuFlxVyNx8jOKIIhmC4fSkTcPcf8JyavbBwIe6OpiCOBXt/IcA==", "integrity": "sha512-6aQR2zGE/QFi8JpDLjUZEPYOs7+mhKXm86VaKFiLP35JQwQb6bwUE+XbvkH0EptsYhbNBSUGaUBLKqxH1xSgsA==",
"license": "MIT",
"dependencies": { "dependencies": {
"@babel/types": "^7.26.3" "@babel/types": "^7.26.10"
}, },
"bin": { "bin": {
"parser": "bin/babel-parser.js" "parser": "bin/babel-parser.js"
@ -2157,9 +2159,10 @@
} }
}, },
"node_modules/@babel/runtime-corejs3": { "node_modules/@babel/runtime-corejs3": {
"version": "7.26.0", "version": "7.26.10",
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.26.0.tgz", "resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.26.10.tgz",
"integrity": "sha512-YXHu5lN8kJCb1LOb9PgV6pvak43X2h4HvRApcN5SdWeaItQOzfn1hgP6jasD6KWQyJDBxrVmA9o9OivlnNJK/w==", "integrity": "sha512-uITFQYO68pMEYR46AHgQoyBg7KPPJDAbGn4jUTIRgCFJIp88MIBUianVOplhZDEec07bp9zIyr4Kp0FCyQzmWg==",
"license": "MIT",
"dependencies": { "dependencies": {
"core-js-pure": "^3.30.2", "core-js-pure": "^3.30.2",
"regenerator-runtime": "^0.14.0" "regenerator-runtime": "^0.14.0"
@ -2169,13 +2172,14 @@
} }
}, },
"node_modules/@babel/template": { "node_modules/@babel/template": {
"version": "7.25.9", "version": "7.26.9",
"resolved": "https://registry.npmjs.org/@babel/template/-/template-7.25.9.tgz", "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.26.9.tgz",
"integrity": "sha512-9DGttpmPvIxBb/2uwpVo3dqJ+O6RooAFOS+lB+xDqoE2PVCE8nfoHMdZLpfCQRLwvohzXISPZcgxt80xLfsuwg==", "integrity": "sha512-qyRplbeIpNZhmzOysF/wFMuP9sctmh2cFzRAZOn1YapxBsE1i9bJIY586R/WBLfLcmcBlM8ROBiQURnnNy+zfA==",
"license": "MIT",
"dependencies": { "dependencies": {
"@babel/code-frame": "^7.25.9", "@babel/code-frame": "^7.26.2",
"@babel/parser": "^7.25.9", "@babel/parser": "^7.26.9",
"@babel/types": "^7.25.9" "@babel/types": "^7.26.9"
}, },
"engines": { "engines": {
"node": ">=6.9.0" "node": ">=6.9.0"
@ -2199,9 +2203,10 @@
} }
}, },
"node_modules/@babel/types": { "node_modules/@babel/types": {
"version": "7.26.3", "version": "7.26.10",
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.3.tgz", "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.10.tgz",
"integrity": "sha512-vN5p+1kl59GVKMvTHt55NzzmYVxprfJD+ql7U9NFIfKCBkYE55LYtS+WtPlaYOyzydrKI8Nezd+aZextrd+FMA==", "integrity": "sha512-emqcG3vHrpxUKTrxcblR36dcrcoRDvKmnL/dCL6ZsHaShW80qxCAcNhzQZrpeM765VzEos+xOi4s+r4IXzTwdQ==",
"license": "MIT",
"dependencies": { "dependencies": {
"@babel/helper-string-parser": "^7.25.9", "@babel/helper-string-parser": "^7.25.9",
"@babel/helper-validator-identifier": "^7.25.9" "@babel/helper-validator-identifier": "^7.25.9"

View file

@ -18,13 +18,6 @@ hide_table_of_contents: false
`alerting`, `prometheus`, `secret management`, `management endpoints`, `ui`, `prompt management`, `finetuning`, `batch` `alerting`, `prometheus`, `secret management`, `management endpoints`, `ui`, `prompt management`, `finetuning`, `batch`
:::note
v1.57.8-stable, is currently being tested. It will be released on 2025-01-12.
:::
## New / Updated Models ## New / Updated Models
1. Mistral large pricing - https://github.com/BerriAI/litellm/pull/7452 1. Mistral large pricing - https://github.com/BerriAI/litellm/pull/7452

View file

@ -0,0 +1,103 @@
---
title: v1.61.20-stable
slug: v1.61.20-stable
date: 2025-03-01T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
tags: [llm translation, rerank, ui, thinking, reasoning_content, claude-3-7-sonnet]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
# v1.61.20-stable
These are the changes since `v1.61.13-stable`.
This release is primarily focused on:
- LLM Translation improvements (claude-3-7-sonnet + 'thinking'/'reasoning_content' support)
- UI improvements (add model flow, user management, etc)
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
1. Anthropic 3-7 sonnet support + cost tracking (Anthropic API + Bedrock + Vertex AI + OpenRouter)
1. Anthropic API [Start here](https://docs.litellm.ai/docs/providers/anthropic#usage---thinking--reasoning_content)
2. Bedrock API [Start here](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
3. Vertex AI API [See here](../../docs/providers/vertex#usage---thinking--reasoning_content)
4. OpenRouter [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L5626)
2. Gpt-4.5-preview support + cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L79)
3. Azure AI - Phi-4 cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L1773)
4. Claude-3.5-sonnet - vision support updated on Anthropic API [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2888)
5. Bedrock llama vision support [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L7714)
6. Cerebras llama3.3-70b pricing [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2697)
## LLM Translation
1. Infinity Rerank - support returning documents when return_documents=True [Start here](../../docs/providers/infinity#usage---returning-documents)
2. Amazon Deepseek - `<think>` param extraction into reasoning_content [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-imported-models-deepseek-deepseek-r1)
3. Amazon Titan Embeddings - filter out aws_ params from request body [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-embedding)
4. Anthropic thinking + reasoning_content translation support (Anthropic API, Bedrock, Vertex AI) [Start here](https://docs.litellm.ai/docs/reasoning_content)
5. VLLM - support video_url [Start here](../../docs/providers/vllm#send-video-url-to-vllm)
6. Call proxy via litellm SDK: Support `litellm_proxy/` for embedding, image_generation, transcription, speech, rerank [Start here](https://docs.litellm.ai/docs/providers/litellm_proxy)
7. OpenAI Pass-through - allow using Assistants GET, DELETE on /openai pass through routes [Start here](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
8. Message Translation - fix openai message for assistant msg if role is missing - openai allows this
9. O1/O3 - support drop_params for o3-mini and o1 parallel_tool_calls param (not supported currently) [See here](https://docs.litellm.ai/docs/completion/drop_params)
## Spend Tracking Improvements
1. Cost tracking for rerank via Bedrock [See PR](https://github.com/BerriAI/litellm/commit/b682dc4ec8fd07acf2f4c981d2721e36ae2a49c5)
2. Anthropic pass-through - fix race condition causing cost to not be tracked [See PR](https://github.com/BerriAI/litellm/pull/8874)
3. Anthropic pass-through: Ensure accurate token counting [See PR](https://github.com/BerriAI/litellm/pull/8880)
## Management Endpoints / UI
1. Models Page - Allow sorting models by created at
2. Models Page - Edit Model Flow Improvements
3. Models Page - Fix Adding Azure, Azure AI Studio models on UI
4. Internal Users Page - Allow Bulk Adding Internal Users on UI
5. Internal Users Page - Allow sorting users by created at
6. Virtual Keys Page - Allow searching for UserIDs on the dropdown when assigning a user to a team [See PR](https://github.com/BerriAI/litellm/pull/8844)
7. Virtual Keys Page - allow creating a user when assigning keys to users [See PR](https://github.com/BerriAI/litellm/pull/8844)
8. Model Hub Page - fix text overflow issue [See PR](https://github.com/BerriAI/litellm/pull/8749)
9. Admin Settings Page - Allow adding MSFT SSO on UI
10. Backend - don't allow creating duplicate internal users in DB
## Helm
1. support ttlSecondsAfterFinished on the migration job - [See PR](https://github.com/BerriAI/litellm/pull/8593)
2. enhance migrations job with additional configurable properties - [See PR](https://github.com/BerriAI/litellm/pull/8636)
## Logging / Guardrail Integrations
1. Arize Phoenix support
2. No-log - fix no-log param support on embedding calls
## Performance / Loadbalancing / Reliability improvements
1. Single Deployment Cooldown logic - Use allowed_fails or allowed_fail_policy if set [Start here](https://docs.litellm.ai/docs/routing#advanced-custom-retries-cooldowns-based-on-error-type)
## General Proxy Improvements
1. Hypercorn - fix reading / parsing request body
2. Windows - fix running proxy in windows
3. DD-Trace - fix dd-trace enablement on proxy
## Complete Git Diff
View the complete git diff [here](https://github.com/BerriAI/litellm/compare/v1.61.13-stable...v1.61.20-stable).

View file

@ -0,0 +1,40 @@
---
title: v1.63.0 - Anthropic 'thinking' response update
slug: v1.63.0
date: 2025-03-05T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
hide_table_of_contents: false
---
v1.63.0 fixes Anthropic 'thinking' response on streaming to return the `signature` block. [Github Issue](https://github.com/BerriAI/litellm/issues/8964)
It also moves the response structure from `signature_delta` to `signature` to be the same as Anthropic. [Anthropic Docs](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#implementing-extended-thinking)
## Diff
```bash
"message": {
...
"reasoning_content": "The capital of France is Paris.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The capital of France is Paris.",
- "signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 OLD FORMAT
+ "signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 KEY CHANGE
}
]
}
```

View file

@ -0,0 +1,180 @@
---
title: v1.63.11-stable
slug: v1.63.11-stable
date: 2025-03-15T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: [credential management, thinking content, responses api, snowflake]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
These are the changes since `v1.63.2-stable`.
This release is primarily focused on:
- [Beta] Responses API Support
- Snowflake Cortex Support, Amazon Nova Image Generation
- UI - Credential Management, re-use credentials when adding new models
- UI - Test Connection to LLM Provider before adding a model
:::info
This release will be live on 03/16/2025
:::
<!-- <Image img={require('../../img/release_notes/v16311_release.jpg')} /> -->
## Known Issues
- 🚨 Known issue on Azure OpenAI - We don't recommend upgrading if you use Azure OpenAI. This version failed our Azure OpenAI load test
## Docker Run LiteLLM Proxy
```
docker run \
-e STORE_MODEL_IN_DB=True \
-p 4000:4000 \
ghcr.io/berriai/litellm:main-v1.63.11-stable
```
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
- Image Generation support for Amazon Nova Canvas [Getting Started](https://docs.litellm.ai/docs/providers/bedrock#image-generation)
- Add pricing for Jamba new models [PR](https://github.com/BerriAI/litellm/pull/9032/files)
- Add pricing for Amazon EU models [PR](https://github.com/BerriAI/litellm/pull/9056/files)
- Add Bedrock Deepseek R1 model pricing [PR](https://github.com/BerriAI/litellm/pull/9108/files)
- Update Gemini pricing: Gemma 3, Flash 2 thinking update, LearnLM [PR](https://github.com/BerriAI/litellm/pull/9190/files)
- Mark Cohere Embedding 3 models as Multimodal [PR](https://github.com/BerriAI/litellm/pull/9176/commits/c9a576ce4221fc6e50dc47cdf64ab62736c9da41)
- Add Azure Data Zone pricing [PR](https://github.com/BerriAI/litellm/pull/9185/files#diff-19ad91c53996e178c1921cbacadf6f3bae20cfe062bd03ee6bfffb72f847ee37)
- LiteLLM Tracks cost for `azure/eu` and `azure/us` models
## LLM Translation
<Image img={require('../../img/release_notes/responses_api.png')} />
1. **New Endpoints**
- [Beta] POST `/responses` API. [Getting Started](https://docs.litellm.ai/docs/response_api)
2. **New LLM Providers**
- Snowflake Cortex [Getting Started](https://docs.litellm.ai/docs/providers/snowflake)
3. **New LLM Features**
- Support OpenRouter `reasoning_content` on streaming [Getting Started](https://docs.litellm.ai/docs/reasoning_content)
4. **Bug Fixes**
- OpenAI: Return `code`, `param` and `type` on bad request error [More information on litellm exceptions](https://docs.litellm.ai/docs/exception_mapping)
- Bedrock: Fix converse chunk parsing to only return empty dict on tool use [PR](https://github.com/BerriAI/litellm/pull/9166)
- Bedrock: Support extra_headers [PR](https://github.com/BerriAI/litellm/pull/9113)
- Azure: Fix Function Calling Bug & Update Default API Version to `2025-02-01-preview` [PR](https://github.com/BerriAI/litellm/pull/9191)
- Azure: Fix AI services URL [PR](https://github.com/BerriAI/litellm/pull/9185)
- Vertex AI: Handle HTTP 201 status code in response [PR](https://github.com/BerriAI/litellm/pull/9193)
- Perplexity: Fix incorrect streaming response [PR](https://github.com/BerriAI/litellm/pull/9081)
- Triton: Fix streaming completions bug [PR](https://github.com/BerriAI/litellm/pull/8386)
- Deepgram: Support bytes.IO when handling audio files for transcription [PR](https://github.com/BerriAI/litellm/pull/9071)
- Ollama: Fix "system" role has become unacceptable [PR](https://github.com/BerriAI/litellm/pull/9261)
- All Providers (Streaming): Fix String `data:` stripped from entire content in streamed responses [PR](https://github.com/BerriAI/litellm/pull/9070)
## Spend Tracking Improvements
1. Support Bedrock converse cache token tracking [Getting Started](https://docs.litellm.ai/docs/completion/prompt_caching)
2. Cost Tracking for Responses API [Getting Started](https://docs.litellm.ai/docs/response_api)
3. Fix Azure Whisper cost tracking [Getting Started](https://docs.litellm.ai/docs/audio_transcription)
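To illustrate item 1, here is a hedged sketch of a Bedrock Converse call that uses prompt caching. The `cache_control` message format, the Bedrock model id, and the usage field names are assumptions drawn from the prompt-caching docs linked above, not from this release note.

```python
import litellm

# Sketch: mark a large system prompt as cacheable on Bedrock Converse,
# then inspect the returned usage block, where cache read/write tokens
# are now tracked for cost purposes.
response = litellm.completion(
    model="bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0",  # assumed model id
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a support agent. <long policy document here>",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        {"role": "user", "content": "How do I rotate my API key?"},
    ],
)

# Cached-token counts surface on the usage object (exact field names
# may differ; see the prompt_caching docs).
print(response.usage)
```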
## UI
### Re-Use Credentials on UI
You can now onboard LLM provider credentials on the LiteLLM UI. Once these credentials are added, you can re-use them when adding new models [Getting Started](https://docs.litellm.ai/docs/proxy/ui_credentials)
<Image img={require('../../img/release_notes/credentials.jpg')} />
### Test Connections before adding models
Before adding a model you can test the connection to the LLM provider to verify you have set up your API Base + API Key correctly
<Image img={require('../../img/release_notes/litellm_test_connection.gif')} />
### General UI Improvements
1. Add Models Page
- Allow adding Cerebras, Sambanova, Perplexity, Fireworks, OpenRouter, TogetherAI, and Text-Completion OpenAI models on the Admin UI
- Allow adding EU OpenAI models
- Fix: Instantly show edit + deletes to models
2. Keys Page
- Fix: Instantly show newly created keys on Admin UI (don't require refresh)
- Fix: Allow clicking into Top Keys when showing users Top API Key
- Fix: Allow filtering keys by Team Alias, Key Alias and Org
- UI Improvements: Show 100 Keys Per Page, Use full height, increase width of key alias
3. Users Page
- Fix: Show correct count of internal user keys on Users Page
- Fix: Metadata not updating in Team UI
4. Logs Page
- UI Improvements: Keep expanded log in focus on LiteLLM UI
- UI Improvements: Minor improvements to logs page
- Fix: Allow internal user to query their own logs
- Allow switching off storing Error Logs in DB [Getting Started](https://docs.litellm.ai/docs/proxy/ui_logs)
5. Sign In/Sign Out
- Fix: Correctly use `PROXY_LOGOUT_URL` when set [Getting Started](https://docs.litellm.ai/docs/proxy/self_serve#setting-custom-logout-urls)
## Security
1. Support for Rotating Master Keys [Getting Started](https://docs.litellm.ai/docs/proxy/master_key_rotations)
2. Fix: Internal User Viewer Permissions, don't allow `internal_user_viewer` role to see `Test Key Page` or `Create Key Button` [More information on role based access controls](https://docs.litellm.ai/docs/proxy/access_control)
3. Emit audit logs on All user + model Create/Update/Delete endpoints [Getting Started](https://docs.litellm.ai/docs/proxy/multiple_admins)
4. JWT
- Support multiple JWT OIDC providers [Getting Started](https://docs.litellm.ai/docs/proxy/token_auth)
- Fix JWT access with Groups not working when team is assigned All Proxy Models access
5. Using K/V pairs in 1 AWS Secret [Getting Started](https://docs.litellm.ai/docs/secret#using-kv-pairs-in-1-aws-secret)
## Logging Integrations
1. Prometheus: Track Azure LLM API latency metric [Getting Started](https://docs.litellm.ai/docs/proxy/prometheus#request-latency-metrics)
2. Athina: Added tags, user_feedback and model_options to additional_keys which can be sent to Athina [Getting Started](https://docs.litellm.ai/docs/observability/athina_integration)
## Performance / Reliability improvements
1. Redis + litellm router - Fix Redis cluster mode for litellm router [PR](https://github.com/BerriAI/litellm/pull/9010)
## General Improvements
1. OpenWebUI Integration - display `thinking` tokens
- Guide on getting started with LiteLLM x OpenWebUI. [Getting Started](https://docs.litellm.ai/docs/tutorials/openweb_ui)
- Display `thinking` tokens on OpenWebUI (Bedrock, Anthropic, Deepseek) [Getting Started](https://docs.litellm.ai/docs/tutorials/openweb_ui#render-thinking-content-on-openweb-ui)
<Image img={require('../../img/litellm_thinking_openweb.gif')} />
## Complete Git Diff
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.63.2-stable...v1.63.11-stable)

View file

@ -0,0 +1,112 @@
---
title: v1.63.2-stable
slug: v1.63.2-stable
date: 2025-03-08T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
These are the changes since `v1.61.20-stable`.
This release is primarily focused on:
- LLM Translation improvements (more `thinking` content improvements)
- UI improvements (Error logs now shown on UI)
:::info
This release will be live on 03/09/2025
:::
<Image img={require('../../img/release_notes/v1632_release.jpg')} />
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
1. Add `supports_pdf_input` for specific Bedrock Claude models (see the sketch after this list) [PR](https://github.com/BerriAI/litellm/commit/f63cf0030679fe1a43d03fb196e815a0f28dae92)
2. Add pricing for Amazon `eu` models [PR](https://github.com/BerriAI/litellm/commits/main/model_prices_and_context_window.json)
3. Fix Azure O1 mini pricing [PR](https://github.com/BerriAI/litellm/commit/52de1949ef2f76b8572df751f9c868a016d4832c)
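As a quick illustration of item 1, the capability can be checked programmatically. `litellm.supports_pdf_input` is assumed to be the relevant helper here, so verify the name against the SDK before using it; the model id is only an example.

```python
import litellm

# Sketch: check whether a given Bedrock Claude model advertises PDF input
# support in the model capability map. Helper name is an assumption.
model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"
print(model, "supports PDF input:", litellm.supports_pdf_input(model=model))
```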
## LLM Translation
<Image img={require('../../img/release_notes/anthropic_thinking.jpg')}/>
1. Support `/openai/` passthrough for Assistant endpoints. [Get Started](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
2. Bedrock Claude - fix tool calling transformation on invoke route. [Get Started](../../docs/providers/bedrock#usage---function-calling--tool-calling)
3. Bedrock Claude - response_format support for claude on invoke route. [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
4. Bedrock - pass `description` if set in response_format. [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
5. Bedrock - Fix passing response_format: {"type": "text"}. [PR](https://github.com/BerriAI/litellm/commit/c84b489d5897755139aa7d4e9e54727ebe0fa540)
6. OpenAI - Handle sending image_url as str to openai. [Get Started](https://docs.litellm.ai/docs/completion/vision)
7. Deepseek - fix missing 'reasoning_content' on streaming (see the sketch after this list). [Get Started](https://docs.litellm.ai/docs/reasoning_content)
8. Caching - Support caching on reasoning content. [Get Started](https://docs.litellm.ai/docs/proxy/caching)
9. Bedrock - handle thinking blocks in assistant message. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
10. Anthropic - Return `signature` on streaming. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
- Note: We've also migrated from `signature_delta` to `signature`. [Read more](https://docs.litellm.ai/release_notes/v1.63.0)
11. Support format param for specifying image type. [Get Started](../../docs/completion/vision.md#explicitly-specify-image-type)
12. Anthropic - `/v1/messages` endpoint - `thinking` param support. [Get Started](../../docs/anthropic_unified.md)
- Note: this refactors the [BETA] unified `/v1/messages` endpoint, to just work for the Anthropic API.
13. Vertex AI - handle `$id` in response schema when calling Vertex AI. [Get Started](https://docs.litellm.ai/docs/providers/vertex#json-schema)
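Several of the items above (7, 9, 10, 12) revolve around `thinking` and `reasoning_content`. The sketch below streams a Claude 3.7 Sonnet call with thinking enabled and prints reasoning deltas separately from regular content; the `thinking` parameter shape and the `reasoning_content` attribute are assumptions based on the linked reasoning_content docs.

```python
import litellm

# Sketch: stream a claude-3-7-sonnet call with extended thinking enabled
# and read reasoning deltas separately from regular content.
stream = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "What is 27 * 14? Think it through."}],
    thinking={"type": "enabled", "budget_tokens": 1024},  # assumed param shape
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    reasoning = getattr(delta, "reasoning_content", None)
    if reasoning:
        print(f"[thinking] {reasoning}")
    if delta.content:
        print(delta.content, end="")
```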
## Spend Tracking Improvements
1. Batches API - Fix cost calculation to run on retrieve_batch (see the sketch after this list). [Get Started](https://docs.litellm.ai/docs/batches)
2. Batches API - Log batch models in spend logs / standard logging payload. [Get Started](../../docs/proxy/logging_spec.md#standardlogginghiddenparams)
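For item 1, cost calculation now runs when a completed batch is retrieved. A minimal sketch of that retrieval path (the batch id is hypothetical):

```python
import asyncio
import litellm

async def main() -> None:
    # Sketch: retrieving a completed batch now triggers cost calculation
    # and logs the batch models in the spend logs / standard logging payload.
    batch = await litellm.aretrieve_batch(
        batch_id="batch_abc123",  # hypothetical batch id
        custom_llm_provider="openai",
    )
    print(batch.id, batch.status)

asyncio.run(main())
```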
## Management Endpoints / UI
<Image img={require('../../img/release_notes/error_logs.jpg')} />
1. Virtual Keys Page
- Allow team/org filters to be searchable on the Create Key Page
- Add created_by and updated_by fields to Keys table
- Show 'user_email' on key table
- Show 100 Keys Per Page, Use full height, increase width of key alias
2. Logs Page
- Show Error Logs on LiteLLM UI
- Allow Internal Users to View their own logs
3. Internal Users Page
- Allow admin to control default model access for internal users
4. Fix session handling with cookies
## Logging / Guardrail Integrations
1. Fix Prometheus metrics with custom metrics when keys containing team_id make requests. [PR](https://github.com/BerriAI/litellm/pull/8935)
## Performance / Loadbalancing / Reliability improvements
1. Cooldowns - Support cooldowns on models called with client side credentials. [Get Started](https://docs.litellm.ai/docs/proxy/clientside_auth#pass-user-llm-api-keys--api-base)
2. Tag-based Routing - ensures tag-based routing across all endpoints (`/embeddings`, `/image_generation`, etc.). [Get Started](https://docs.litellm.ai/docs/proxy/tag_routing)
## General Proxy Improvements
1. Raise BadRequestError when unknown model passed in request
2. Enforce model access restrictions on Azure OpenAI proxy route
3. Reliability fix - Handle emojis in text - fix orjson error
4. Model Access Patch - don't overwrite litellm.anthropic_models when running auth checks
5. Enable setting timezone information in docker image
## Complete Git Diff
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.61.20-stable...v1.63.2-stable)

View file

@ -41,10 +41,12 @@ const sidebars = {
"proxy/deploy", "proxy/deploy",
"proxy/prod", "proxy/prod",
"proxy/cli", "proxy/cli",
"proxy/release_cycle",
"proxy/model_management", "proxy/model_management",
"proxy/health", "proxy/health",
"proxy/debugging", "proxy/debugging",
"proxy/spending_monitoring", "proxy/spending_monitoring",
"proxy/master_key_rotations",
], ],
}, },
"proxy/demo", "proxy/demo",
@ -99,7 +101,9 @@ const sidebars = {
"proxy/admin_ui_sso", "proxy/admin_ui_sso",
"proxy/self_serve", "proxy/self_serve",
"proxy/public_teams", "proxy/public_teams",
"proxy/custom_sso" "proxy/custom_sso",
"proxy/ui_credentials",
"proxy/ui_logs"
], ],
}, },
{ {
@ -229,6 +233,7 @@ const sidebars = {
"providers/sambanova", "providers/sambanova",
"providers/custom_llm_server", "providers/custom_llm_server",
"providers/petals", "providers/petals",
"providers/snowflake"
], ],
}, },
{ {
@ -255,17 +260,23 @@ const sidebars = {
"completion/batching", "completion/batching",
"completion/mock_requests", "completion/mock_requests",
"completion/reliable_completions", "completion/reliable_completions",
'tutorials/litellm_proxy_aporia',
] ]
}, },
{ {
type: "category", type: "category",
label: "Supported Endpoints", label: "Supported Endpoints",
link: {
type: "generated-index",
title: "Supported Endpoints",
description:
"Learn how to deploy + call models from different providers on LiteLLM",
slug: "/supported_endpoints",
},
items: [ items: [
{ {
type: "category", type: "category",
label: "Chat", label: "/chat/completions",
link: { link: {
type: "generated-index", type: "generated-index",
title: "Chat Completions", title: "Chat Completions",
@ -278,11 +289,13 @@ const sidebars = {
"completion/usage", "completion/usage",
], ],
}, },
"response_api",
"text_completion", "text_completion",
"embedding/supported_embedding", "embedding/supported_embedding",
"anthropic_unified",
{ {
type: "category", type: "category",
label: "Image", label: "/images",
items: [ items: [
"image_generation", "image_generation",
"image_variations", "image_variations",
@ -290,7 +303,7 @@ const sidebars = {
}, },
{ {
type: "category", type: "category",
label: "Audio", label: "/audio",
"items": [ "items": [
"audio_transcription", "audio_transcription",
"text_to_speech", "text_to_speech",
@ -349,23 +362,6 @@ const sidebars = {
label: "LangChain, LlamaIndex, Instructor Integration", label: "LangChain, LlamaIndex, Instructor Integration",
items: ["langchain/langchain", "tutorials/instructor"], items: ["langchain/langchain", "tutorials/instructor"],
}, },
{
type: "category",
label: "Tutorials",
items: [
'tutorials/azure_openai',
'tutorials/instructor',
"tutorials/gradio_integration",
"tutorials/huggingface_codellama",
"tutorials/huggingface_tutorial",
"tutorials/TogetherAI_liteLLM",
"tutorials/finetuned_chat_gpt",
"tutorials/text_completion",
"tutorials/first_playground",
"tutorials/model_fallbacks",
],
},
], ],
}, },
{ {
@ -382,13 +378,6 @@ const sidebars = {
"load_test_rpm", "load_test_rpm",
] ]
}, },
{
type: "category",
label: "Adding Providers",
items: [
"adding_provider/directory_structure",
"adding_provider/new_rerank_provider"],
},
{ {
type: "category", type: "category",
label: "Logging & Observability", label: "Logging & Observability",
@ -423,12 +412,51 @@ const sidebars = {
"observability/opik_integration", "observability/opik_integration",
], ],
}, },
{
type: "category",
label: "Tutorials",
items: [
"tutorials/openweb_ui",
'tutorials/litellm_proxy_aporia',
{
type: "category",
label: "LiteLLM Python SDK Tutorials",
items: [
'tutorials/azure_openai',
'tutorials/instructor',
"tutorials/gradio_integration",
"tutorials/huggingface_codellama",
"tutorials/huggingface_tutorial",
"tutorials/TogetherAI_liteLLM",
"tutorials/finetuned_chat_gpt",
"tutorials/text_completion",
"tutorials/first_playground",
"tutorials/model_fallbacks",
],
},
]
},
{
type: "category",
label: "Contributing",
items: [
"extras/contributing_code",
{
type: "category",
label: "Adding Providers",
items: [
"adding_provider/directory_structure",
"adding_provider/new_rerank_provider"],
},
"extras/contributing",
"contributing",
]
},
{ {
type: "category", type: "category",
label: "Extras", label: "Extras",
items: [ items: [
"extras/contributing",
"data_security", "data_security",
"data_retention", "data_retention",
"migration_policy", "migration_policy",
@ -445,6 +473,7 @@ const sidebars = {
items: [ items: [
"projects/smolagents", "projects/smolagents",
"projects/Docq.AI", "projects/Docq.AI",
"projects/PDL",
"projects/OpenInterpreter", "projects/OpenInterpreter",
"projects/Elroy", "projects/Elroy",
"projects/dbally", "projects/dbally",
@ -460,9 +489,9 @@ const sidebars = {
"projects/YiVal", "projects/YiVal",
"projects/LiteLLM Proxy", "projects/LiteLLM Proxy",
"projects/llm_cord", "projects/llm_cord",
"projects/pgai",
], ],
}, },
"contributing",
"proxy/pii_masking", "proxy/pii_masking",
"extras/code_quality", "extras/code_quality",
"rules", "rules",

View file

@ -163,7 +163,7 @@ class AporiaGuardrail(CustomGuardrail):
pass pass
async def async_moderation_hook( ### 👈 KEY CHANGE ### async def async_moderation_hook(
self, self,
data: dict, data: dict,
user_api_key_dict: UserAPIKeyAuth, user_api_key_dict: UserAPIKeyAuth,
@ -173,6 +173,7 @@ class AporiaGuardrail(CustomGuardrail):
"image_generation", "image_generation",
"moderation", "moderation",
"audio_transcription", "audio_transcription",
"responses",
], ],
): ):
from litellm.proxy.common_utils.callback_utils import ( from litellm.proxy.common_utils.callback_utils import (

View file

@ -94,6 +94,7 @@ class _ENTERPRISE_GoogleTextModeration(CustomLogger):
"image_generation", "image_generation",
"moderation", "moderation",
"audio_transcription", "audio_transcription",
"responses",
], ],
): ):
""" """

View file

@ -107,6 +107,7 @@ class _ENTERPRISE_LlamaGuard(CustomLogger):
"image_generation", "image_generation",
"moderation", "moderation",
"audio_transcription", "audio_transcription",
"responses",
], ],
): ):
""" """

View file

@ -126,6 +126,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
"image_generation", "image_generation",
"moderation", "moderation",
"audio_transcription", "audio_transcription",
"responses",
], ],
): ):
""" """

View file

@ -31,7 +31,7 @@ class _ENTERPRISE_OpenAI_Moderation(CustomLogger):
#### CALL HOOKS - proxy only #### #### CALL HOOKS - proxy only ####
async def async_moderation_hook( ### 👈 KEY CHANGE ### async def async_moderation_hook(
self, self,
data: dict, data: dict,
user_api_key_dict: UserAPIKeyAuth, user_api_key_dict: UserAPIKeyAuth,
@ -41,6 +41,7 @@ class _ENTERPRISE_OpenAI_Moderation(CustomLogger):
"image_generation", "image_generation",
"moderation", "moderation",
"audio_transcription", "audio_transcription",
"responses",
], ],
): ):
text = "" text = ""

View file

@ -8,12 +8,14 @@ import os
from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
from litellm.caching.llm_caching_handler import LLMClientCache
from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
from litellm.types.utils import ( from litellm.types.utils import (
ImageObject, ImageObject,
BudgetConfig, BudgetConfig,
all_litellm_params, all_litellm_params,
all_litellm_params as _litellm_completion_params, all_litellm_params as _litellm_completion_params,
CredentialItem,
) # maintain backwards compatibility for root param ) # maintain backwards compatibility for root param
from litellm._logging import ( from litellm._logging import (
set_verbose, set_verbose,
@ -53,6 +55,7 @@ from litellm.constants import (
cohere_embedding_models, cohere_embedding_models,
bedrock_embedding_models, bedrock_embedding_models,
known_tokenizer_config, known_tokenizer_config,
BEDROCK_INVOKE_PROVIDERS_LITERAL,
) )
from litellm.types.guardrails import GuardrailItem from litellm.types.guardrails import GuardrailItem
from litellm.proxy._types import ( from litellm.proxy._types import (
@ -181,6 +184,7 @@ cloudflare_api_key: Optional[str] = None
baseten_key: Optional[str] = None baseten_key: Optional[str] = None
aleph_alpha_key: Optional[str] = None aleph_alpha_key: Optional[str] = None
nlp_cloud_key: Optional[str] = None nlp_cloud_key: Optional[str] = None
snowflake_key: Optional[str] = None
common_cloud_provider_auth_params: dict = { common_cloud_provider_auth_params: dict = {
"params": ["project", "region_name", "token"], "params": ["project", "region_name", "token"],
"providers": ["vertex_ai", "bedrock", "watsonx", "azure", "vertex_ai_beta"], "providers": ["vertex_ai", "bedrock", "watsonx", "azure", "vertex_ai_beta"],
@ -190,15 +194,17 @@ ssl_verify: Union[str, bool] = True
ssl_certificate: Optional[str] = None ssl_certificate: Optional[str] = None
disable_streaming_logging: bool = False disable_streaming_logging: bool = False
disable_add_transform_inline_image_block: bool = False disable_add_transform_inline_image_block: bool = False
in_memory_llm_clients_cache: InMemoryCache = InMemoryCache() in_memory_llm_clients_cache: LLMClientCache = LLMClientCache()
safe_memory_mode: bool = False safe_memory_mode: bool = False
enable_azure_ad_token_refresh: Optional[bool] = False enable_azure_ad_token_refresh: Optional[bool] = False
### DEFAULT AZURE API VERSION ### ### DEFAULT AZURE API VERSION ###
AZURE_DEFAULT_API_VERSION = "2024-08-01-preview" # this is updated to the latest AZURE_DEFAULT_API_VERSION = "2025-02-01-preview" # this is updated to the latest
### DEFAULT WATSONX API VERSION ### ### DEFAULT WATSONX API VERSION ###
WATSONX_DEFAULT_API_VERSION = "2024-03-13" WATSONX_DEFAULT_API_VERSION = "2024-03-13"
### COHERE EMBEDDINGS DEFAULT TYPE ### ### COHERE EMBEDDINGS DEFAULT TYPE ###
COHERE_DEFAULT_EMBEDDING_INPUT_TYPE: COHERE_EMBEDDING_INPUT_TYPES = "search_document" COHERE_DEFAULT_EMBEDDING_INPUT_TYPE: COHERE_EMBEDDING_INPUT_TYPES = "search_document"
### CREDENTIALS ###
credential_list: List[CredentialItem] = []
### GUARDRAILS ### ### GUARDRAILS ###
llamaguard_model_name: Optional[str] = None llamaguard_model_name: Optional[str] = None
openai_moderations_model_name: Optional[str] = None openai_moderations_model_name: Optional[str] = None
@ -278,8 +284,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = [] custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION #### #### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None priority_reservation: Optional[Dict[str, float]] = None
force_ipv4: bool = ( force_ipv4: bool = (
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6. False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
) )
@ -363,17 +367,7 @@ BEDROCK_CONVERSE_MODELS = [
"meta.llama3-2-11b-instruct-v1:0", "meta.llama3-2-11b-instruct-v1:0",
"meta.llama3-2-90b-instruct-v1:0", "meta.llama3-2-90b-instruct-v1:0",
] ]
BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
"cohere",
"anthropic",
"mistral",
"amazon",
"meta",
"llama",
"ai21",
"nova",
"deepseek_r1",
]
####### COMPLETION MODELS ################### ####### COMPLETION MODELS ###################
open_ai_chat_completion_models: List = [] open_ai_chat_completion_models: List = []
open_ai_text_completion_models: List = [] open_ai_text_completion_models: List = []
@ -425,6 +419,7 @@ cerebras_models: List = []
galadriel_models: List = [] galadriel_models: List = []
sambanova_models: List = [] sambanova_models: List = []
assemblyai_models: List = [] assemblyai_models: List = []
snowflake_models: List = []
def is_bedrock_pricing_only_model(key: str) -> bool: def is_bedrock_pricing_only_model(key: str) -> bool:
@ -578,6 +573,8 @@ def add_known_models():
assemblyai_models.append(key) assemblyai_models.append(key)
elif value.get("litellm_provider") == "jina_ai": elif value.get("litellm_provider") == "jina_ai":
jina_ai_models.append(key) jina_ai_models.append(key)
elif value.get("litellm_provider") == "snowflake":
snowflake_models.append(key)
add_known_models() add_known_models()
@ -607,6 +604,7 @@ ollama_models = ["llama2"]
maritalk_models = ["maritalk"] maritalk_models = ["maritalk"]
model_list = ( model_list = (
open_ai_chat_completion_models open_ai_chat_completion_models
+ open_ai_text_completion_models + open_ai_text_completion_models
@ -651,6 +649,7 @@ model_list = (
+ azure_text_models + azure_text_models
+ assemblyai_models + assemblyai_models
+ jina_ai_models + jina_ai_models
+ snowflake_models
) )
model_list_set = set(model_list) model_list_set = set(model_list)
@ -706,6 +705,7 @@ models_by_provider: dict = {
"sambanova": sambanova_models, "sambanova": sambanova_models,
"assemblyai": assemblyai_models, "assemblyai": assemblyai_models,
"jina_ai": jina_ai_models, "jina_ai": jina_ai_models,
"snowflake": snowflake_models,
} }
# mapping for those models which have larger equivalents # mapping for those models which have larger equivalents
@ -811,9 +811,6 @@ from .llms.oobabooga.chat.transformation import OobaboogaConfig
from .llms.maritalk import MaritalkConfig from .llms.maritalk import MaritalkConfig
from .llms.openrouter.chat.transformation import OpenrouterConfig from .llms.openrouter.chat.transformation import OpenrouterConfig
from .llms.anthropic.chat.transformation import AnthropicConfig from .llms.anthropic.chat.transformation import AnthropicConfig
from .llms.anthropic.experimental_pass_through.transformation import (
AnthropicExperimentalPassThroughConfig,
)
from .llms.groq.stt.transformation import GroqSTTConfig from .llms.groq.stt.transformation import GroqSTTConfig
from .llms.anthropic.completion.transformation import AnthropicTextConfig from .llms.anthropic.completion.transformation import AnthropicTextConfig
from .llms.triton.completion.transformation import TritonConfig from .llms.triton.completion.transformation import TritonConfig
@ -825,6 +822,7 @@ from .llms.databricks.embed.transformation import DatabricksEmbeddingConfig
from .llms.predibase.chat.transformation import PredibaseConfig from .llms.predibase.chat.transformation import PredibaseConfig
from .llms.replicate.chat.transformation import ReplicateConfig from .llms.replicate.chat.transformation import ReplicateConfig
from .llms.cohere.completion.transformation import CohereTextConfig as CohereConfig from .llms.cohere.completion.transformation import CohereTextConfig as CohereConfig
from .llms.snowflake.chat.transformation import SnowflakeConfig
from .llms.cohere.rerank.transformation import CohereRerankConfig from .llms.cohere.rerank.transformation import CohereRerankConfig
from .llms.cohere.rerank_v2.transformation import CohereRerankV2Config from .llms.cohere.rerank_v2.transformation import CohereRerankV2Config
from .llms.azure_ai.rerank.transformation import AzureAIRerankConfig from .llms.azure_ai.rerank.transformation import AzureAIRerankConfig
@ -832,6 +830,9 @@ from .llms.infinity.rerank.transformation import InfinityRerankConfig
from .llms.jina_ai.rerank.transformation import JinaAIRerankConfig from .llms.jina_ai.rerank.transformation import JinaAIRerankConfig
from .llms.clarifai.chat.transformation import ClarifaiConfig from .llms.clarifai.chat.transformation import ClarifaiConfig
from .llms.ai21.chat.transformation import AI21ChatConfig, AI21ChatConfig as AI21Config from .llms.ai21.chat.transformation import AI21ChatConfig, AI21ChatConfig as AI21Config
from .llms.anthropic.experimental_pass_through.messages.transformation import (
AnthropicMessagesConfig,
)
from .llms.together_ai.chat import TogetherAIConfig from .llms.together_ai.chat import TogetherAIConfig
from .llms.together_ai.completion.transformation import TogetherAITextCompletionConfig from .llms.together_ai.completion.transformation import TogetherAITextCompletionConfig
from .llms.cloudflare.chat.transformation import CloudflareChatConfig from .llms.cloudflare.chat.transformation import CloudflareChatConfig
@ -912,6 +913,7 @@ from .llms.bedrock.chat.invoke_transformations.base_invoke_transformation import
from .llms.bedrock.image.amazon_stability1_transformation import AmazonStabilityConfig from .llms.bedrock.image.amazon_stability1_transformation import AmazonStabilityConfig
from .llms.bedrock.image.amazon_stability3_transformation import AmazonStability3Config from .llms.bedrock.image.amazon_stability3_transformation import AmazonStability3Config
from .llms.bedrock.image.amazon_nova_canvas_transformation import AmazonNovaCanvasConfig
from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config
from .llms.bedrock.embed.amazon_titan_multimodal_transformation import ( from .llms.bedrock.embed.amazon_titan_multimodal_transformation import (
AmazonTitanMultimodalEmbeddingG1Config, AmazonTitanMultimodalEmbeddingG1Config,
@ -934,11 +936,14 @@ from .llms.groq.chat.transformation import GroqChatConfig
from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig
from .llms.azure_ai.chat.transformation import AzureAIStudioConfig from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
from .llms.mistral.mistral_chat_transformation import MistralConfig from .llms.mistral.mistral_chat_transformation import MistralConfig
from .llms.openai.responses.transformation import OpenAIResponsesAPIConfig
from .llms.openai.chat.o_series_transformation import ( from .llms.openai.chat.o_series_transformation import (
OpenAIOSeriesConfig as OpenAIO1Config, # maintain backwards compatibility OpenAIOSeriesConfig as OpenAIO1Config, # maintain backwards compatibility
OpenAIOSeriesConfig, OpenAIOSeriesConfig,
) )
from .llms.snowflake.chat.transformation import SnowflakeConfig
openaiOSeriesConfig = OpenAIOSeriesConfig() openaiOSeriesConfig = OpenAIOSeriesConfig()
from .llms.openai.chat.gpt_transformation import ( from .llms.openai.chat.gpt_transformation import (
OpenAIGPTConfig, OpenAIGPTConfig,
@ -1022,6 +1027,8 @@ from .assistants.main import *
from .batches.main import * from .batches.main import *
from .batch_completion.main import * # type: ignore from .batch_completion.main import * # type: ignore
from .rerank_api.main import * from .rerank_api.main import *
from .llms.anthropic.experimental_pass_through.messages.handler import *
from .responses.main import *
from .realtime_api.main import _arealtime from .realtime_api.main import _arealtime
from .fine_tuning.main import * from .fine_tuning.main import *
from .files.main import * from .files.main import *

View file

@ -182,9 +182,7 @@ def init_redis_cluster(redis_kwargs) -> redis.RedisCluster:
"REDIS_CLUSTER_NODES environment variable is not valid JSON. Please ensure it's properly formatted." "REDIS_CLUSTER_NODES environment variable is not valid JSON. Please ensure it's properly formatted."
) )
verbose_logger.debug( verbose_logger.debug("init_redis_cluster: startup nodes are being initialized.")
"init_redis_cluster: startup nodes are being initialized."
)
from redis.cluster import ClusterNode from redis.cluster import ClusterNode
args = _get_redis_cluster_kwargs() args = _get_redis_cluster_kwargs()
@ -307,7 +305,6 @@ def get_redis_async_client(
return _init_async_redis_sentinel(redis_kwargs) return _init_async_redis_sentinel(redis_kwargs)
return async_redis.Redis( return async_redis.Redis(
socket_timeout=5,
**redis_kwargs, **redis_kwargs,
) )

View file

@ -1,186 +0,0 @@
# What is this?
## Translates OpenAI call to Anthropic `/v1/messages` format
import traceback
from typing import Any, Optional
import litellm
from litellm import ChatCompletionRequest, verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
from litellm.types.utils import AdapterCompletionStreamWrapper, ModelResponse
class AnthropicAdapter(CustomLogger):
def __init__(self) -> None:
super().__init__()
def translate_completion_input_params(
self, kwargs
) -> Optional[ChatCompletionRequest]:
"""
- translate params, where needed
- pass rest, as is
"""
request_body = AnthropicMessagesRequest(**kwargs) # type: ignore
translated_body = litellm.AnthropicExperimentalPassThroughConfig().translate_anthropic_to_openai(
anthropic_message_request=request_body
)
return translated_body
def translate_completion_output_params(
self, response: ModelResponse
) -> Optional[AnthropicResponse]:
return litellm.AnthropicExperimentalPassThroughConfig().translate_openai_response_to_anthropic(
response=response
)
def translate_completion_output_params_streaming(
self, completion_stream: Any
) -> AdapterCompletionStreamWrapper | None:
return AnthropicStreamWrapper(completion_stream=completion_stream)
anthropic_adapter = AnthropicAdapter()
class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
"""
- first chunk return 'message_start'
- content block must be started and stopped
- finish_reason must map exactly to anthropic reason, else anthropic client won't be able to parse it.
"""
sent_first_chunk: bool = False
sent_content_block_start: bool = False
sent_content_block_finish: bool = False
sent_last_message: bool = False
holding_chunk: Optional[Any] = None
def __next__(self):
try:
if self.sent_first_chunk is False:
self.sent_first_chunk = True
return {
"type": "message_start",
"message": {
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
"type": "message",
"role": "assistant",
"content": [],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": None,
"stop_sequence": None,
"usage": {"input_tokens": 25, "output_tokens": 1},
},
}
if self.sent_content_block_start is False:
self.sent_content_block_start = True
return {
"type": "content_block_start",
"index": 0,
"content_block": {"type": "text", "text": ""},
}
for chunk in self.completion_stream:
if chunk == "None" or chunk is None:
raise Exception
processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
response=chunk
)
if (
processed_chunk["type"] == "message_delta"
and self.sent_content_block_finish is False
):
self.holding_chunk = processed_chunk
self.sent_content_block_finish = True
return {
"type": "content_block_stop",
"index": 0,
}
elif self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = processed_chunk
return return_chunk
else:
return processed_chunk
if self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = None
return return_chunk
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except StopIteration:
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except Exception as e:
verbose_logger.error(
"Anthropic Adapter - {}\n{}".format(e, traceback.format_exc())
)
async def __anext__(self):
try:
if self.sent_first_chunk is False:
self.sent_first_chunk = True
return {
"type": "message_start",
"message": {
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
"type": "message",
"role": "assistant",
"content": [],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": None,
"stop_sequence": None,
"usage": {"input_tokens": 25, "output_tokens": 1},
},
}
if self.sent_content_block_start is False:
self.sent_content_block_start = True
return {
"type": "content_block_start",
"index": 0,
"content_block": {"type": "text", "text": ""},
}
async for chunk in self.completion_stream:
if chunk == "None" or chunk is None:
raise Exception
processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
response=chunk
)
if (
processed_chunk["type"] == "message_delta"
and self.sent_content_block_finish is False
):
self.holding_chunk = processed_chunk
self.sent_content_block_finish = True
return {
"type": "content_block_stop",
"index": 0,
}
elif self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = processed_chunk
return return_chunk
else:
return processed_chunk
if self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = None
return return_chunk
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except StopIteration:
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopAsyncIteration

View file

@ -15,6 +15,7 @@ import litellm
from litellm.types.router import GenericLiteLLMParams from litellm.types.router import GenericLiteLLMParams
from litellm.utils import ( from litellm.utils import (
exception_type, exception_type,
get_litellm_params,
get_llm_provider, get_llm_provider,
get_secret, get_secret,
supports_httpx_timeout, supports_httpx_timeout,
@ -86,6 +87,7 @@ def get_assistants(
optional_params = GenericLiteLLMParams( optional_params = GenericLiteLLMParams(
api_key=api_key, api_base=api_base, api_version=api_version, **kwargs api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
) )
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -169,6 +171,7 @@ def get_assistants(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
aget_assistants=aget_assistants, # type: ignore aget_assistants=aget_assistants, # type: ignore
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -270,6 +273,7 @@ def create_assistants(
optional_params = GenericLiteLLMParams( optional_params = GenericLiteLLMParams(
api_key=api_key, api_base=api_base, api_version=api_version, **kwargs api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
) )
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -371,6 +375,7 @@ def create_assistants(
client=client, client=client,
async_create_assistants=async_create_assistants, async_create_assistants=async_create_assistants,
create_assistant_data=create_assistant_data, create_assistant_data=create_assistant_data,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -445,6 +450,8 @@ def delete_assistant(
api_key=api_key, api_base=api_base, api_version=api_version, **kwargs api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
) )
litellm_params_dict = get_litellm_params(**kwargs)
async_delete_assistants: Optional[bool] = kwargs.pop( async_delete_assistants: Optional[bool] = kwargs.pop(
"async_delete_assistants", None "async_delete_assistants", None
) )
@ -544,6 +551,7 @@ def delete_assistant(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
async_delete_assistants=async_delete_assistants, async_delete_assistants=async_delete_assistants,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -639,6 +647,7 @@ def create_thread(
""" """
acreate_thread = kwargs.get("acreate_thread", None) acreate_thread = kwargs.get("acreate_thread", None)
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -731,6 +740,7 @@ def create_thread(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
acreate_thread=acreate_thread, acreate_thread=acreate_thread,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -795,7 +805,7 @@ def get_thread(
"""Get the thread object, given a thread_id""" """Get the thread object, given a thread_id"""
aget_thread = kwargs.pop("aget_thread", None) aget_thread = kwargs.pop("aget_thread", None)
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default # set timeout for 10 minutes by default
@ -884,6 +894,7 @@ def get_thread(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
aget_thread=aget_thread, aget_thread=aget_thread,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -972,6 +983,7 @@ def add_message(
_message_data = MessageData( _message_data = MessageData(
role=role, content=content, attachments=attachments, metadata=metadata role=role, content=content, attachments=attachments, metadata=metadata
) )
litellm_params_dict = get_litellm_params(**kwargs)
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
message_data = get_optional_params_add_message( message_data = get_optional_params_add_message(
@ -1068,6 +1080,7 @@ def add_message(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
a_add_message=a_add_message, a_add_message=a_add_message,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -1139,6 +1152,7 @@ def get_messages(
) -> SyncCursorPage[OpenAIMessage]: ) -> SyncCursorPage[OpenAIMessage]:
aget_messages = kwargs.pop("aget_messages", None) aget_messages = kwargs.pop("aget_messages", None)
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -1225,6 +1239,7 @@ def get_messages(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
aget_messages=aget_messages, aget_messages=aget_messages,
litellm_params=litellm_params_dict,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -1337,6 +1352,7 @@ def run_thread(
"""Run a given thread + assistant.""" """Run a given thread + assistant."""
arun_thread = kwargs.pop("arun_thread", None) arun_thread = kwargs.pop("arun_thread", None)
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params_dict = get_litellm_params(**kwargs)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -1437,6 +1453,7 @@ def run_thread(
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
client=client, client=client,
arun_thread=arun_thread, arun_thread=arun_thread,
litellm_params=litellm_params_dict,
) # type: ignore ) # type: ignore
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(

View file

@ -1,76 +1,16 @@
import asyncio
import datetime
import json import json
import threading from typing import Any, List, Literal, Tuple
from typing import Any, List, Literal, Optional
import litellm import litellm
from litellm._logging import verbose_logger from litellm._logging import verbose_logger
from litellm.constants import (
BATCH_STATUS_POLL_INTERVAL_SECONDS,
BATCH_STATUS_POLL_MAX_ATTEMPTS,
)
from litellm.files.main import afile_content
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.types.llms.openai import Batch from litellm.types.llms.openai import Batch
from litellm.types.utils import StandardLoggingPayload, Usage from litellm.types.utils import CallTypes, Usage
async def batches_async_logging(
batch_id: str,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
logging_obj: Optional[LiteLLMLoggingObj] = None,
**kwargs,
):
"""
Async Job waits for the batch to complete and then logs the completed batch usage - cost, total tokens, prompt tokens, completion tokens
Polls retrieve_batch until it returns a batch with status "completed" or "failed"
"""
from .main import aretrieve_batch
verbose_logger.debug(
".....in _batches_async_logging... polling retrieve to get batch status"
)
if logging_obj is None:
raise ValueError(
"logging_obj is None cannot calculate cost / log batch creation event"
)
for _ in range(BATCH_STATUS_POLL_MAX_ATTEMPTS):
try:
start_time = datetime.datetime.now()
batch: Batch = await aretrieve_batch(batch_id, custom_llm_provider)
verbose_logger.debug(
"in _batches_async_logging... batch status= %s", batch.status
)
if batch.status == "completed":
end_time = datetime.datetime.now()
await _handle_completed_batch(
batch=batch,
custom_llm_provider=custom_llm_provider,
logging_obj=logging_obj,
start_time=start_time,
end_time=end_time,
**kwargs,
)
break
elif batch.status == "failed":
pass
except Exception as e:
verbose_logger.error("error in batches_async_logging", e)
await asyncio.sleep(BATCH_STATUS_POLL_INTERVAL_SECONDS)
async def _handle_completed_batch( async def _handle_completed_batch(
batch: Batch, batch: Batch,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"], custom_llm_provider: Literal["openai", "azure", "vertex_ai"],
logging_obj: LiteLLMLoggingObj, ) -> Tuple[float, Usage, List[str]]:
start_time: datetime.datetime,
end_time: datetime.datetime,
**kwargs,
) -> None:
"""Helper function to process a completed batch and handle logging""" """Helper function to process a completed batch and handle logging"""
# Get batch results # Get batch results
file_content_dictionary = await _get_batch_output_file_content_as_dictionary( file_content_dictionary = await _get_batch_output_file_content_as_dictionary(
@ -87,49 +27,25 @@ async def _handle_completed_batch(
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
) )
# Handle logging batch_models = _get_batch_models_from_file_content(file_content_dictionary)
await _log_completed_batch(
logging_obj=logging_obj, return batch_cost, batch_usage, batch_models
batch_usage=batch_usage,
batch_cost=batch_cost,
start_time=start_time,
end_time=end_time,
**kwargs,
)
async def _log_completed_batch( def _get_batch_models_from_file_content(
logging_obj: LiteLLMLoggingObj, file_content_dictionary: List[dict],
batch_usage: Usage, ) -> List[str]:
batch_cost: float, """
start_time: datetime.datetime, Get the models from the file content
end_time: datetime.datetime, """
**kwargs, batch_models = []
) -> None: for _item in file_content_dictionary:
"""Helper function to handle all logging operations for a completed batch""" if _batch_response_was_successful(_item):
logging_obj.call_type = "batch_success" _response_body = _get_response_from_batch_job_output_file(_item)
_model = _response_body.get("model")
standard_logging_object = _create_standard_logging_object_for_completed_batch( if _model:
kwargs=kwargs, batch_models.append(_model)
start_time=start_time, return batch_models
end_time=end_time,
logging_obj=logging_obj,
batch_usage_object=batch_usage,
response_cost=batch_cost,
)
logging_obj.model_call_details["standard_logging_object"] = standard_logging_object
# Launch async and sync logging handlers
asyncio.create_task(
logging_obj.async_success_handler(
result=None,
start_time=start_time,
end_time=end_time,
cache_hit=None,
)
)
logging_obj.success_handler(None, start_time, end_time)
async def _batch_cost_calculator( async def _batch_cost_calculator(
@ -156,6 +72,8 @@ async def _get_batch_output_file_content_as_dictionary(
""" """
Get the batch output file content as a list of dictionaries Get the batch output file content as a list of dictionaries
""" """
from litellm.files.main import afile_content
if custom_llm_provider == "vertex_ai": if custom_llm_provider == "vertex_ai":
raise ValueError("Vertex AI does not support file content retrieval") raise ValueError("Vertex AI does not support file content retrieval")
@ -205,6 +123,7 @@ def _get_batch_job_cost_from_file_content(
total_cost += litellm.completion_cost( total_cost += litellm.completion_cost(
completion_response=_response_body, completion_response=_response_body,
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
call_type=CallTypes.aretrieve_batch.value,
) )
verbose_logger.debug("total_cost=%s", total_cost) verbose_logger.debug("total_cost=%s", total_cost)
return total_cost return total_cost
@ -261,30 +180,3 @@ def _batch_response_was_successful(batch_job_output_file: dict) -> bool:
""" """
_response: dict = batch_job_output_file.get("response", None) or {} _response: dict = batch_job_output_file.get("response", None) or {}
return _response.get("status_code", None) == 200 return _response.get("status_code", None) == 200
def _create_standard_logging_object_for_completed_batch(
kwargs: dict,
start_time: datetime.datetime,
end_time: datetime.datetime,
logging_obj: LiteLLMLoggingObj,
batch_usage_object: Usage,
response_cost: float,
) -> StandardLoggingPayload:
"""
Create a standard logging object for a completed batch
"""
standard_logging_object = logging_obj.model_call_details.get(
"standard_logging_object", None
)
if standard_logging_object is None:
raise ValueError("unable to create standard logging object for completed batch")
# Add Completed Batch Job Usage and Response Cost
standard_logging_object["call_type"] = "batch_success"
standard_logging_object["response_cost"] = response_cost
standard_logging_object["total_tokens"] = batch_usage_object.total_tokens
standard_logging_object["prompt_tokens"] = batch_usage_object.prompt_tokens
standard_logging_object["completion_tokens"] = batch_usage_object.completion_tokens
return standard_logging_object

View file

@ -31,10 +31,9 @@ from litellm.types.llms.openai import (
RetrieveBatchRequest, RetrieveBatchRequest,
) )
from litellm.types.router import GenericLiteLLMParams from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import LiteLLMBatch
from litellm.utils import client, get_litellm_params, supports_httpx_timeout from litellm.utils import client, get_litellm_params, supports_httpx_timeout
from .batch_utils import batches_async_logging
####### ENVIRONMENT VARIABLES ################### ####### ENVIRONMENT VARIABLES ###################
openai_batches_instance = OpenAIBatchesAPI() openai_batches_instance = OpenAIBatchesAPI()
azure_batches_instance = AzureBatchesAPI() azure_batches_instance = AzureBatchesAPI()
@ -85,17 +84,6 @@ async def acreate_batch(
else: else:
response = init_response response = init_response
# Start async logging job
if response is not None:
asyncio.create_task(
batches_async_logging(
logging_obj=kwargs.get("litellm_logging_obj", None),
batch_id=response.id,
custom_llm_provider=custom_llm_provider,
**kwargs,
)
)
return response return response
except Exception as e: except Exception as e:
raise e raise e
@ -111,7 +99,7 @@ def create_batch(
extra_headers: Optional[Dict[str, str]] = None, extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None, extra_body: Optional[Dict[str, str]] = None,
**kwargs, **kwargs,
) -> Union[Batch, Coroutine[Any, Any, Batch]]: ) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
""" """
Creates and executes a batch from an uploaded file of request Creates and executes a batch from an uploaded file of request
@ -119,21 +107,27 @@ def create_batch(
""" """
try: try:
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_call_id = kwargs.get("litellm_call_id", None)
proxy_server_request = kwargs.get("proxy_server_request", None)
model_info = kwargs.get("model_info", None)
_is_async = kwargs.pop("acreate_batch", False) is True _is_async = kwargs.pop("acreate_batch", False) is True
litellm_params = get_litellm_params(**kwargs)
litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None) litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
litellm_params = get_litellm_params(
custom_llm_provider=custom_llm_provider,
litellm_call_id=kwargs.get("litellm_call_id", None),
litellm_trace_id=kwargs.get("litellm_trace_id"),
litellm_metadata=kwargs.get("litellm_metadata"),
)
litellm_logging_obj.update_environment_variables( litellm_logging_obj.update_environment_variables(
model=None, model=None,
user=None, user=None,
optional_params=optional_params.model_dump(), optional_params=optional_params.model_dump(),
litellm_params=litellm_params, litellm_params={
"litellm_call_id": litellm_call_id,
"proxy_server_request": proxy_server_request,
"model_info": model_info,
"metadata": metadata,
"preset_cache_key": None,
"stream_response": {},
**optional_params.model_dump(exclude_unset=True),
},
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
) )
@ -224,6 +218,7 @@ def create_batch(
timeout=timeout, timeout=timeout,
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
create_batch_data=_create_batch_request, create_batch_data=_create_batch_request,
litellm_params=litellm_params,
) )
elif custom_llm_provider == "vertex_ai": elif custom_llm_provider == "vertex_ai":
api_base = optional_params.api_base or "" api_base = optional_params.api_base or ""
@ -261,7 +256,7 @@ def create_batch(
response=httpx.Response( response=httpx.Response(
status_code=400, status_code=400,
content="Unsupported provider", content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore request=httpx.Request(method="create_batch", url="https://github.com/BerriAI/litellm"), # type: ignore
), ),
) )
return response return response
@ -269,6 +264,7 @@ def create_batch(
raise e raise e
@client
async def aretrieve_batch( async def aretrieve_batch(
batch_id: str, batch_id: str,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai", custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
@ -276,7 +272,7 @@ async def aretrieve_batch(
extra_headers: Optional[Dict[str, str]] = None, extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None, extra_body: Optional[Dict[str, str]] = None,
**kwargs, **kwargs,
) -> Batch: ) -> LiteLLMBatch:
""" """
Async: Retrieves a batch. Async: Retrieves a batch.
@ -310,6 +306,7 @@ async def aretrieve_batch(
raise e raise e
@client
def retrieve_batch( def retrieve_batch(
batch_id: str, batch_id: str,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai", custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
@ -317,7 +314,7 @@ def retrieve_batch(
extra_headers: Optional[Dict[str, str]] = None, extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None, extra_body: Optional[Dict[str, str]] = None,
**kwargs, **kwargs,
) -> Union[Batch, Coroutine[Any, Any, Batch]]: ) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
""" """
Retrieves a batch. Retrieves a batch.
@ -325,9 +322,20 @@ def retrieve_batch(
""" """
try: try:
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default litellm_params = get_litellm_params(
custom_llm_provider=custom_llm_provider,
**kwargs,
)
litellm_logging_obj.update_environment_variables(
model=None,
user=None,
optional_params=optional_params.model_dump(),
litellm_params=litellm_params,
custom_llm_provider=custom_llm_provider,
)
if ( if (
timeout is not None timeout is not None
@ -415,6 +423,7 @@ def retrieve_batch(
timeout=timeout, timeout=timeout,
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
retrieve_batch_data=_retrieve_batch_request, retrieve_batch_data=_retrieve_batch_request,
litellm_params=litellm_params,
) )
elif custom_llm_provider == "vertex_ai": elif custom_llm_provider == "vertex_ai":
api_base = optional_params.api_base or "" api_base = optional_params.api_base or ""
@ -517,6 +526,10 @@ def list_batches(
try: try:
# set API KEY # set API KEY
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params = get_litellm_params(
custom_llm_provider=custom_llm_provider,
**kwargs,
)
api_key = ( api_key = (
optional_params.api_key optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
@ -594,6 +607,7 @@ def list_batches(
api_version=api_version, api_version=api_version,
timeout=timeout, timeout=timeout,
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
litellm_params=litellm_params,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
@ -669,6 +683,10 @@ def cancel_batch(
""" """
try: try:
optional_params = GenericLiteLLMParams(**kwargs) optional_params = GenericLiteLLMParams(**kwargs)
litellm_params = get_litellm_params(
custom_llm_provider=custom_llm_provider,
**kwargs,
)
### TIMEOUT LOGIC ### ### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600 timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default # set timeout for 10 minutes by default
@ -756,6 +774,7 @@ def cancel_batch(
timeout=timeout, timeout=timeout,
max_retries=optional_params.max_retries, max_retries=optional_params.max_retries,
cancel_batch_data=_cancel_batch_request, cancel_batch_data=_cancel_batch_request,
litellm_params=litellm_params,
) )
else: else:
raise litellm.exceptions.BadRequestError( raise litellm.exceptions.BadRequestError(
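For context, a minimal usage sketch of the batch entry points whose plumbing is updated above. The top-level `litellm.retrieve_batch` / `litellm.list_batches` / `litellm.cancel_batch` wrappers, the placeholder batch id, and the `limit` argument are assumptions for illustration, not values taken from this diff:

```python
import litellm  # assumes OPENAI_API_KEY is set in the environment

# Retrieve a batch (placeholder id); request_timeout mirrors the 10-minute
# default applied in the timeout logic above.
batch = litellm.retrieve_batch(
    batch_id="batch_abc123",
    custom_llm_provider="openai",
    request_timeout=600,
)
print(batch.status)

# List and cancel follow the same provider-routing pattern shown above.
batches = litellm.list_batches(custom_llm_provider="openai", limit=10)
cancelled = litellm.cancel_batch(batch_id="batch_abc123", custom_llm_provider="openai")
```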
View file
@ -13,26 +13,14 @@ import json
import time import time
import traceback import traceback
from enum import Enum from enum import Enum
from typing import Any, Dict, List, Optional, Set, Union from typing import Any, Dict, List, Optional, Union
from openai.types.audio.transcription_create_params import TranscriptionCreateParams
from openai.types.chat.completion_create_params import (
CompletionCreateParamsNonStreaming,
CompletionCreateParamsStreaming,
)
from openai.types.completion_create_params import (
CompletionCreateParamsNonStreaming as TextCompletionCreateParamsNonStreaming,
)
from openai.types.completion_create_params import (
CompletionCreateParamsStreaming as TextCompletionCreateParamsStreaming,
)
from openai.types.embedding_create_params import EmbeddingCreateParams
from pydantic import BaseModel from pydantic import BaseModel
import litellm import litellm
from litellm._logging import verbose_logger from litellm._logging import verbose_logger
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.types.caching import * from litellm.types.caching import *
from litellm.types.rerank import RerankRequest
from litellm.types.utils import all_litellm_params from litellm.types.utils import all_litellm_params
from .base_cache import BaseCache from .base_cache import BaseCache
@ -257,7 +245,7 @@ class Cache:
verbose_logger.debug("\nReturning preset cache key: %s", preset_cache_key) verbose_logger.debug("\nReturning preset cache key: %s", preset_cache_key)
return preset_cache_key return preset_cache_key
combined_kwargs = self._get_relevant_args_to_use_for_cache_key() combined_kwargs = ModelParamHelper._get_all_llm_api_params()
litellm_param_kwargs = all_litellm_params litellm_param_kwargs = all_litellm_params
for param in kwargs: for param in kwargs:
if param in combined_kwargs: if param in combined_kwargs:
@ -364,76 +352,6 @@ class Cache:
if "litellm_params" in kwargs: if "litellm_params" in kwargs:
kwargs["litellm_params"]["preset_cache_key"] = preset_cache_key kwargs["litellm_params"]["preset_cache_key"] = preset_cache_key
def _get_relevant_args_to_use_for_cache_key(self) -> Set[str]:
"""
Gets the supported kwargs for each call type and combines them
"""
chat_completion_kwargs = self._get_litellm_supported_chat_completion_kwargs()
text_completion_kwargs = self._get_litellm_supported_text_completion_kwargs()
embedding_kwargs = self._get_litellm_supported_embedding_kwargs()
transcription_kwargs = self._get_litellm_supported_transcription_kwargs()
rerank_kwargs = self._get_litellm_supported_rerank_kwargs()
exclude_kwargs = self._get_kwargs_to_exclude_from_cache_key()
combined_kwargs = chat_completion_kwargs.union(
text_completion_kwargs,
embedding_kwargs,
transcription_kwargs,
rerank_kwargs,
)
combined_kwargs = combined_kwargs.difference(exclude_kwargs)
return combined_kwargs
def _get_litellm_supported_chat_completion_kwargs(self) -> Set[str]:
"""
Get the litellm supported chat completion kwargs
This follows the OpenAI API Spec
"""
all_chat_completion_kwargs = set(
CompletionCreateParamsNonStreaming.__annotations__.keys()
).union(set(CompletionCreateParamsStreaming.__annotations__.keys()))
return all_chat_completion_kwargs
def _get_litellm_supported_text_completion_kwargs(self) -> Set[str]:
"""
Get the litellm supported text completion kwargs
This follows the OpenAI API Spec
"""
all_text_completion_kwargs = set(
TextCompletionCreateParamsNonStreaming.__annotations__.keys()
).union(set(TextCompletionCreateParamsStreaming.__annotations__.keys()))
return all_text_completion_kwargs
def _get_litellm_supported_rerank_kwargs(self) -> Set[str]:
"""
Get the litellm supported rerank kwargs
"""
return set(RerankRequest.model_fields.keys())
def _get_litellm_supported_embedding_kwargs(self) -> Set[str]:
"""
Get the litellm supported embedding kwargs
This follows the OpenAI API Spec
"""
return set(EmbeddingCreateParams.__annotations__.keys())
def _get_litellm_supported_transcription_kwargs(self) -> Set[str]:
"""
Get the litellm supported transcription kwargs
This follows the OpenAI API Spec
"""
return set(TranscriptionCreateParams.__annotations__.keys())
def _get_kwargs_to_exclude_from_cache_key(self) -> Set[str]:
"""
Get the kwargs to exclude from the cache key
"""
return set(["metadata"])
@staticmethod @staticmethod
def _get_hashed_cache_key(cache_key: str) -> str: def _get_hashed_cache_key(cache_key: str) -> str:
""" """
View file
@ -247,7 +247,6 @@ class LLMCachingHandler:
pass pass
else: else:
call_type = original_function.__name__ call_type = original_function.__name__
cached_result = self._convert_cached_result_to_model_response( cached_result = self._convert_cached_result_to_model_response(
cached_result=cached_result, cached_result=cached_result,
call_type=call_type, call_type=call_type,
@ -719,6 +718,7 @@ class LLMCachingHandler:
""" """
Sync internal method to add the result to the cache Sync internal method to add the result to the cache
""" """
new_kwargs = kwargs.copy() new_kwargs = kwargs.copy()
new_kwargs.update( new_kwargs.update(
convert_args_to_kwargs( convert_args_to_kwargs(
@ -732,6 +732,7 @@ class LLMCachingHandler:
if self._should_store_result_in_cache( if self._should_store_result_in_cache(
original_function=self.original_function, kwargs=new_kwargs original_function=self.original_function, kwargs=new_kwargs
): ):
litellm.cache.add_cache(result, **new_kwargs) litellm.cache.add_cache(result, **new_kwargs)
return return
@ -783,6 +784,7 @@ class LLMCachingHandler:
- Else append the chunk to self.async_streaming_chunks - Else append the chunk to self.async_streaming_chunks
""" """
complete_streaming_response: Optional[ complete_streaming_response: Optional[
Union[ModelResponse, TextCompletionResponse] Union[ModelResponse, TextCompletionResponse]
] = _assemble_complete_response_from_streaming_chunks( ] = _assemble_complete_response_from_streaming_chunks(
@ -793,7 +795,6 @@ class LLMCachingHandler:
streaming_chunks=self.async_streaming_chunks, streaming_chunks=self.async_streaming_chunks,
is_async=True, is_async=True,
) )
# if a complete_streaming_response is assembled, add it to the cache # if a complete_streaming_response is assembled, add it to the cache
if complete_streaming_response is not None: if complete_streaming_response is not None:
await self.async_set_cache( await self.async_set_cache(
View file
@ -0,0 +1,40 @@
"""
Add the event loop to the cache key, to prevent event loop closed errors.
"""
import asyncio
from .in_memory_cache import InMemoryCache
class LLMClientCache(InMemoryCache):
def update_cache_key_with_event_loop(self, key):
"""
Add the event loop to the cache key, to prevent event loop closed errors.
If none, use the key as is.
"""
try:
event_loop = asyncio.get_event_loop()
stringified_event_loop = str(id(event_loop))
return f"{key}-{stringified_event_loop}"
except Exception: # handle no current event loop
return key
def set_cache(self, key, value, **kwargs):
key = self.update_cache_key_with_event_loop(key)
return super().set_cache(key, value, **kwargs)
async def async_set_cache(self, key, value, **kwargs):
key = self.update_cache_key_with_event_loop(key)
return await super().async_set_cache(key, value, **kwargs)
def get_cache(self, key, **kwargs):
key = self.update_cache_key_with_event_loop(key)
return super().get_cache(key, **kwargs)
async def async_get_cache(self, key, **kwargs):
key = self.update_cache_key_with_event_loop(key)
return await super().async_get_cache(key, **kwargs)
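A short sketch of why keying cached clients by the running event loop avoids "event loop is closed" errors. The import path is an assumption, since the new file's name is not shown in this diff:

```python
import asyncio

# Assumed module path -- the hunk above defines LLMClientCache but the filename
# is not visible in this diff.
from litellm.caching.llm_caching_handler import LLMClientCache

cache = LLMClientCache()

async def get_client():
    client = cache.get_cache("openai-client")
    if client is None:
        client = object()  # stand-in for an async HTTP client bound to this loop
        cache.set_cache("openai-client", client)
    return client

# Each asyncio.run() spins up a fresh loop, so each loop stores its client under
# "openai-client-<id(loop)>" and does not pick up a client created for a
# different, possibly closed, loop.
asyncio.run(get_client())
asyncio.run(get_client())
```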
View file
@ -54,6 +54,7 @@ class RedisCache(BaseCache):
redis_flush_size: Optional[int] = 100, redis_flush_size: Optional[int] = 100,
namespace: Optional[str] = None, namespace: Optional[str] = None,
startup_nodes: Optional[List] = None, # for redis-cluster startup_nodes: Optional[List] = None, # for redis-cluster
socket_timeout: Optional[float] = 5.0, # default 5 second timeout
**kwargs, **kwargs,
): ):
@ -70,6 +71,9 @@ class RedisCache(BaseCache):
redis_kwargs["password"] = password redis_kwargs["password"] = password
if startup_nodes is not None: if startup_nodes is not None:
redis_kwargs["startup_nodes"] = startup_nodes redis_kwargs["startup_nodes"] = startup_nodes
if socket_timeout is not None:
redis_kwargs["socket_timeout"] = socket_timeout
### HEALTH MONITORING OBJECT ### ### HEALTH MONITORING OBJECT ###
if kwargs.get("service_logger_obj", None) is not None and isinstance( if kwargs.get("service_logger_obj", None) is not None and isinstance(
kwargs["service_logger_obj"], ServiceLogging kwargs["service_logger_obj"], ServiceLogging
@ -543,6 +547,7 @@ class RedisCache(BaseCache):
_redis_client: Redis = self.init_async_client() # type: ignore _redis_client: Redis = self.init_async_client() # type: ignore
start_time = time.time() start_time = time.time()
_used_ttl = self.get_ttl(ttl=ttl) _used_ttl = self.get_ttl(ttl=ttl)
key = self.check_and_fix_namespace(key=key)
try: try:
result = await _redis_client.incrbyfloat(name=key, amount=value) result = await _redis_client.incrbyfloat(name=key, amount=value)
if _used_ttl is not None: if _used_ttl is not None:
@ -555,6 +560,7 @@ class RedisCache(BaseCache):
## LOGGING ## ## LOGGING ##
end_time = time.time() end_time = time.time()
_duration = end_time - start_time _duration = end_time - start_time
asyncio.create_task( asyncio.create_task(
self.service_logger_obj.async_service_success_hook( self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS, service=ServiceTypes.REDIS,
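A configuration sketch for the new `socket_timeout` default. Forwarding the parameter through `litellm.Cache(type="redis", ...)` is an assumption here; only the `RedisCache` constructor change is shown above:

```python
import litellm

# Illustrative only: give the Redis-backed cache an explicit socket timeout so a
# hung Redis connection fails fast instead of blocking the request path.
litellm.cache = litellm.Cache(
    type="redis",
    host="localhost",      # placeholder connection details
    port=6379,
    password="my-password",
    socket_timeout=5.0,    # matches the new default added above
)
```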
View file
@ -1,4 +1,4 @@
from typing import List from typing import List, Literal
ROUTER_MAX_FALLBACKS = 5 ROUTER_MAX_FALLBACKS = 5
DEFAULT_BATCH_SIZE = 512 DEFAULT_BATCH_SIZE = 512
@ -18,6 +18,7 @@ SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives. REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
#### Networking settings #### #### Networking settings ####
request_timeout: float = 6000 # time in seconds request_timeout: float = 6000 # time in seconds
STREAM_SSE_DONE_STRING: str = "[DONE]"
LITELLM_CHAT_PROVIDERS = [ LITELLM_CHAT_PROVIDERS = [
"openai", "openai",
@ -320,6 +321,17 @@ baseten_models: List = [
"31dxrj3", "31dxrj3",
] # FALCON 7B # WizardLM # Mosaic ML ] # FALCON 7B # WizardLM # Mosaic ML
BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
"cohere",
"anthropic",
"mistral",
"amazon",
"meta",
"llama",
"ai21",
"nova",
"deepseek_r1",
]
open_ai_embedding_models: List = ["text-embedding-ada-002"] open_ai_embedding_models: List = ["text-embedding-ada-002"]
cohere_embedding_models: List = [ cohere_embedding_models: List = [
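For reference, the new `BEDROCK_INVOKE_PROVIDERS_LITERAL` can back both static type narrowing and a cheap runtime membership check; the helper below is illustrative and not part of this diff:

```python
from typing import Literal, get_args

# Mirrors the constant added above.
BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
    "cohere", "anthropic", "mistral", "amazon", "meta",
    "llama", "ai21", "nova", "deepseek_r1",
]

def is_bedrock_invoke_provider(value: str) -> bool:
    # get_args() yields the tuple of strings allowed by the Literal.
    return value in get_args(BEDROCK_INVOKE_PROVIDERS_LITERAL)

assert is_bedrock_invoke_provider("anthropic")
assert not is_bedrock_invoke_provider("openai")
```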
View file
@ -44,7 +44,12 @@ from litellm.llms.vertex_ai.cost_calculator import cost_router as google_cost_ro
from litellm.llms.vertex_ai.image_generation.cost_calculator import ( from litellm.llms.vertex_ai.image_generation.cost_calculator import (
cost_calculator as vertex_ai_image_cost_calculator, cost_calculator as vertex_ai_image_cost_calculator,
) )
from litellm.types.llms.openai import HttpxBinaryResponseContent from litellm.responses.utils import ResponseAPILoggingUtils
from litellm.types.llms.openai import (
HttpxBinaryResponseContent,
ResponseAPIUsage,
ResponsesAPIResponse,
)
from litellm.types.rerank import RerankBilledUnits, RerankResponse from litellm.types.rerank import RerankBilledUnits, RerankResponse
from litellm.types.utils import ( from litellm.types.utils import (
CallTypesLiteral, CallTypesLiteral,
@ -239,6 +244,15 @@ def cost_per_token( # noqa: PLR0915
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
billed_units=rerank_billed_units, billed_units=rerank_billed_units,
) )
elif (
call_type == "aretrieve_batch"
or call_type == "retrieve_batch"
or call_type == CallTypes.aretrieve_batch
or call_type == CallTypes.retrieve_batch
):
return batch_cost_calculator(
usage=usage_block, model=model, custom_llm_provider=custom_llm_provider
)
elif call_type == "atranscription" or call_type == "transcription": elif call_type == "atranscription" or call_type == "transcription":
return openai_cost_per_second( return openai_cost_per_second(
model=model, model=model,
@ -399,9 +413,12 @@ def _select_model_name_for_cost_calc(
if base_model is not None: if base_model is not None:
return_model = base_model return_model = base_model
completion_response_model: Optional[str] = getattr( completion_response_model: Optional[str] = None
completion_response, "model", None if completion_response is not None:
) if isinstance(completion_response, BaseModel):
completion_response_model = getattr(completion_response, "model", None)
elif isinstance(completion_response, dict):
completion_response_model = completion_response.get("model", None)
hidden_params: Optional[dict] = getattr(completion_response, "_hidden_params", None) hidden_params: Optional[dict] = getattr(completion_response, "_hidden_params", None)
if completion_response_model is None and hidden_params is not None: if completion_response_model is None and hidden_params is not None:
if ( if (
@ -452,6 +469,13 @@ def _get_usage_object(
return usage_obj return usage_obj
def _is_known_usage_objects(usage_obj):
"""Returns True if the usage obj is a known Usage type"""
return isinstance(usage_obj, litellm.Usage) or isinstance(
usage_obj, ResponseAPIUsage
)
def _infer_call_type( def _infer_call_type(
call_type: Optional[CallTypesLiteral], completion_response: Any call_type: Optional[CallTypesLiteral], completion_response: Any
) -> Optional[CallTypesLiteral]: ) -> Optional[CallTypesLiteral]:
@ -561,9 +585,7 @@ def completion_cost( # noqa: PLR0915
base_model=base_model, base_model=base_model,
) )
verbose_logger.debug( verbose_logger.info(f"selected model name for cost calculation: {model}")
f"completion_response _select_model_name_for_cost_calc: {model}"
)
if completion_response is not None and ( if completion_response is not None and (
isinstance(completion_response, BaseModel) isinstance(completion_response, BaseModel)
@ -575,8 +597,8 @@ def completion_cost( # noqa: PLR0915
) )
else: else:
usage_obj = getattr(completion_response, "usage", {}) usage_obj = getattr(completion_response, "usage", {})
if isinstance(usage_obj, BaseModel) and not isinstance( if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects(
usage_obj, litellm.Usage usage_obj=usage_obj
): ):
setattr( setattr(
completion_response, completion_response,
@ -589,6 +611,14 @@ def completion_cost( # noqa: PLR0915
_usage = usage_obj.model_dump() _usage = usage_obj.model_dump()
else: else:
_usage = usage_obj _usage = usage_obj
if ResponseAPILoggingUtils._is_response_api_usage(_usage):
_usage = (
ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
_usage
).model_dump()
)
# get input/output tokens from completion_response # get input/output tokens from completion_response
prompt_tokens = _usage.get("prompt_tokens", 0) prompt_tokens = _usage.get("prompt_tokens", 0)
completion_tokens = _usage.get("completion_tokens", 0) completion_tokens = _usage.get("completion_tokens", 0)
@ -778,6 +808,23 @@ def completion_cost( # noqa: PLR0915
raise e raise e
def get_response_cost_from_hidden_params(
hidden_params: Union[dict, BaseModel]
) -> Optional[float]:
if isinstance(hidden_params, BaseModel):
_hidden_params_dict = hidden_params.model_dump()
else:
_hidden_params_dict = hidden_params
additional_headers = _hidden_params_dict.get("additional_headers", {})
if additional_headers and "x-litellm-response-cost" in additional_headers:
response_cost = additional_headers["x-litellm-response-cost"]
if response_cost is None:
return None
return float(additional_headers["x-litellm-response-cost"])
return None
def response_cost_calculator( def response_cost_calculator(
response_object: Union[ response_object: Union[
ModelResponse, ModelResponse,
@ -787,6 +834,7 @@ def response_cost_calculator(
TextCompletionResponse, TextCompletionResponse,
HttpxBinaryResponseContent, HttpxBinaryResponseContent,
RerankResponse, RerankResponse,
ResponsesAPIResponse,
], ],
model: str, model: str,
custom_llm_provider: Optional[str], custom_llm_provider: Optional[str],
@ -813,7 +861,7 @@ def response_cost_calculator(
base_model: Optional[str] = None, base_model: Optional[str] = None,
custom_pricing: Optional[bool] = None, custom_pricing: Optional[bool] = None,
prompt: str = "", prompt: str = "",
) -> Optional[float]: ) -> float:
""" """
Returns Returns
- float or None: cost of response - float or None: cost of response
@ -825,6 +873,14 @@ def response_cost_calculator(
else: else:
if isinstance(response_object, BaseModel): if isinstance(response_object, BaseModel):
response_object._hidden_params["optional_params"] = optional_params response_object._hidden_params["optional_params"] = optional_params
if hasattr(response_object, "_hidden_params"):
provider_response_cost = get_response_cost_from_hidden_params(
response_object._hidden_params
)
if provider_response_cost is not None:
return provider_response_cost
response_cost = completion_cost( response_cost = completion_cost(
completion_response=response_object, completion_response=response_object,
model=model, model=model,
@ -957,3 +1013,54 @@ def default_image_cost_calculator(
) )
return cost_info["input_cost_per_pixel"] * height * width * n return cost_info["input_cost_per_pixel"] * height * width * n
def batch_cost_calculator(
usage: Usage,
model: str,
custom_llm_provider: Optional[str] = None,
) -> Tuple[float, float]:
"""
Calculate the cost of a batch job
"""
_, custom_llm_provider, _, _ = litellm.get_llm_provider(
model=model, custom_llm_provider=custom_llm_provider
)
verbose_logger.info(
"Calculating batch cost per token. model=%s, custom_llm_provider=%s",
model,
custom_llm_provider,
)
try:
model_info: Optional[ModelInfo] = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
except Exception:
model_info = None
if not model_info:
return 0.0, 0.0
input_cost_per_token_batches = model_info.get("input_cost_per_token_batches")
input_cost_per_token = model_info.get("input_cost_per_token")
output_cost_per_token_batches = model_info.get("output_cost_per_token_batches")
output_cost_per_token = model_info.get("output_cost_per_token")
total_prompt_cost = 0.0
total_completion_cost = 0.0
if input_cost_per_token_batches:
total_prompt_cost = usage.prompt_tokens * input_cost_per_token_batches
elif input_cost_per_token:
total_prompt_cost = (
usage.prompt_tokens * (input_cost_per_token) / 2
) # batch cost is usually half of the regular token cost
if output_cost_per_token_batches:
total_completion_cost = usage.completion_tokens * output_cost_per_token_batches
elif output_cost_per_token:
total_completion_cost = (
usage.completion_tokens * (output_cost_per_token) / 2
) # batch cost is usually half of the regular token cost
return total_prompt_cost, total_completion_cost
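A worked example of the halving fallback in `batch_cost_calculator` above (illustrative prices and token counts; real per-token rates come from `litellm.get_model_info`):

```python
# Model with no dedicated *_batches pricing:
#   input_cost_per_token = 2.0e-06, output_cost_per_token = 6.0e-06
# Batch job usage: 10,000 prompt tokens, 2,000 completion tokens.
prompt_tokens, completion_tokens = 10_000, 2_000
input_cost_per_token, output_cost_per_token = 2.0e-06, 6.0e-06

total_prompt_cost = prompt_tokens * input_cost_per_token / 2            # ~0.01
total_completion_cost = completion_tokens * output_cost_per_token / 2   # ~0.006

print(total_prompt_cost, total_completion_cost)  # roughly (0.01, 0.006)
```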
Some files were not shown because too many files have changed in this diff.