Merge branch 'main' into stevefarthing/bing-search-pass-thru

Commit dbfb7ebdaf by Steve Farthing, 2025-03-11 08:06:56 -04:00, committed by GitHub (GPG key ID: B5690EEEBB952194; no known key found for this signature in the database).
741 changed files with 66437 additions and 15378 deletions

View file

@ -1,6 +1,8 @@
version: 2.1
orbs:
codecov: codecov/codecov@4.0.1
node: circleci/node@5.1.0 # declare the node orb
jobs:
local_testing:
@ -70,6 +72,7 @@ jobs:
pip install "jsonschema==4.22.0"
pip install "pytest-xdist==3.6.1"
pip install "websockets==10.4"
pip uninstall posthog -y
- save_cache:
paths:
- ./venv
@ -415,6 +418,56 @@ jobs:
paths:
- litellm_router_coverage.xml
- litellm_router_coverage
litellm_proxy_security_tests:
docker:
- image: cimg/python:3.11
auth:
username: ${DOCKERHUB_USERNAME}
password: ${DOCKERHUB_PASSWORD}
working_directory: ~/project
steps:
- checkout
- run:
name: Show git commit hash
command: |
echo "Git commit hash: $CIRCLE_SHA1"
- run:
name: Install Dependencies
command: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
pip install "pytest-asyncio==0.21.1"
pip install "pytest-cov==5.0.0"
- run:
name: Run prisma ./docker/entrypoint.sh
command: |
set +e
chmod +x docker/entrypoint.sh
./docker/entrypoint.sh
set -e
# Run pytest and generate JUnit XML report
- run:
name: Run tests
command: |
pwd
ls
python -m pytest tests/proxy_security_tests --cov=litellm --cov-report=xml -vv -x -v --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
- run:
name: Rename the coverage files
command: |
mv coverage.xml litellm_proxy_security_tests_coverage.xml
mv .coverage litellm_proxy_security_tests_coverage
# Store test results
- store_test_results:
path: test-results
- persist_to_workspace:
root: .
paths:
- litellm_proxy_security_tests_coverage.xml
- litellm_proxy_security_tests_coverage
litellm_proxy_unit_testing: # Runs all tests with the "proxy", "key", "jwt" filenames
docker:
- image: cimg/python:3.11
@ -625,6 +678,50 @@ jobs:
paths:
- llm_translation_coverage.xml
- llm_translation_coverage
litellm_mapped_tests:
docker:
- image: cimg/python:3.11
auth:
username: ${DOCKERHUB_USERNAME}
password: ${DOCKERHUB_PASSWORD}
working_directory: ~/project
steps:
- checkout
- run:
name: Install Dependencies
command: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install "pytest-mock==3.12.0"
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
pip install "pytest-cov==5.0.0"
pip install "pytest-asyncio==0.21.1"
pip install "respx==0.21.1"
pip install "hypercorn==0.17.3"
# Run pytest and generate JUnit XML report
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/litellm --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
- run:
name: Rename the coverage files
command: |
mv coverage.xml litellm_mapped_tests_coverage.xml
mv .coverage litellm_mapped_tests_coverage
# Store test results
- store_test_results:
path: test-results
- persist_to_workspace:
root: .
paths:
- litellm_mapped_tests_coverage.xml
- litellm_mapped_tests_coverage
batches_testing:
docker:
- image: cimg/python:3.11
@ -691,6 +788,7 @@ jobs:
pip install "pytest-cov==5.0.0"
pip install "google-generativeai==0.3.2"
pip install "google-cloud-aiplatform==1.43.0"
pip install numpydoc
# Run pytest and generate JUnit XML report
- run:
name: Run tests
@ -986,21 +1084,26 @@ jobs:
pip install ruff
pip install pylint
pip install pyright
pip install beautifulsoup4
pip install .
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- run: python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
- run: ruff check ./litellm
# - run: python ./tests/documentation_tests/test_general_setting_keys.py
- run: python ./tests/code_coverage_tests/check_licenses.py
- run: python ./tests/code_coverage_tests/router_code_coverage.py
- run: python ./tests/code_coverage_tests/callback_manager_test.py
- run: python ./tests/code_coverage_tests/recursive_detector.py
- run: python ./tests/code_coverage_tests/test_router_strategy_async.py
- run: python ./tests/code_coverage_tests/litellm_logging_code_coverage.py
- run: python ./tests/code_coverage_tests/bedrock_pricing.py
- run: python ./tests/documentation_tests/test_env_keys.py
- run: python ./tests/documentation_tests/test_router_settings.py
- run: python ./tests/documentation_tests/test_api_docs.py
- run: python ./tests/code_coverage_tests/ensure_async_clients_test.py
- run: python ./tests/code_coverage_tests/enforce_llms_folder_style.py
- run: python ./tests/documentation_tests/test_circular_imports.py
- run: python ./tests/code_coverage_tests/prevent_key_leaks_in_exceptions.py
- run: helm lint ./deploy/charts/litellm-helm
db_migration_disable_update_check:
@ -1010,6 +1113,23 @@ jobs:
working_directory: ~/project
steps:
- checkout
- run:
name: Install Python 3.9
command: |
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda init bash
source ~/.bashrc
conda create -n myenv python=3.9 -y
conda activate myenv
python --version
- run:
name: Install Dependencies
command: |
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
- run:
name: Build Docker image
command: |
@ -1017,29 +1137,48 @@ jobs:
- run:
name: Run Docker container
command: |
docker run --name my-app \
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e DISABLE_SCHEMA_UPDATE="True" \
-v $(pwd)/litellm/proxy/example_config_yaml/bad_schema.prisma:/app/schema.prisma \
-v $(pwd)/litellm/proxy/example_config_yaml/bad_schema.prisma:/app/litellm/proxy/schema.prisma \
-v $(pwd)/litellm/proxy/example_config_yaml/disable_schema_update.yaml:/app/config.yaml \
--name my-app \
myapp:latest \
--config /app/config.yaml \
--port 4000 > docker_output.log 2>&1 || true
--port 4000
- run:
name: Display Docker logs
command: cat docker_output.log
- run:
name: Check for expected error
name: Install curl and dockerize
command: |
if grep -q "prisma schema out of sync with db. Consider running these sql_commands to sync the two" docker_output.log; then
echo "Expected error found. Test passed."
sudo apt-get update
sudo apt-get install -y curl
sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
- run:
name: Wait for container to be ready
command: dockerize -wait http://localhost:4000 -timeout 1m
- run:
name: Check container logs for expected message
command: |
echo "=== Printing Full Container Startup Logs ==="
docker logs my-app
echo "=== End of Full Container Startup Logs ==="
if docker logs my-app 2>&1 | grep -q "prisma schema out of sync with db. Consider running these sql_commands to sync the two"; then
echo "Expected message found in logs. Test passed."
else
echo "Expected error not found. Test failed."
cat docker_output.log
echo "Expected message not found in logs. Test failed."
exit 1
fi
- run:
name: Run Basic Proxy Startup Tests (Health Readiness and Chat Completion)
command: |
python -m pytest -vv tests/basic_proxy_startup_tests -x --junitxml=test-results/junit-2.xml --durations=5
no_output_timeout: 120m
build_and_test:
machine:
@ -1460,6 +1599,199 @@ jobs:
# Store test results
- store_test_results:
path: test-results
proxy_multi_instance_tests:
machine:
image: ubuntu-2204:2023.10.1
resource_class: xlarge
working_directory: ~/project
steps:
- checkout
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
- run:
name: Install Python 3.9
command: |
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda init bash
source ~/.bashrc
conda create -n myenv python=3.9 -y
conda activate myenv
python --version
- run:
name: Install Dependencies
command: |
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
pip install "pytest-mock==3.12.0"
pip install "pytest-asyncio==0.21.1"
- run:
name: Build Docker image
command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
- run:
name: Run Docker container 1
# intentionally give bad redis credentials here
# the OTEL test - should get this as a trace
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e REDIS_HOST=$REDIS_HOST \
-e REDIS_PASSWORD=$REDIS_PASSWORD \
-e REDIS_PORT=$REDIS_PORT \
-e LITELLM_MASTER_KEY="sk-1234" \
-e LITELLM_LICENSE=$LITELLM_LICENSE \
-e USE_DDTRACE=True \
-e DD_API_KEY=$DD_API_KEY \
-e DD_SITE=$DD_SITE \
--name my-app \
-v $(pwd)/litellm/proxy/example_config_yaml/multi_instance_simple_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--detailed_debug
- run:
name: Run Docker container 2
command: |
docker run -d \
-p 4001:4001 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e REDIS_HOST=$REDIS_HOST \
-e REDIS_PASSWORD=$REDIS_PASSWORD \
-e REDIS_PORT=$REDIS_PORT \
-e LITELLM_MASTER_KEY="sk-1234" \
-e LITELLM_LICENSE=$LITELLM_LICENSE \
-e USE_DDTRACE=True \
-e DD_API_KEY=$DD_API_KEY \
-e DD_SITE=$DD_SITE \
--name my-app-2 \
-v $(pwd)/litellm/proxy/example_config_yaml/multi_instance_simple_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4001 \
--detailed_debug
- run:
name: Install curl and dockerize
command: |
sudo apt-get update
sudo apt-get install -y curl
sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
- run:
name: Start outputting logs
command: docker logs -f my-app
background: true
- run:
name: Wait for instance 1 to be ready
command: dockerize -wait http://localhost:4000 -timeout 5m
- run:
name: Wait for instance 2 to be ready
command: dockerize -wait http://localhost:4001 -timeout 5m
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/multi_instance_e2e_tests -x --junitxml=test-results/junit.xml --durations=5
no_output_timeout:
120m
# Clean up first container
# Store test results
- store_test_results:
path: test-results
proxy_store_model_in_db_tests:
machine:
image: ubuntu-2204:2023.10.1
resource_class: xlarge
working_directory: ~/project
steps:
- checkout
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
- run:
name: Install Python 3.9
command: |
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda init bash
source ~/.bashrc
conda create -n myenv python=3.9 -y
conda activate myenv
python --version
- run:
name: Install Dependencies
command: |
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
pip install "pytest-mock==3.12.0"
pip install "pytest-asyncio==0.21.1"
pip install "assemblyai==0.37.0"
- run:
name: Build Docker image
command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
- run:
name: Run Docker container
# intentionally give bad redis credentials here
# the OTEL test - should get this as a trace
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e STORE_MODEL_IN_DB="True" \
-e LITELLM_MASTER_KEY="sk-1234" \
-e LITELLM_LICENSE=$LITELLM_LICENSE \
--name my-app \
-v $(pwd)/litellm/proxy/example_config_yaml/store_model_db_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--detailed_debug
- run:
name: Install curl and dockerize
command: |
sudo apt-get update
sudo apt-get install -y curl
sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
- run:
name: Start outputting logs
command: docker logs -f my-app
background: true
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 5m
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/store_model_in_db_tests -x --junitxml=test-results/junit.xml --durations=5
no_output_timeout:
120m
# Clean up first container
proxy_build_from_pip_tests:
# Change from docker to machine executor
machine:
@ -1590,6 +1922,7 @@ jobs:
pip install "google-cloud-aiplatform==1.43.0"
pip install aiohttp
pip install "openai==1.54.0 "
pip install "assemblyai==0.37.0"
python -m pip install --upgrade pip
pip install "pydantic==2.7.1"
pip install "pytest==7.3.1"
@ -1602,12 +1935,12 @@ jobs:
pip install prisma
pip install fastapi
pip install jsonschema
pip install "httpx==0.24.1"
pip install "httpx==0.27.0"
pip install "anyio==3.7.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
pip install "google-cloud-aiplatform==1.59.0"
pip install anthropic
pip install "anthropic==0.49.0"
# Run pytest and generate JUnit XML report
- run:
name: Build Docker image
@ -1622,6 +1955,7 @@ jobs:
-e OPENAI_API_KEY=$OPENAI_API_KEY \
-e GEMINI_API_KEY=$GEMINI_API_KEY \
-e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY \
-e ASSEMBLYAI_API_KEY=$ASSEMBLYAI_API_KEY \
-e USE_DDTRACE=True \
-e DD_API_KEY=$DD_API_KEY \
-e DD_SITE=$DD_SITE \
@ -1648,11 +1982,44 @@ jobs:
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 5m
# Add Ruby installation and testing before the existing Node.js and Python tests
- run:
name: Install Ruby and Bundler
command: |
# Import GPG keys first
gpg --keyserver hkp://keyserver.ubuntu.com --recv-keys 409B6B1796C275462A1703113804BB82D39DC0E3 7D2BAF1CF37B13E2069D6956105BD0E739499BDB || {
curl -sSL https://rvm.io/mpapis.asc | gpg --import -
curl -sSL https://rvm.io/pkuczynski.asc | gpg --import -
}
# Install Ruby version manager (RVM)
curl -sSL https://get.rvm.io | bash -s stable
# Source RVM from the correct location
source $HOME/.rvm/scripts/rvm
# Install Ruby 3.2.2
rvm install 3.2.2
rvm use 3.2.2 --default
# Install latest Bundler
gem install bundler
- run:
name: Run Ruby tests
command: |
source $HOME/.rvm/scripts/rvm
cd tests/pass_through_tests/ruby_passthrough_tests
bundle install
bundle exec rspec
no_output_timeout: 30m
# New steps to run Node.js test
- run:
name: Install Node.js
command: |
export DEBIAN_FRONTEND=noninteractive
curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
sudo apt-get update
sudo apt-get install -y nodejs
node --version
npm --version
@ -1701,7 +2068,7 @@ jobs:
python -m venv venv
. venv/bin/activate
pip install coverage
coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage
coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
coverage xml
- codecov/upload:
file: ./coverage.xml
@ -1765,7 +2132,7 @@ jobs:
circleci step halt
fi
- run:
name: Trigger Github Action for new Docker Container + Trigger Stable Release Testing
name: Trigger Github Action for new Docker Container + Trigger Load Testing
command: |
echo "Install TOML package."
python3 -m pip install toml
@ -1775,9 +2142,9 @@ jobs:
-H "Accept: application/vnd.github.v3+json" \
-H "Authorization: Bearer $GITHUB_TOKEN" \
"https://api.github.com/repos/BerriAI/litellm/actions/workflows/ghcr_deploy.yml/dispatches" \
-d "{\"ref\":\"main\", \"inputs\":{\"tag\":\"v${VERSION}\", \"commit_hash\":\"$CIRCLE_SHA1\"}}"
echo "triggering stable release server for version ${VERSION} and commit ${CIRCLE_SHA1}"
curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}"
-d "{\"ref\":\"main\", \"inputs\":{\"tag\":\"v${VERSION}-nightly\", \"commit_hash\":\"$CIRCLE_SHA1\"}}"
echo "triggering load testing server for version ${VERSION} and commit ${CIRCLE_SHA1}"
curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}&release_type=nightly"
e2e_ui_testing:
machine:
@ -1786,6 +2153,25 @@ jobs:
working_directory: ~/project
steps:
- checkout
- run:
name: Build UI
command: |
# Set up nvm
export NVM_DIR="/opt/circleci/.nvm"
source "$NVM_DIR/nvm.sh"
source "$NVM_DIR/bash_completion"
# Install and use Node version
nvm install v18.17.0
nvm use v18.17.0
cd ui/litellm-dashboard
# Install dependencies first
npm install
# Now source the build script
source ./build_ui.sh
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
@ -1830,6 +2216,7 @@ jobs:
name: Install Playwright Browsers
command: |
npx playwright install
- run:
name: Build Docker image
command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
@ -1958,6 +2345,12 @@ workflows:
only:
- main
- /litellm_.*/
- litellm_proxy_security_tests:
filters:
branches:
only:
- main
- /litellm_.*/
- litellm_assistants_api_testing:
filters:
branches:
@ -2006,6 +2399,18 @@ workflows:
only:
- main
- /litellm_.*/
- proxy_multi_instance_tests:
filters:
branches:
only:
- main
- /litellm_.*/
- proxy_store_model_in_db_tests:
filters:
branches:
only:
- main
- /litellm_.*/
- proxy_build_from_pip_tests:
filters:
branches:
@ -2024,6 +2429,12 @@ workflows:
only:
- main
- /litellm_.*/
- litellm_mapped_tests:
filters:
branches:
only:
- main
- /litellm_.*/
- batches_testing:
filters:
branches:
@ -2057,6 +2468,7 @@ workflows:
- upload-coverage:
requires:
- llm_translation_testing
- litellm_mapped_tests
- batches_testing
- litellm_utils_testing
- pass_through_unit_testing
@ -2065,6 +2477,7 @@ workflows:
- litellm_router_testing
- caching_unit_tests
- litellm_proxy_unit_testing
- litellm_proxy_security_tests
- langfuse_logging_unit_tests
- local_testing
- litellm_assistants_api_testing
@ -2113,6 +2526,7 @@ workflows:
- load_testing
- test_bad_database_url
- llm_translation_testing
- litellm_mapped_tests
- batches_testing
- litellm_utils_testing
- pass_through_unit_testing
@ -2126,9 +2540,12 @@ workflows:
- db_migration_disable_update_check
- e2e_ui_testing
- litellm_proxy_unit_testing
- litellm_proxy_security_tests
- installing_litellm_on_python
- installing_litellm_on_python_3_13
- proxy_logging_guardrails_model_info_tests
- proxy_multi_instance_tests
- proxy_store_model_in_db_tests
- proxy_build_from_pip_tests
- proxy_pass_through_endpoint_tests
- check_code_and_doc_quality

View file

@ -20,3 +20,8 @@ REPLICATE_API_TOKEN = ""
ANTHROPIC_API_KEY = ""
# Infisical
INFISICAL_TOKEN = ""
# Development Configs
LITELLM_MASTER_KEY = "sk-1234"
DATABASE_URL = "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB = "True"

View file

@ -6,6 +6,16 @@
<!-- e.g. "Fixes #000" -->
## Pre-Submission checklist
**Please complete all items before asking a LiteLLM maintainer to review your PR**
- [ ] I have added tests in the `tests/litellm/` directory. **Adding at least 1 test is a hard requirement** ([see details](https://docs.litellm.ai/docs/extras/contributing_code))
- [ ] I have added a screenshot of my new test passing locally
- [ ] My PR passes all unit tests via [`make test-unit`](https://docs.litellm.ai/docs/extras/contributing_code)
- [ ] My PR's scope is as isolated as possible; it only solves one specific problem
## Type
<!-- Select the type of Pull Request -->
@ -20,10 +30,4 @@
## Changes
<!-- List of changes -->
## [REQUIRED] Testing - Attach a screenshot of any new tests passing locally
If UI changes, send a screenshot/GIF of working UI fixes
<!-- Test procedure -->

View file

@ -52,6 +52,41 @@ def interpret_results(csv_file):
return markdown_table
def _get_docker_run_command_stable_release(release_version):
return f"""
\n\n
## Docker Run LiteLLM Proxy
```
docker run \\
-e STORE_MODEL_IN_DB=True \\
-p 4000:4000 \\
ghcr.io/berriai/litellm:litellm_stable_release_branch-{release_version}
```
"""
def _get_docker_run_command(release_version):
return f"""
\n\n
## Docker Run LiteLLM Proxy
```
docker run \\
-e STORE_MODEL_IN_DB=True \\
-p 4000:4000 \\
ghcr.io/berriai/litellm:main-{release_version}
```
"""
def get_docker_run_command(release_version):
if "stable" in release_version:
return _get_docker_run_command_stable_release(release_version)
else:
return _get_docker_run_command(release_version)
if __name__ == "__main__":
csv_file = "load_test_stats.csv" # Change this to the path of your CSV file
markdown_table = interpret_results(csv_file)
@ -79,17 +114,7 @@ if __name__ == "__main__":
start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
existing_release_body = latest_release.body[:start_index]
docker_run_command = f"""
\n\n
## Docker Run LiteLLM Proxy
```
docker run \\
-e STORE_MODEL_IN_DB=True \\
-p 4000:4000 \\
ghcr.io/berriai/litellm:main-{release_version}
```
"""
docker_run_command = get_docker_run_command(release_version)
print("docker run command: ", docker_run_command)
new_release_body = (

View file

@ -8,7 +8,7 @@ class MyUser(HttpUser):
def chat_completion(self):
headers = {
"Content-Type": "application/json",
"Authorization": "Bearer sk-ZoHqrLIs2-5PzJrqBaviAA",
"Authorization": "Bearer sk-8N1tLOOyH8TIxwOLahhIVg",
# Include any additional headers you may need for authentication, etc.
}

.github/workflows/stale.yml (vendored, new file, 20 lines added)
View file

@ -0,0 +1,20 @@
name: "Stale Issue Management"
on:
schedule:
- cron: '0 0 * * *' # Runs daily at midnight UTC
workflow_dispatch:
jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v8
with:
repo-token: "${{ secrets.GITHUB_TOKEN }}"
stale-issue-message: "This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs."
stale-pr-message: "This pull request has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs."
days-before-stale: 90 # Revert to 60 days
days-before-close: 7 # Revert to 7 days
stale-issue-label: "stale"
operations-per-run: 1000

.gitignore (vendored, 10 lines changed)
View file

@ -48,7 +48,7 @@ deploy/charts/litellm/charts/*
deploy/charts/*.tgz
litellm/proxy/vertex_key.json
**/.vim/
/node_modules
**/node_modules
kub.yaml
loadtest_kub.yaml
litellm/proxy/_new_secret_config.yaml
@ -71,3 +71,11 @@ tests/local_testing/log.txt
.codegpt
litellm/proxy/_new_new_secret_config.yaml
litellm/proxy/custom_guardrail.py
litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/model_hub.html
.mypy_cache/*
litellm/proxy/application.log
tests/llm_translation/vertex_test_account.json
tests/llm_translation/test_vertex_key.json

View file

@ -22,7 +22,7 @@ repos:
rev: 7.0.0 # The version of flake8 to use
hooks:
- id: flake8
exclude: ^litellm/tests/|^litellm/proxy/tests/
exclude: ^litellm/tests/|^litellm/proxy/tests/|^litellm/tests/litellm/|^tests/litellm/
additional_dependencies: [flake8-print]
files: litellm/.*\.py
# - id: flake8

Makefile (new file, 21 lines added)
View file

@ -0,0 +1,21 @@
# LiteLLM Makefile
# Simple Makefile for running tests and basic development tasks
.PHONY: help test test-unit test-integration
# Default target
help:
@echo "Available commands:"
@echo " make test - Run all tests"
@echo " make test-unit - Run unit tests"
@echo " make test-integration - Run integration tests"
# Testing
test:
poetry run pytest tests/
test-unit:
poetry run pytest tests/litellm/
test-integration:
poetry run pytest tests/ -k "not litellm"

View file

@ -40,7 +40,7 @@ LiteLLM manages:
[**Jump to LiteLLM Proxy (LLM Gateway) Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs)
🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12-hour load tests before being published.
🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12-hour load tests before being published. [More information about the release cycle here](https://docs.litellm.ai/docs/proxy/release_cycle)
Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).
@ -64,7 +64,7 @@ import os
## set ENV variables
os.environ["OPENAI_API_KEY"] = "your-openai-key"
os.environ["ANTHROPIC_API_KEY"] = "your-cohere-key"
os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-key"
messages = [{ "content": "Hello, how are you?","role": "user"}]
@ -187,13 +187,13 @@ os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["ATHINA_API_KEY"] = "your-athina-api-key"
os.environ["OPENAI_API_KEY"]
os.environ["OPENAI_API_KEY"] = "your-openai-key"
# set callbacks
litellm.success_callback = ["lunary", "mlflow", "langfuse", "athina", "helicone"] # log input/output to lunary, langfuse, supabase, athina, helicone etc
#openai call
response = completion(model="anthropic/claude-3-sonnet-20240229", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
response = completion(model="openai/gpt-4o", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
```
# LiteLLM Proxy Server (LLM Gateway) - ([Docs](https://docs.litellm.ai/docs/simple_proxy))
@ -303,6 +303,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
|-------------------------------------------------------------------------------------|---------------------------------------------------------|---------------------------------------------------------------------------------|-------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------|-------------------------------------------------------------------------------|-------------------------------------------------------------------------|
| [openai](https://docs.litellm.ai/docs/providers/openai) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [azure](https://docs.litellm.ai/docs/providers/azure) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [AI/ML API](https://docs.litellm.ai/docs/providers/aiml) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [aws - sagemaker](https://docs.litellm.ai/docs/providers/aws_sagemaker) | ✅ | ✅ | ✅ | ✅ | ✅ | |
| [aws - bedrock](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ | ✅ | |
| [google - vertex_ai](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
@ -339,64 +340,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
## Contributing
To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change.
Here's how to modify the repo locally:
Step 1: Clone the repo
```
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Navigate into the project, and install dependencies:
```
cd litellm
poetry install -E extra_proxy -E proxy
```
Step 3: Test your change:
```
cd tests # pwd: Documents/litellm/litellm/tests
poetry run flake8
poetry run pytest .
```
Step 4: Submit a PR with your changes! 🚀
- push your fork to your GitHub repo
- submit a PR from there
### Building LiteLLM Docker Image
Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
Step 1: Clone the repo
```
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Build the Docker Image
Build using Dockerfile.non_root
```
docker build -f docker/Dockerfile.non_root -t litellm_test_image .
```
Step 3: Run the Docker Image
Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
```
docker run \
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
-e DATABASE_URL="postgresql://xxxxxxxx" \
-e LITELLM_MASTER_KEY="sk-1234" \
-p 4000:4000 \
litellm_test_image \
--config /app/config.yaml --detailed_debug
```
Interested in contributing? Contributions to the LiteLLM Python SDK, Proxy Server, and LLM integrations are all accepted and highly encouraged! [See our Contribution Guide for more details](https://docs.litellm.ai/docs/extras/contributing_code)
# Enterprise
For companies that need better security, user management and professional support
@ -450,3 +394,20 @@ If you have suggestions on how to improve the code quality feel free to open an
<a href="https://github.com/BerriAI/litellm/graphs/contributors">
<img src="https://contrib.rocks/image?repo=BerriAI/litellm" />
</a>
## Run in Developer mode
### Services
1. Set up a `.env` file in the repo root
2. Run dependent services `docker-compose up db prometheus`
### Backend
1. (In root) create virtual environment `python -m venv .venv`
2. Activate virtual environment `source .venv/bin/activate`
3. Install dependencies `pip install -e ".[all]"`
4. Start proxy backend `uvicorn litellm.proxy.proxy_server:app --host localhost --port 4000 --reload`
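Once the backend is running, a quick smoke test from another shell confirms the proxy is reachable. This is a minimal sketch, assuming the development defaults from the `.env` example in this diff (proxy on port 4000, `LITELLM_MASTER_KEY="sk-1234"`):

```python
# Smoke-test the locally running proxy; assumes port 4000 and the dev master key.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://localhost:4000")

# Listing models exercises auth and routing without needing any provider keys.
print(client.models.list())
```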
### Frontend
1. Navigate to `ui/litellm-dashboard`
2. Install dependencies `npm install`
3. Run `npm run dev` to start the dashboard

View file

@ -0,0 +1,172 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "4FbDOmcj2VkM"
},
"source": [
"## Use LiteLLM with Arize\n",
"https://docs.litellm.ai/docs/observability/arize_integration\n",
"\n",
"This method uses the litellm proxy to send the data to Arize. The callback is set in the litellm config below, instead of using OpenInference tracing."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "21W8Woog26Ns"
},
"source": [
"## Install Dependencies"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "xrjKLBxhxu2L"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: litellm in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (1.54.1)\n",
"Requirement already satisfied: aiohttp in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (3.11.10)\n",
"Requirement already satisfied: click in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (8.1.7)\n",
"Requirement already satisfied: httpx<0.28.0,>=0.23.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.27.2)\n",
"Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (8.5.0)\n",
"Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (3.1.4)\n",
"Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (4.23.0)\n",
"Requirement already satisfied: openai>=1.55.3 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (1.57.1)\n",
"Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (2.10.3)\n",
"Requirement already satisfied: python-dotenv>=0.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (1.0.1)\n",
"Requirement already satisfied: requests<3.0.0,>=2.31.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (2.32.3)\n",
"Requirement already satisfied: tiktoken>=0.7.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.7.0)\n",
"Requirement already satisfied: tokenizers in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.21.0)\n",
"Requirement already satisfied: anyio in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (4.7.0)\n",
"Requirement already satisfied: certifi in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (2024.8.30)\n",
"Requirement already satisfied: httpcore==1.* in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (1.0.7)\n",
"Requirement already satisfied: idna in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (3.10)\n",
"Requirement already satisfied: sniffio in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (1.3.1)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpcore==1.*->httpx<0.28.0,>=0.23.0->litellm) (0.14.0)\n",
"Requirement already satisfied: zipp>=3.20 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from importlib-metadata>=6.8.0->litellm) (3.21.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jinja2<4.0.0,>=3.1.2->litellm) (3.0.2)\n",
"Requirement already satisfied: attrs>=22.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (24.2.0)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (2024.10.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (0.35.1)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (0.22.3)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (1.9.0)\n",
"Requirement already satisfied: jiter<1,>=0.4.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (0.6.1)\n",
"Requirement already satisfied: tqdm>4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (4.67.1)\n",
"Requirement already satisfied: typing-extensions<5,>=4.11 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (4.12.2)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.27.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm) (2.27.1)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from requests<3.0.0,>=2.31.0->litellm) (3.4.0)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from requests<3.0.0,>=2.31.0->litellm) (2.0.7)\n",
"Requirement already satisfied: regex>=2022.1.18 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm) (2024.11.6)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (2.4.4)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.3.1)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.5.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (6.1.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (0.2.1)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.18.3)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from tokenizers->litellm) (0.26.5)\n",
"Requirement already satisfied: filelock in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (3.16.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (2024.10.0)\n",
"Requirement already satisfied: packaging>=20.9 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (24.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (6.0.2)\n"
]
}
],
"source": [
"!pip install litellm"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jHEu-TjZ29PJ"
},
"source": [
"## Set Env Variables"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"id": "QWd9rTysxsWO"
},
"outputs": [],
"source": [
"import litellm\n",
"import os\n",
"from getpass import getpass\n",
"\n",
"os.environ[\"ARIZE_SPACE_KEY\"] = getpass(\"Enter your Arize space key: \")\n",
"os.environ[\"ARIZE_API_KEY\"] = getpass(\"Enter your Arize API key: \")\n",
"os.environ['OPENAI_API_KEY']= getpass(\"Enter your OpenAI API key: \")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's run a completion call and see the traces in Arize"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hello! Nice to meet you, OpenAI. How can I assist you today?\n"
]
}
],
"source": [
"# set arize as a callback, litellm will send the data to arize\n",
"litellm.callbacks = [\"arize\"]\n",
" \n",
"# openai call\n",
"response = litellm.completion(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"Hi 👋 - i'm openai\"}\n",
" ]\n",
")\n",
"print(response.choices[0].message.content)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View file

@ -0,0 +1,252 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LLM Ops Stack - LiteLLM Proxy + Langfuse \n",
"\n",
"This notebook demonstrates how to use LiteLLM Proxy with Langfuse \n",
"- Use LiteLLM Proxy for calling 100+ LLMs in OpenAI format\n",
"- Use Langfuse for viewing request / response traces \n",
"\n",
"\n",
"In this notebook we will setup LiteLLM Proxy to make requests to OpenAI, Anthropic, Bedrock and automatically log traces to Langfuse."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Setup LiteLLM Proxy\n",
"\n",
"### 1.1 Define .env variables \n",
"Define .env variables on the container that litellm proxy is running on.\n",
"```bash\n",
"## LLM API Keys\n",
"OPENAI_API_KEY=sk-proj-1234567890\n",
"ANTHROPIC_API_KEY=sk-ant-api03-1234567890\n",
"AWS_ACCESS_KEY_ID=1234567890\n",
"AWS_SECRET_ACCESS_KEY=1234567890\n",
"\n",
"## Langfuse Logging \n",
"LANGFUSE_PUBLIC_KEY=\"pk-lf-xxxx9\"\n",
"LANGFUSE_SECRET_KEY=\"sk-lf-xxxx9\"\n",
"LANGFUSE_HOST=\"https://us.cloud.langfuse.com\"\n",
"```\n",
"\n",
"\n",
"### 1.1 Setup LiteLLM Proxy Config yaml \n",
"```yaml\n",
"model_list:\n",
" - model_name: gpt-4o\n",
" litellm_params:\n",
" model: openai/gpt-4o\n",
" api_key: os.environ/OPENAI_API_KEY\n",
" - model_name: claude-3-5-sonnet-20241022\n",
" litellm_params:\n",
" model: anthropic/claude-3-5-sonnet-20241022\n",
" api_key: os.environ/ANTHROPIC_API_KEY\n",
" - model_name: us.amazon.nova-micro-v1:0\n",
" litellm_params:\n",
" model: bedrock/us.amazon.nova-micro-v1:0\n",
" aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID\n",
" aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY\n",
"\n",
"litellm_settings:\n",
" callbacks: [\"langfuse\"]\n",
"\n",
"\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Make LLM Requests to LiteLLM Proxy\n",
"\n",
"Now we will make our first LLM request to LiteLLM Proxy"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1 Setup Client Side Variables to point to LiteLLM Proxy\n",
"Set `LITELLM_PROXY_BASE_URL` to the base url of the LiteLLM Proxy and `LITELLM_VIRTUAL_KEY` to the virtual key you want to use for Authentication to LiteLLM Proxy. (Note: In this initial setup you can)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"\n",
"LITELLM_PROXY_BASE_URL=\"http://0.0.0.0:4000\"\n",
"LITELLM_VIRTUAL_KEY=\"sk-oXXRa1xxxxxxxxxxx\""
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ChatCompletion(id='chatcmpl-B0sq6QkOKNMJ0dwP3x7OoMqk1jZcI', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Langfuse is a platform designed to monitor, observe, and troubleshoot AI and large language model (LLM) applications. It provides features that help developers gain insights into how their AI systems are performing, make debugging easier, and optimize the deployment of models. Langfuse allows for tracking of model interactions, collecting telemetry, and visualizing data, which is crucial for understanding the behavior of AI models in production environments. This kind of tool is particularly useful for developers working with language models who need to ensure reliability and efficiency in their applications.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739550502, model='gpt-4o-2024-08-06', object='chat.completion', service_tier='default', system_fingerprint='fp_523b9b6e5f', usage=CompletionUsage(completion_tokens=109, prompt_tokens=13, total_tokens=122, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import openai\n",
"client = openai.OpenAI(\n",
" api_key=LITELLM_VIRTUAL_KEY,\n",
" base_url=LITELLM_PROXY_BASE_URL\n",
")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-4o\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what is Langfuse?\"\n",
" }\n",
" ],\n",
")\n",
"\n",
"response"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.3 View Traces on Langfuse\n",
"LiteLLM will send the request / response, model, tokens (input + output), cost to Langfuse.\n",
"\n",
"![image_description](litellm_proxy_langfuse.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.4 Call Anthropic, Bedrock models \n",
"\n",
"Now we can call `us.amazon.nova-micro-v1:0` and `claude-3-5-sonnet-20241022` models defined on your config.yaml both in the OpenAI request / response format."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ChatCompletion(id='chatcmpl-7756e509-e61f-4f5e-b5ae-b7a41013522a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Langfuse is an observability tool designed specifically for machine learning models and applications built with natural language processing (NLP) and large language models (LLMs). It focuses on providing detailed insights into how these models perform in real-world scenarios. Here are some key features and purposes of Langfuse:\\n\\n1. **Real-time Monitoring**: Langfuse allows developers to monitor the performance of their NLP and LLM applications in real time. This includes tracking the inputs and outputs of the models, as well as any errors or issues that arise during operation.\\n\\n2. **Error Tracking**: It helps in identifying and tracking errors in the models' outputs. By analyzing incorrect or unexpected responses, developers can pinpoint where and why errors occur, facilitating more effective debugging and improvement.\\n\\n3. **Performance Metrics**: Langfuse provides various performance metrics, such as latency, throughput, and error rates. These metrics help developers understand how well their models are performing under different conditions and workloads.\\n\\n4. **Traceability**: It offers detailed traceability of requests and responses, allowing developers to follow the path of a request through the system and see how it is processed by the model at each step.\\n\\n5. **User Feedback Integration**: Langfuse can integrate user feedback to provide context for model outputs. This helps in understanding how real users are interacting with the model and how its outputs align with user expectations.\\n\\n6. **Customizable Dashboards**: Users can create custom dashboards to visualize the data collected by Langfuse. These dashboards can be tailored to highlight the most important metrics and insights for a specific application or team.\\n\\n7. **Alerting and Notifications**: It can set up alerts for specific conditions or errors, notifying developers when something goes wrong or when performance metrics fall outside of acceptable ranges.\\n\\nBy providing comprehensive observability for NLP and LLM applications, Langfuse helps developers to build more reliable, accurate, and user-friendly models and services.\", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739554005, model='us.amazon.nova-micro-v1:0', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=380, prompt_tokens=5, total_tokens=385, completion_tokens_details=None, prompt_tokens_details=None))"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import openai\n",
"client = openai.OpenAI(\n",
" api_key=LITELLM_VIRTUAL_KEY,\n",
" base_url=LITELLM_PROXY_BASE_URL\n",
")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"us.amazon.nova-micro-v1:0\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what is Langfuse?\"\n",
" }\n",
" ],\n",
")\n",
"\n",
"response"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Advanced - Set Langfuse Trace ID, Tags, Metadata \n",
"\n",
"Here is an example of how you can set Langfuse specific params on your client side request. See full list of supported langfuse params [here](https://docs.litellm.ai/docs/observability/langfuse_integration)\n",
"\n",
"You can view the logged trace of this request [here](https://us.cloud.langfuse.com/project/clvlhdfat0007vwb74m9lvfvi/traces/567890?timestamp=2025-02-14T17%3A30%3A26.709Z)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ChatCompletion(id='chatcmpl-789babd5-c064-4939-9093-46e4cd2e208a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Langfuse is an observability platform designed specifically for monitoring and improving the performance of natural language processing (NLP) models and applications. It provides developers with tools to track, analyze, and optimize how their language models interact with users and handle natural language inputs.\\n\\nHere are some key features and benefits of Langfuse:\\n\\n1. **Real-Time Monitoring**: Langfuse allows developers to monitor their NLP applications in real time. This includes tracking user interactions, model responses, and overall performance metrics.\\n\\n2. **Error Tracking**: It helps in identifying and tracking errors in the model's responses. This can include incorrect, irrelevant, or unsafe outputs.\\n\\n3. **User Feedback Integration**: Langfuse enables the collection of user feedback directly within the platform. This feedback can be used to identify areas for improvement in the model's performance.\\n\\n4. **Performance Metrics**: The platform provides detailed metrics and analytics on model performance, including latency, throughput, and accuracy.\\n\\n5. **Alerts and Notifications**: Developers can set up alerts to notify them of any significant issues or anomalies in model performance.\\n\\n6. **Debugging Tools**: Langfuse offers tools to help developers debug and refine their models by providing insights into how the model processes different types of inputs.\\n\\n7. **Integration with Development Workflows**: It integrates seamlessly with various development environments and CI/CD pipelines, making it easier to incorporate observability into the development process.\\n\\n8. **Customizable Dashboards**: Users can create custom dashboards to visualize the data in a way that best suits their needs.\\n\\nLangfuse aims to help developers build more reliable, accurate, and user-friendly NLP applications by providing them with the tools to observe and improve how their models perform in real-world scenarios.\", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739554281, model='us.amazon.nova-micro-v1:0', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=346, prompt_tokens=5, total_tokens=351, completion_tokens_details=None, prompt_tokens_details=None))"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import openai\n",
"client = openai.OpenAI(\n",
" api_key=LITELLM_VIRTUAL_KEY,\n",
" base_url=LITELLM_PROXY_BASE_URL\n",
")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"us.amazon.nova-micro-v1:0\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what is Langfuse?\"\n",
" }\n",
" ],\n",
" extra_body={\n",
" \"metadata\": {\n",
" \"generation_id\": \"1234567890\",\n",
" \"trace_id\": \"567890\",\n",
" \"trace_user_id\": \"user_1234567890\",\n",
" \"tags\": [\"tag1\", \"tag2\"]\n",
" }\n",
" }\n",
")\n",
"\n",
"response"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## "
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Binary file not shown (image added, 308 KiB).

View file

@ -168,11 +168,11 @@ async def check_view_exists(): # noqa: PLR0915
print("MonthlyGlobalSpendPerUserPerKey Created!") # noqa
try:
await db.query_raw("""SELECT 1 FROM DailyTagSpend LIMIT 1""")
await db.query_raw("""SELECT 1 FROM "DailyTagSpend" LIMIT 1""")
print("DailyTagSpend Exists!") # noqa
except Exception:
sql_query = """
CREATE OR REPLACE VIEW DailyTagSpend AS
CREATE OR REPLACE VIEW "DailyTagSpend" AS
SELECT
jsonb_array_elements_text(request_tags) AS individual_request_tag,
DATE(s."startTime") AS spend_date,

View file

@ -18,7 +18,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.3.0
version: 0.4.1
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to

View file

@ -48,6 +48,23 @@ spec:
{{- end }}
- name: DISABLE_SCHEMA_UPDATE
value: "false" # always run the migration from the Helm PreSync hook, override the value set
{{- with .Values.volumeMounts }}
volumeMounts:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.volumes }}
volumes:
{{- toYaml . | nindent 8 }}
{{- end }}
restartPolicy: OnFailure
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
ttlSecondsAfterFinished: {{ .Values.migrationJob.ttlSecondsAfterFinished }}
backoffLimit: {{ .Values.migrationJob.backoffLimit }}
{{- end }}

View file

@ -187,6 +187,7 @@ migrationJob:
backoffLimit: 4 # Backoff limit for Job restarts
disableSchemaUpdate: false # Skip schema migrations for specific environments. When True, the job will exit with code 0.
annotations: {}
ttlSecondsAfterFinished: 120
# Additional environment variables to be added to the deployment
envVars: {

View file

@ -29,6 +29,8 @@ services:
POSTGRES_DB: litellm
POSTGRES_USER: llmproxy
POSTGRES_PASSWORD: dbpassword9090
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
interval: 1s

View file

@ -11,9 +11,7 @@ FROM $LITELLM_BUILD_IMAGE AS builder
WORKDIR /app
# Install build dependencies
RUN apk update && \
apk add --no-cache gcc python3-dev musl-dev && \
rm -rf /var/cache/apk/*
RUN apk add --no-cache gcc python3-dev musl-dev
RUN pip install --upgrade pip && \
pip install build

View file

@ -0,0 +1,92 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [BETA] `/v1/messages`
LiteLLM provides a BETA endpoint in the spec of Anthropic's `/v1/messages` endpoint.
This currently just supports the Anthropic API.
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ✅ | |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ✅ | |
| Streaming | ✅ | |
| Fallbacks | ✅ | between anthropic models |
| Loadbalancing | ✅ | between anthropic models |
Planned improvements:
- Vertex AI Anthropic support
- Bedrock Anthropic support
## Usage
<Tabs>
<TabItem label="PROXY" value="proxy">
1. Setup config.yaml
```yaml
model_list:
- model_name: anthropic-claude
litellm_params:
model: claude-3-7-sonnet-latest
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
-H 'content-type: application/json' \
-H "x-api-key: $LITELLM_API_KEY" \
-H 'anthropic-version: 2023-06-01' \
-d '{
"model": "anthropic-claude",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "List 5 important events in the XIX century"
}
]
}
],
"max_tokens": 4096
}'
```
</TabItem>
<TabItem value="sdk" label="SDK">
```python
from litellm.llms.anthropic.experimental_pass_through.messages.handler import anthropic_messages
import asyncio
import os
# set env
os.environ["ANTHROPIC_API_KEY"] = "my-api-key"
messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
# Call the handler
async def call():
    response = await anthropic_messages(
        messages=messages,
        api_key=os.environ["ANTHROPIC_API_KEY"],  # use the key set above
        model="claude-3-haiku-20240307",
        max_tokens=100,
    )
    print(response)

asyncio.run(call())
```
</TabItem>
</Tabs>

View file

@ -8,6 +8,7 @@ Use `litellm.supports_function_calling(model="")` -> returns `True` if model sup
assert litellm.supports_function_calling(model="gpt-3.5-turbo") == True
assert litellm.supports_function_calling(model="azure/gpt-4-1106-preview") == True
assert litellm.supports_function_calling(model="palm/chat-bison") == False
assert litellm.supports_function_calling(model="xai/grok-2-latest") == True
assert litellm.supports_function_calling(model="ollama/llama2") == False
```

View file

@ -44,6 +44,7 @@ Use `litellm.get_supported_openai_params()` for an updated list of params for ea
|Anthropic| ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | | | | | | |✅ | ✅ | | ✅ | ✅ | | | ✅ |
|OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | ✅ |
|Azure OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | | | ✅ |
|xAI| ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
|Replicate | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | |
|Anyscale | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|Cohere| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |

View file

@ -89,6 +89,7 @@ response_format: { "type": "json_schema", "json_schema": … , "strict": true }
Works for:
- OpenAI models
- Azure OpenAI models
- xAI models (Grok-2 or later)
- Google AI Studio - Gemini models
- Vertex AI models (Gemini + Anthropic)
- Bedrock Models

View file

@ -46,7 +46,7 @@ from litellm import completion
fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]
completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict)
completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=fallback_dict)
```
### Fallbacks - Switch Models/API Keys/API Bases (SDK)

View file

@ -118,9 +118,11 @@ response = client.chat.completions.create(
Use `litellm.supports_vision(model="")` -> returns `True` if model supports `vision` and `False` if not
```python
assert litellm.supports_vision(model="gpt-4-vision-preview") == True
assert litellm.supports_vision(model="gemini-1.0-pro-vision") == True
assert litellm.supports_vision(model="gpt-3.5-turbo") == False
assert litellm.supports_vision(model="openai/gpt-4-vision-preview") == True
assert litellm.supports_vision(model="vertex_ai/gemini-1.0-pro-vision") == True
assert litellm.supports_vision(model="openai/gpt-3.5-turbo") == False
assert litellm.supports_vision(model="xai/grok-2-vision-latest") == True
assert litellm.supports_vision(model="xai/grok-2-latest") == False
```
</TabItem>
@ -188,3 +190,137 @@ Expected Response
</TabItem>
</Tabs>
## Explicitly specify image type
If you have images without a mime-type, or if litellm is incorrectly inferring the mime-type of your image (e.g. calling `gs://` URLs with Vertex AI), you can set this explicitly via the `format` param.
```python
"image_url": {
"url": "gs://my-gs-image",
"format": "image/jpeg"
}
```
LiteLLM will use this for any API endpoint that supports specifying a mime-type (e.g. Anthropic/Bedrock/Vertex AI).
For others (e.g. OpenAI), it will be ignored.
<Tabs>
<TabItem label="SDK" value="sdk">
```python
import os
from litellm import completion
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
# anthropic call
response = completion(
model = "claude-3-7-sonnet-latest",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"format": "image/jpeg"
}
}
]
}
],
)
```
</TabItem>
<TabItem label="PROXY" value="proxy">
1. Define vision models on config.yaml
```yaml
model_list:
- model_name: gpt-4-vision-preview # OpenAI gpt-4-vision-preview
litellm_params:
model: openai/gpt-4-vision-preview
api_key: os.environ/OPENAI_API_KEY
- model_name: llava-hf # Custom OpenAI compatible model
litellm_params:
model: openai/llava-hf/llava-v1.6-vicuna-7b-hf
api_base: http://localhost:8000
api_key: fake-key
model_info:
supports_vision: True # set supports_vision to True so /model/info returns this attribute as True
```
2. Run proxy server
```bash
litellm --config config.yaml
```
3. Test it using the OpenAI Python SDK
```python
import os
from openai import OpenAI
client = OpenAI(
api_key="sk-1234", # your litellm proxy api key
)
response = client.chat.completions.create(
model = "gpt-4-vision-preview", # use model="llava-hf" to test your custom OpenAI endpoint
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"format": "image/jpeg"
}
}
]
}
],
)
```
</TabItem>
</Tabs>
## Spec
```
"image_url": str
OR
"image_url": {
"url": "url OR base64 encoded str",
"detail": "openai-only param",
"format": "specify mime-type of image"
}
```

View file

@ -46,7 +46,7 @@ For security inquiries, please contact us at support@berri.ai
|-------------------|-------------------------------------------------------------------------------------------------|
| SOC 2 Type I | Certified. Report available upon request on Enterprise plan. |
| SOC 2 Type II | In progress. Certificate available by April 15th, 2025 |
| ISO27001 | In progress. Certificate available by February 7th, 2025 |
| ISO 27001 | Certified. Report available upon request on Enterprise plan. |
## Supported Data Regions for LiteLLM Cloud
@ -137,7 +137,7 @@ Point of contact email address for general security-related questions: krrish@be
Has the Vendor been audited / certified?
- SOC 2 Type I. Certified. Report available upon request on Enterprise plan.
- SOC 2 Type II. In progress. Certificate available by April 15th, 2025.
- ISO27001. In progress. Certificate available by February 7th, 2025.
- ISO 27001. Certified. Report available upon request on Enterprise plan.
Has an information security management system been implemented?
- Yes - [CodeQL](https://codeql.github.com/) and a comprehensive ISMS covering multiple security domains.

View file

@ -1,5 +1,5 @@
# Local Debugging
There's 2 ways to do local debugging - `litellm.set_verbose=True` and by passing in a custom function `completion(...logger_fn=<your_local_function>)`. Warning: Make sure to not use `set_verbose` in production. It logs API keys, which might end up in log files.
There's 2 ways to do local debugging - `litellm._turn_on_debug()` and by passing in a custom function `completion(...logger_fn=<your_local_function>)`. Warning: Make sure to not use `_turn_on_debug()` in production. It logs API keys, which might end up in log files.
## Set Verbose
@ -8,7 +8,7 @@ This is good for getting print statements for everything litellm is doing.
import litellm
from litellm import completion
litellm.set_verbose=True # 👈 this is the 1-line change you need to make
litellm._turn_on_debug() # 👈 this is the 1-line change you need to make
## set ENV variables
os.environ["OPENAI_API_KEY"] = "openai key"

View file

@ -323,6 +323,40 @@ response = embedding(
| embed-english-light-v2.0 | `embedding(model="embed-english-light-v2.0", input=["good morning from litellm", "this is another item"])` |
| embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` |
## NVIDIA NIM Embedding Models
### API keys
This can be set as env variables or passed as **params to litellm.embedding()**
```python
import os
os.environ["NVIDIA_NIM_API_KEY"] = "" # api key
os.environ["NVIDIA_NIM_API_BASE"] = "" # nim endpoint url
```
### Usage
```python
from litellm import embedding
import os
os.environ['NVIDIA_NIM_API_KEY'] = ""
response = embedding(
model='nvidia_nim/<model_name>',
input=["good morning from litellm"]
)
```
All models listed [here](https://build.nvidia.com/explore/retrieval) are supported:
| Model Name | Function Call |
| :--- | :--- |
| NV-Embed-QA | `embedding(model="nvidia_nim/NV-Embed-QA", input)` |
| nvidia/nv-embed-v1 | `embedding(model="nvidia_nim/nvidia/nv-embed-v1", input)` |
| nvidia/nv-embedqa-mistral-7b-v2 | `embedding(model="nvidia_nim/nvidia/nv-embedqa-mistral-7b-v2", input)` |
| nvidia/nv-embedqa-e5-v5 | `embedding(model="nvidia_nim/nvidia/nv-embedqa-e5-v5", input)` |
| nvidia/embed-qa-4 | `embedding(model="nvidia_nim/nvidia/embed-qa-4", input)` |
| nvidia/llama-3.2-nv-embedqa-1b-v1 | `embedding(model="nvidia_nim/nvidia/llama-3.2-nv-embedqa-1b-v1", input)` |
| nvidia/llama-3.2-nv-embedqa-1b-v2 | `embedding(model="nvidia_nim/nvidia/llama-3.2-nv-embedqa-1b-v2", input)` |
| snowflake/arctic-embed-l | `embedding(model="nvidia_nim/snowflake/arctic-embed-l", input)` |
| baai/bge-m3 | `embedding(model="nvidia_nim/baai/bge-m3", input)` |
## HuggingFace Embedding Models
LiteLLM supports all Feature-Extraction + Sentence Similarity Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction

View file

@ -0,0 +1,96 @@
# Contributing Code
## **Checklist before submitting a PR**
Here are the core requirements for any PR submitted to LiteLLM
- [ ] Add tests: **adding at least 1 test is a hard requirement** - [see details](#2-adding-testing-to-your-pr)
- [ ] Ensure your PR passes the following tests:
- [ ] [Unit Tests](#3-running-unit-tests)
- [ ] Formatting / Linting Tests
- [ ] Keep scope as isolated as possible. As a general rule, your changes should address 1 specific problem at a time
## Quick start
## 1. Setup your local dev environment
Here's how to modify the repo locally:
Step 1: Clone the repo
```shell
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Install dev dependencies:
```shell
poetry install --with dev --extras proxy
```
That's it, your local dev environment is ready!
## 2. Adding Testing to your PR
- Add your test to the [`tests/litellm/` directory](https://github.com/BerriAI/litellm/tree/main/tests/litellm)
- This directory maps 1:1 to the `litellm/` directory and can only contain mocked tests.
- Do not add real LLM API calls to this directory.
### 2.1 File Naming Convention for `tests/litellm/`
The `tests/litellm/` directory follows the same directory structure as `litellm/`.
- `tests/litellm/proxy/test_caching_routes.py` maps to `litellm/proxy/caching_routes.py`
- `test_{filename}.py` maps to `litellm/{filename}.py`
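A minimal sketch of what a mocked test in this directory might look like (the file and function names below are hypothetical - the key point is that the LLM call is mocked, not made):
```python
# tests/litellm/test_utils.py -> maps to litellm/utils.py (hypothetical example)
from unittest.mock import patch

import litellm


def test_completion_returns_mocked_content():
    # Patch litellm.completion so no real LLM API request is made
    with patch.object(litellm, "completion") as mock_completion:
        mock_completion.return_value = {"choices": [{"message": {"content": "hi"}}]}

        response = litellm.completion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "hi"}],
        )

        assert response["choices"][0]["message"]["content"] == "hi"
        mock_completion.assert_called_once()
```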
## 3. Running Unit Tests
Run the following command from the root of the litellm directory:
```shell
make test-unit
```
## 4. Submit a PR with your changes!
- Push your branch to your fork on GitHub (see the sketch below)
- Submit a PR from there
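For example (assuming `origin` points at your fork - adjust the branch and remote names to your setup):
```shell
git checkout -b my-fix
git add .
git commit -m "fix: describe your change"
git push origin my-fix
# then open a PR from your fork against BerriAI/litellm:main on GitHub
```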
## Advanced
### Building LiteLLM Docker Image
Some people might want to build the LiteLLM docker image themselves. Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
Step 1: Clone the repo
```shell
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Build the Docker Image
Build using Dockerfile.non_root
```shell
docker build -f docker/Dockerfile.non_root -t litellm_test_image .
```
Step 3: Run the Docker Image
Make sure your litellm proxy config file is present in the root directory (the command below mounts `proxy_config.yaml` as `/app/config.yaml`).
```shell
docker run \
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
-e DATABASE_URL="postgresql://xxxxxxxx" \
-e LITELLM_MASTER_KEY="sk-1234" \
-p 4000:4000 \
litellm_test_image \
--config /app/config.yaml --detailed_debug
```

View file

@ -0,0 +1,31 @@
# [BETA] Image Variations
OpenAI's `/image/variations` endpoint is now supported.
## Quick Start
```python
from litellm import image_variation
import os
# set env vars
os.environ["OPENAI_API_KEY"] = ""
os.environ["TOPAZ_API_KEY"] = ""
# image to generate variations of
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

# openai call
response = image_variation(
model="dall-e-2", image=image_url
)
# topaz call
response = image_variation(
model="topaz/Standard V2", image=image_url
)
print(response)
```
## Supported Providers
- OpenAI
- Topaz

View file

@ -89,7 +89,21 @@ response = completion(
```
</TabItem>
<TabItem value="xai" label="xAI">
```python
from litellm import completion
import os
## set ENV variables
os.environ["XAI_API_KEY"] = "your-api-key"
response = completion(
model="xai/grok-2-latest",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
</TabItem>
<TabItem value="vertex" label="VertexAI">
```python
@ -108,6 +122,24 @@ response = completion(
</TabItem>
<TabItem value="nvidia" label="NVIDIA">
```python
from litellm import completion
import os
## set ENV variables
os.environ["NVIDIA_NIM_API_KEY"] = "nvidia_api_key"
os.environ["NVIDIA_NIM_API_BASE"] = "nvidia_nim_endpoint_url"
response = completion(
model="nvidia_nim/<model_name>",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
</TabItem>
<TabItem value="hugging" label="HuggingFace">
```python
@ -254,7 +286,22 @@ response = completion(
```
</TabItem>
<TabItem value="xai" label="xAI">
```python
from litellm import completion
import os
## set ENV variables
os.environ["XAI_API_KEY"] = "your-api-key"
response = completion(
model="xai/grok-2-latest",
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
```
</TabItem>
<TabItem value="vertex" label="VertexAI">
```python
@ -274,6 +321,24 @@ response = completion(
</TabItem>
<TabItem value="nvidia" label="NVIDIA">
```python
from litellm import completion
import os
## set ENV variables
os.environ["NVIDIA_NIM_API_KEY"] = "nvidia_api_key"
os.environ["NVIDIA_NIM_API_BASE"] = "nvidia_nim_endpoint_url"
response = completion(
model="nvidia_nim/<model_name>",
messages=[{ "content": "Hello, how are you?","role": "user"}]
stream=True,
)
```
</TabItem>
<TabItem value="hugging" label="HuggingFace">
```python

View file

@ -19,6 +19,7 @@ Make an account on [Arize AI](https://app.arize.com/auth/login)
## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with Arize.
You can also use the instrumentor option instead of the callback, which you can find [here](https://docs.arize.com/arize/llm-tracing/tracing-integrations-auto/litellm).
```python
litellm.callbacks = ["arize"]
@ -28,7 +29,7 @@ import litellm
import os
os.environ["ARIZE_SPACE_KEY"] = ""
os.environ["ARIZE_API_KEY"] = "" # defaults to litellm-completion
os.environ["ARIZE_API_KEY"] = ""
# LLM API Keys
os.environ['OPENAI_API_KEY']=""

View file

@ -78,7 +78,10 @@ Following are the allowed fields in metadata, their types, and their description
* `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
* `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
* `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
* `tags: Optional[list]` - This is a list of tags. This is useful for segmenting inference calls by tags.
* `user_feedback: Optional[str]` - The end user's feedback.
* `model_options: Optional[dict]` - This is a dictionary of model options. This is useful for getting insights into how model behavior affects your end users.
* `custom_attributes: Optional[dict]` - This is a dictionary of custom attributes. This is useful for additional information about the inference.
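For example, these fields can be attached via the `metadata` param on the completion call - a minimal sketch, assuming the Athina callback is already configured as shown earlier in this doc (the field values are illustrative):
```python
import litellm

litellm.success_callback = ["athina"]

response = litellm.completion(
    model="gpt-4o",
    messages=[{"role": "user", "content": "What is a distributed cache?"}],
    metadata={
        "context": "Retrieved docs about caching go here.",  # illustrative values
        "expected_response": "A distributed cache stores data across multiple nodes.",
        "user_query": "What is a distributed cache?",
        "tags": ["docs-qa"],
        "custom_attributes": {"team": "platform"},
    },
)
```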
## Using a self hosted deployment of Athina

View file

@ -20,8 +20,6 @@ class MyCustomHandler(CustomLogger):
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
print(f"Post-API Call")
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Stream")
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Success")
@ -31,9 +29,6 @@ class MyCustomHandler(CustomLogger):
#### ASYNC #### - for acompletion/aembeddings
async def async_log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Async Streaming")
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Async Success")
@ -127,8 +122,7 @@ from litellm import acompletion
class MyCustomHandler(CustomLogger):
#### ASYNC ####
async def async_log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Async Streaming")
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Async Success")

View file

@ -1,3 +1,5 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import Image from '@theme/IdealImage';
# Comet Opik - Logging + Evals
@ -21,17 +23,16 @@ Use just 4 lines of code, to instantly log your responses **across all providers
Get your Opik API Key by signing up [here](https://www.comet.com/signup?utm_source=litelllm&utm_medium=docs&utm_content=api_key_cell)!
```python
from litellm.integrations.opik.opik import OpikLogger
import litellm
opik_logger = OpikLogger()
litellm.callbacks = [opik_logger]
litellm.callbacks = ["opik"]
```
Full examples:
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm.integrations.opik.opik import OpikLogger
import litellm
import os
@ -43,8 +44,7 @@ os.environ["OPIK_WORKSPACE"] = ""
os.environ["OPENAI_API_KEY"] = ""
# set "opik" as a callback, litellm will send the data to an Opik server (such as comet.com)
opik_logger = OpikLogger()
litellm.callbacks = [opik_logger]
litellm.callbacks = ["opik"]
# openai call
response = litellm.completion(
@ -55,18 +55,16 @@ response = litellm.completion(
)
```
If you are liteLLM within a function tracked using Opik's `@track` decorator,
If you are using liteLLM within a function tracked using Opik's `@track` decorator,
you will need to provide the `current_span_data` field in the metadata attribute
so that the LLM call is assigned to the correct trace:
```python
from opik import track
from opik.opik_context import get_current_span_data
from litellm.integrations.opik.opik import OpikLogger
import litellm
opik_logger = OpikLogger()
litellm.callbacks = [opik_logger]
litellm.callbacks = ["opik"]
@track()
def streaming_function(input):
@ -87,6 +85,126 @@ response = streaming_function("Why is tracking and evaluation of LLMs important?
chunks = list(response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo-testing
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
callbacks: ["opik"]
environment_variables:
OPIK_API_KEY: ""
OPIK_WORKSPACE: ""
```
2. Run proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo-testing",
"messages": [
{
"role": "user",
"content": "What's the weather like in Boston today?"
}
]
}'
```
</TabItem>
</Tabs>
## Opik-Specific Parameters
These can be passed inside metadata with the `opik` key.
### Fields
- `project_name` - Name of the Opik project to send data to.
- `current_span_data` - The current span data to be used for tracing.
- `tags` - Tags to be used for tracing.
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from opik import track
from opik.opik_context import get_current_span_data
import litellm
litellm.callbacks = ["opik"]
messages = [{"role": "user", "content": input}]
response = litellm.completion(
model="gpt-3.5-turbo",
messages=messages,
metadata = {
"opik": {
"current_span_data": get_current_span_data(),
"tags": ["streaming-test"],
},
}
)
return response
```
</TabItem>
<TabItem value="proxy" label="Proxy">
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo-testing",
"messages": [
{
"role": "user",
"content": "What's the weather like in Boston today?"
}
],
"metadata": {
"opik": {
"current_span_data": "...",
"tags": ["streaming-test"],
},
}
}'
```
</TabItem>
</Tabs>
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)

View file

@ -0,0 +1,75 @@
import Image from '@theme/IdealImage';
# Phoenix OSS
Open source tracing and evaluation platform
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
## Pre-Requisites
Make an account on [Phoenix OSS](https://phoenix.arize.com)
OR self-host your own instance of [Phoenix](https://docs.arize.com/phoenix/deployment)
## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with Phoenix.
You can also use the instrumentor option instead of the callback, which you can find [here](https://docs.arize.com/phoenix/tracing/integrations-tracing/litellm).
```python
litellm.callbacks = ["arize_phoenix"]
```
```python
import litellm
import os
os.environ["PHOENIX_API_KEY"] = "" # Necessary only using Phoenix Cloud
os.environ["PHOENIX_COLLECTOR_HTTP_ENDPOINT"] = "" # The URL of your Phoenix OSS instance
# This defaults to https://app.phoenix.arize.com/v1/traces for Phoenix Cloud
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set arize_phoenix as a callback, litellm will send the data to Phoenix
litellm.callbacks = ["arize_phoenix"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
### Using with LiteLLM Proxy
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize_phoenix"]
environment_variables:
PHOENIX_API_KEY: "d0*****"
PHOENIX_COLLECTOR_ENDPOINT: "https://app.phoenix.arize.com/v1/traces" # OPTIONAL, for setting the GRPC endpoint
PHOENIX_COLLECTOR_HTTP_ENDPOINT: "https://app.phoenix.arize.com/v1/traces" # OPTIONAL, for setting the HTTP endpoint
```
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai

View file

@ -0,0 +1,85 @@
# Assembly AI
Pass-through endpoints for Assembly AI - call Assembly AI endpoints, in native format (no translation).
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ✅ | works across all integrations |
| Logging | ✅ | works across all integrations |
Supports **ALL** Assembly AI Endpoints
[**See All Assembly AI Endpoints**](https://www.assemblyai.com/docs/api-reference)
<iframe width="840" height="500" src="https://www.loom.com/embed/aac3f4d74592448992254bfa79b9f62d?sid=267cd0ab-d92b-42fa-b97a-9f385ef8930c" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
## Quick Start
Let's call the Assembly AI [`/v2/transcripts` endpoint](https://www.assemblyai.com/docs/api-reference/transcripts)
1. Add Assembly AI API Key to your environment
```bash
export ASSEMBLYAI_API_KEY=""
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the Assembly AI `/v2/transcripts` endpoint
```python
import assemblyai as aai
LITELLM_VIRTUAL_KEY = "sk-1234" # <your-virtual-key>
LITELLM_PROXY_BASE_URL = "http://0.0.0.0:4000/assemblyai" # <your-proxy-base-url>/assemblyai
aai.settings.api_key = f"Bearer {LITELLM_VIRTUAL_KEY}"
aai.settings.base_url = LITELLM_PROXY_BASE_URL
# URL of the file to transcribe
FILE_URL = "https://assembly.ai/wildfires.mp3"
# You can also transcribe a local file by passing in a file path
# FILE_URL = './path/to/file.mp3'
transcriber = aai.Transcriber()
transcript = transcriber.transcribe(FILE_URL)
print(transcript)
print(transcript.id)
```
## Calling Assembly AI EU endpoints
If you want to send your request to the Assembly AI EU endpoint, you can do so by setting the `LITELLM_PROXY_BASE_URL` to `<your-proxy-base-url>/eu.assemblyai`
```python
import assemblyai as aai
LITELLM_VIRTUAL_KEY = "sk-1234" # <your-virtual-key>
LITELLM_PROXY_BASE_URL = "http://0.0.0.0:4000/eu.assemblyai" # <your-proxy-base-url>/eu.assemblyai
aai.settings.api_key = f"Bearer {LITELLM_VIRTUAL_KEY}"
aai.settings.base_url = LITELLM_PROXY_BASE_URL
# URL of the file to transcribe
FILE_URL = "https://assembly.ai/wildfires.mp3"
# You can also transcribe a local file by passing in a file path
# FILE_URL = './path/to/file.mp3'
transcriber = aai.Transcriber()
transcript = transcriber.transcribe(FILE_URL)
print(transcript)
print(transcript.id)
```

View file

@ -0,0 +1,95 @@
# OpenAI Passthrough
Pass-through endpoints for `/openai`
## Overview
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ❌ | Not supported |
| Logging | ✅ | Works across all integrations |
| Streaming | ✅ | Fully supported |
### When to use this?
- For 90% of your use cases, you should use the [native LiteLLM OpenAI Integration](https://docs.litellm.ai/docs/providers/openai) (`/chat/completions`, `/embeddings`, `/completions`, `/images`, `/batches`, etc.)
- Use this passthrough to call less popular or newer OpenAI endpoints that LiteLLM doesn't fully support yet, such as `/assistants`, `/threads`, `/vector_stores`
Simply replace `https://api.openai.com` with `LITELLM_PROXY_BASE_URL/openai`
## Usage Examples
### Assistants API
#### Create OpenAI Client
Make sure you do the following:
- Point `base_url` to your `LITELLM_PROXY_BASE_URL/openai`
- Use your `LITELLM_API_KEY` as the `api_key`
```python
import openai
client = openai.OpenAI(
base_url="http://0.0.0.0:4000/openai", # <your-proxy-url>/openai
api_key="sk-anything" # <your-proxy-api-key>
)
```
#### Create an Assistant
```python
# Create an assistant
assistant = client.beta.assistants.create(
name="Math Tutor",
instructions="You are a math tutor. Help solve equations.",
model="gpt-4o",
)
```
#### Create a Thread
```python
# Create a thread
thread = client.beta.threads.create()
```
#### Add a Message to the Thread
```python
# Add a message
message = client.beta.threads.messages.create(
thread_id=thread.id,
role="user",
content="Solve 3x + 11 = 14",
)
```
#### Run the Assistant
```python
# Create a run to get the assistant's response
run = client.beta.threads.runs.create(
thread_id=thread.id,
assistant_id=assistant.id,
)
# Check run status
run_status = client.beta.threads.runs.retrieve(
thread_id=thread.id,
run_id=run.id
)
```
#### Retrieve Messages
```python
# List messages after the run completes
messages = client.beta.threads.messages.list(
thread_id=thread.id
)
```
#### Delete the Assistant
```python
# Delete the assistant when done
client.beta.assistants.delete(assistant.id)
```

View file

@ -0,0 +1,14 @@
# 🐕 Elroy
Elroy is a scriptable AI assistant that remembers and sets goals.
Interact through the command line, share memories via MCP, or build your own tools using Python.
[![Static Badge][github-shield]][github-url]
[![Discord][discord-shield]][discord-url]
[github-shield]: https://img.shields.io/badge/Github-repo-white?logo=github
[github-url]: https://github.com/elroy-bot/elroy
[discord-shield]:https://img.shields.io/discord/1200684659277832293?color=7289DA&label=Discord&logo=discord&logoColor=white
[discord-url]: https://discord.gg/5PJUY4eMce

View file

@ -0,0 +1,5 @@
# PDL - A YAML-based approach to prompt programming
GitHub: https://github.com/IBM/prompt-declaration-language
PDL is a declarative approach to prompt programming, helping users to accumulate messages implicitly, with support for model chaining and tool use.

View file

@ -0,0 +1,9 @@
# pgai
[pgai](https://github.com/timescale/pgai) is a suite of tools to develop RAG, semantic search, and other AI applications more easily with PostgreSQL.
If you don't know what pgai is yet check out the [README](https://github.com/timescale/pgai)!
If you're already familiar with pgai, you can find litellm specific docs here:
- Litellm for [model calling](https://github.com/timescale/pgai/blob/main/docs/model_calling/litellm.md) in pgai
- Use the [litellm provider](https://github.com/timescale/pgai/blob/main/docs/vectorizer/api-reference.md#aiembedding_litellm) to automatically create embeddings for your data via the pgai vectorizer.

View file

@ -0,0 +1,8 @@
# 🤗 Smolagents
`smolagents` is a barebones library for agents. Agents write python code to call tools and orchestrate other agents.
- [Github](https://github.com/huggingface/smolagents)
- [Docs](https://huggingface.co/docs/smolagents/index)
- [Build your agent](https://huggingface.co/docs/smolagents/guided_tour)
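A minimal sketch of pointing `smolagents` at a LiteLLM-supported model via its `LiteLLMModel` wrapper (based on the smolagents guided tour - check their docs for the current API; the model string is just an example):
```python
from smolagents import CodeAgent, LiteLLMModel

# any LiteLLM-supported model string should work here
model = LiteLLMModel(model_id="anthropic/claude-3-5-sonnet-latest")

agent = CodeAgent(tools=[], model=model, add_base_tools=True)
agent.run("How many seconds are there in a leap year?")
```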

View file

@ -0,0 +1,160 @@
# AI/ML API
Getting started with the AI/ML API is simple. Follow these steps to set up your integration:
### 1. Get Your API Key
To begin, you need an API key. You can obtain yours here:
🔑 [Get Your API Key](https://aimlapi.com/app/keys/?utm_source=aimlapi&utm_medium=github&utm_campaign=integration)
### 2. Explore Available Models
Looking for a different model? Browse the full list of supported models:
📚 [Full List of Models](https://docs.aimlapi.com/api-overview/model-database/text-models?utm_source=aimlapi&utm_medium=github&utm_campaign=integration)
### 3. Read the Documentation
For detailed setup instructions and usage guidelines, check out the official documentation:
📖 [AI/ML API Docs](https://docs.aimlapi.com/quickstart/setting-up?utm_source=aimlapi&utm_medium=github&utm_campaign=integration)
### 4. Need Help?
If you have any questions, feel free to reach out. We're happy to assist! 🚀 [Discord](https://discord.gg/hvaUsJpVJf)
## Usage
You can choose from LLaMA, Qwen, Flux, and 200+ other open and closed-source models on aimlapi.com/models. For example:
```python
import litellm
response = litellm.completion(
model="openai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", # The model name must include prefix "openai" + the model name from ai/ml api
api_key="", # your aiml api-key
api_base="https://api.aimlapi.com/v2",
messages=[
{
"role": "user",
"content": "Hey, how's it going?",
}
],
)
```
## Streaming
```python
import litellm
response = litellm.completion(
model="openai/Qwen/Qwen2-72B-Instruct", # The model name must include prefix "openai" + the model name from ai/ml api
api_key="", # your aiml api-key
api_base="https://api.aimlapi.com/v2",
messages=[
{
"role": "user",
"content": "Hey, how's it going?",
}
],
stream=True,
)
for chunk in response:
print(chunk)
```
## Async Completion
```python
import asyncio
import litellm
async def main():
response = await litellm.acompletion(
model="openai/anthropic/claude-3-5-haiku", # The model name must include prefix "openai" + the model name from ai/ml api
api_key="", # your aiml api-key
api_base="https://api.aimlapi.com/v2",
messages=[
{
"role": "user",
"content": "Hey, how's it going?",
}
],
)
print(response)
if __name__ == "__main__":
asyncio.run(main())
```
## Async Streaming
```python
import asyncio
import traceback
import litellm
async def main():
try:
print("test acompletion + streaming")
response = await litellm.acompletion(
model="openai/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", # The model name must include prefix "openai" + the model name from ai/ml api
api_key="", # your aiml api-key
api_base="https://api.aimlapi.com/v2",
messages=[{"content": "Hey, how's it going?", "role": "user"}],
stream=True,
)
print(f"response: {response}")
async for chunk in response:
print(chunk)
    except Exception:
        print(f"error occurred: {traceback.format_exc()}")
if __name__ == "__main__":
asyncio.run(main())
```
## Async Embedding
```python
import asyncio
import litellm
async def main():
response = await litellm.aembedding(
model="openai/text-embedding-3-small", # The model name must include prefix "openai" + the model name from ai/ml api
api_key="", # your aiml api-key
api_base="https://api.aimlapi.com/v1", # 👈 the URL has changed from v2 to v1
input="Your text string",
)
print(response)
if __name__ == "__main__":
asyncio.run(main())
```
## Async Image Generation
```python
import asyncio
import litellm
async def main():
response = await litellm.aimage_generation(
model="openai/dall-e-3", # The model name must include prefix "openai" + the model name from ai/ml api
api_key="", # your aiml api-key
api_base="https://api.aimlapi.com/v1", # 👈 the URL has changed from v2 to v1
prompt="A cute baby sea otter",
)
print(response)
if __name__ == "__main__":
asyncio.run(main())
```

View file

@ -819,6 +819,114 @@ resp = litellm.completion(
print(f"\nResponse: {resp}")
```
## Usage - Thinking / `reasoning_content`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
resp = completion(
model="anthropic/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
- model_name: claude-3-7-sonnet-20250219
litellm_params:
model: anthropic/claude-3-7-sonnet-20250219
api_key: os.environ/ANTHROPIC_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "claude-3-7-sonnet-20250219",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='claude-3-7-sonnet-20250219',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
provider_specific_fields={
'citations': None,
'thinking_blocks': [
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6...'
}
]
}
),
thinking_blocks=[
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6AGB...'
}
],
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
## **Passing Extra Headers to Anthropic API**
Pass `extra_headers: dict` to `litellm.completion`
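A minimal SDK sketch - the header below is just an illustration, pass whatever headers your account actually needs:
```python
import os
from litellm import completion

os.environ["ANTHROPIC_API_KEY"] = ""

response = completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"},  # illustrative header
)
```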
@ -987,6 +1095,106 @@ curl http://0.0.0.0:4000/v1/chat/completions \
</TabItem>
</Tabs>
## [BETA] Citations API
Pass `citations: {"enabled": true}` to Anthropic, to get citations on your document responses.
Note: This interface is in BETA. If you have feedback on how citations should be returned, please [tell us here](https://github.com/BerriAI/litellm/issues/7970#issuecomment-2644437943)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
resp = completion(
model="claude-3-5-sonnet-20241022",
messages=[
{
"role": "user",
"content": [
{
"type": "document",
"source": {
"type": "text",
"media_type": "text/plain",
"data": "The grass is green. The sky is blue.",
},
"title": "My Document",
"context": "This is a trustworthy document.",
"citations": {"enabled": True},
},
{
"type": "text",
"text": "What color is the grass and sky?",
},
],
}
],
)
citations = resp.choices[0].message.provider_specific_fields["citations"]
assert citations is not None
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: anthropic-claude
litellm_params:
model: anthropic/claude-3-5-sonnet-20241022
api_key: os.environ/ANTHROPIC_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "anthropic-claude",
"messages": [
{
"role": "user",
"content": [
{
"type": "document",
"source": {
"type": "text",
"media_type": "text/plain",
"data": "The grass is green. The sky is blue.",
},
"title": "My Document",
"context": "This is a trustworthy document.",
"citations": {"enabled": True},
},
{
"type": "text",
"text": "What color is the grass and sky?",
},
],
}
]
}'
```
</TabItem>
</Tabs>
## Usage - passing 'user_id' to Anthropic
LiteLLM translates the OpenAI `user` param to Anthropic's `metadata[user_id]` param.
@ -1035,3 +1243,4 @@ curl http://0.0.0.0:4000/v1/chat/completions \
</TabItem>
</Tabs>

View file

@ -10,7 +10,7 @@ import TabItem from '@theme/TabItem';
| Property | Details |
|-------|-------|
| Description | Azure OpenAI Service provides REST API access to OpenAI's powerful language models including o1, o1-mini, GPT-4o, GPT-4o mini, GPT-4 Turbo with Vision, GPT-4, GPT-3.5-Turbo, and Embeddings model series |
| Provider Route on LiteLLM | `azure/` |
| Provider Route on LiteLLM | `azure/`, [`azure/o_series/`](#azure-o-series-models) |
| Supported Operations | [`/chat/completions`](#azure-openai-chat-completion-models), [`/completions`](#azure-instruct-models), [`/embeddings`](../embedding/supported_embedding#azure-openai-embedding-models), [`/audio/speech`](#azure-text-to-speech-tts), [`/audio/transcriptions`](../audio_transcription), `/fine_tuning`, [`/batches`](#azure-batches-api), `/files`, [`/images`](../image_generation#azure-openai-image-generation-models) |
| Link to Provider Doc | [Azure OpenAI ↗](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview)
@ -948,6 +948,65 @@ Expected Response:
{"data":[{"id":"batch_R3V...}
```
## O-Series Models
Azure OpenAI O-Series models are supported on LiteLLM.
LiteLLM routes any deployment name with `o1` or `o3` in the model name to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
**Automatic Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o3-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
**Explicit Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o_series/my-random-deployment-name
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
## Advanced
### Azure API Load-Balancing

View file

@ -2,7 +2,17 @@ import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# AWS Bedrock
ALL Bedrock models (Anthropic, Meta, Mistral, Amazon, etc.) are Supported
ALL Bedrock models (Anthropic, Meta, Deepseek, Mistral, Amazon, etc.) are Supported
| Property | Details |
|-------|-------|
| Description | Amazon Bedrock is a fully managed service that offers a choice of high-performing foundation models (FMs). |
| Provider Route on LiteLLM | `bedrock/`, [`bedrock/converse/`](#set-converse--invoke-route), [`bedrock/invoke/`](#set-invoke-route), [`bedrock/converse_like/`](#calling-via-internal-proxy), [`bedrock/llama/`](#deepseek-not-r1), [`bedrock/deepseek_r1/`](#deepseek-r1) |
| Provider Doc | [Amazon Bedrock ↗](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html) |
| Supported OpenAI Endpoints | `/chat/completions`, `/completions`, `/embeddings`, `/images/generations` |
| Rerank Endpoint | `/rerank` |
| Pass-through Endpoint | [Supported](../pass_through/bedrock.md) |
LiteLLM requires `boto3` to be installed on your system for Bedrock requests
```shell
@ -276,9 +286,12 @@ print(response)
</TabItem>
</Tabs>
## Usage - Function Calling
## Usage - Function Calling / Tool calling
LiteLLM uses Bedrock's Converse API for making tool calls
LiteLLM supports tool calling via Bedrock's Converse and Invoke APIs.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
@ -323,6 +336,69 @@ assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: bedrock-claude-3-7
litellm_params:
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 # for bedrock invoke, specify `bedrock/invoke/<model>`
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_API_KEY" \
-d '{
"model": "bedrock-claude-3-7",
"messages": [
{
"role": "user",
"content": "What'\''s the weather like in Boston today?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["location"]
}
}
}
],
"tool_choice": "auto"
}'
```
</TabItem>
</Tabs>
## Usage - Vision
@ -367,6 +443,226 @@ print(f"\nResponse: {resp}")
```
## Usage - 'thinking' / 'reasoning content'
This is currently only supported for Anthropic's Claude 3.7 Sonnet + Deepseek R1.
Works on v1.61.20+.
Returns 2 new fields in `message` and `delta` object:
- `reasoning_content` - string - The reasoning content of the response
- `thinking_blocks` - list of objects (Anthropic only) - The thinking blocks of the response
Each object has the following fields:
- `type` - Literal["thinking"] - The type of thinking block
- `thinking` - string - The thinking of the response. Also returned in `reasoning_content`
- `signature` - string - A base64 encoded string, returned by Anthropic.
The `signature` is required by Anthropic on subsequent calls, if 'thinking' content is passed in (only required to use `thinking` with tool calling). [Learn more](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os

# set env
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
resp = completion(
model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
print(resp)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: bedrock-claude-3-7
litellm_params:
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
thinking: {"type": "enabled", "budget_tokens": 1024} # 👈 EITHER HERE OR ON REQUEST
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "bedrock-claude-3-7",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024} # 👈 EITHER HERE OR ON CONFIG.YAML
}'
```
</TabItem>
</Tabs>
**Expected Response**
Same as [Anthropic API response](../providers/anthropic#usage---thinking--reasoning_content).
```python
{
"id": "chatcmpl-c661dfd7-7530-49c9-b0cc-d5018ba4727d",
"created": 1740640366,
"model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0",
"object": "chat.completion",
"system_fingerprint": null,
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "The capital of France is Paris. It's not only the capital city but also the largest city in France, serving as the country's major cultural, economic, and political center.",
"role": "assistant",
"tool_calls": null,
"function_call": null,
"reasoning_content": "The capital of France is Paris. This is a straightforward factual question.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The capital of France is Paris. This is a straightforward factual question.",
"signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+yCHpBY7U6FQW8/FcoLewocJQPa2HnmLM+NECy50y44F/kD4SULFXi57buI9fAvyBwtyjlOiO0SDE3+r3spdg6PLOo9PBoMma2ku5OTAoR46j9VIjDRlvNmBvff7YW4WI9oU8XagaOBSxLPxElrhyuxppEn7m6bfT40dqBSTDrfiw4FYB4qEPETTI6TA6wtjGAAqmFqKTo="
}
]
}
}
],
"usage": {
"completion_tokens": 64,
"prompt_tokens": 42,
"total_tokens": 106,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
```
## Usage - Structured Output / JSON mode
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
from pydantic import BaseModel
# set env
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
class EventsList(BaseModel):
events: list[CalendarEvent]
response = completion(
model="bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0", # specify invoke via `bedrock/invoke/anthropic.claude-3-7-sonnet-20250219-v1:0`
response_format=EventsList,
messages=[
{"role": "system", "content": "You are a helpful assistant designed to output JSON."},
{"role": "user", "content": "Who won the world series in 2020?"}
],
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: bedrock-claude-3-7
litellm_params:
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 # specify invoke via `bedrock/invoke/<model_name>`
aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY
aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "bedrock-claude-3-7",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant designed to output JSON."
},
{
"role": "user",
"content": "Who won the worlde series in 2020?"
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "math_reasoning",
"description": "reason about maths",
"schema": {
"type": "object",
"properties": {
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"explanation": { "type": "string" },
"output": { "type": "string" }
},
"required": ["explanation", "output"],
"additionalProperties": false
}
},
"final_answer": { "type": "string" }
},
"required": ["steps", "final_answer"],
"additionalProperties": false
},
"strict": true
}
}
}'
```
</TabItem>
</Tabs>
## Usage - Bedrock Guardrails
Example of using [Bedrock Guardrails with LiteLLM](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-use-converse-api.html)
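A minimal SDK sketch - the guardrail identifier and version below are placeholders, use the values from your own Bedrock guardrail:
```python
import os
from litellm import completion

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""

response = completion(
    model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
    messages=[{"role": "user", "content": "Where can I buy a coffee?"}],
    guardrailConfig={
        "guardrailIdentifier": "gr-abc123",  # placeholder - your guardrail ID
        "guardrailVersion": "DRAFT",         # or a specific published version
        "trace": "disabled",                 # set to "enabled" to return trace info
    },
)
```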
@ -792,6 +1088,16 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
LiteLLM supports Document Understanding for Bedrock models - [AWS Bedrock Docs](https://docs.aws.amazon.com/nova/latest/userguide/modalities-document.html).
:::info
LiteLLM supports ALL Bedrock document types -
E.g.: "pdf", "csv", "doc", "docx", "xls", "xlsx", "html", "txt", "md"
You can also pass these as either `image_url` or `base64`
:::
### url
<Tabs>
@ -1191,6 +1497,209 @@ response = completion(
aws_bedrock_client=bedrock,
)
```
## Calling via Internal Proxy
Use the `bedrock/converse_like/model` endpoint to call bedrock converse model via your internal proxy.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
response = completion(
model="bedrock/converse_like/some-model",
messages=[{"role": "user", "content": "What's AWS?"}],
api_key="sk-1234",
api_base="https://some-api-url/models",
extra_headers={"test": "hello world"},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
1. Setup config.yaml
```yaml
model_list:
- model_name: anthropic-claude
litellm_params:
model: bedrock/converse_like/some-model
api_base: https://some-api-url/models
```
2. Start proxy server
```bash
litellm --config config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "anthropic-claude",
"messages": [
{
"role": "system",
"content": "You are a helpful math tutor. Guide the user through the solution step by step."
},
{ "content": "Hello, how are you?", "role": "user" }
]
}'
```
</TabItem>
</Tabs>
**Expected Output URL**
```bash
https://some-api-url/models
```
## Bedrock Imported Models (Deepseek, Deepseek R1)
### Deepseek R1
This is a separate route, as the chat template is different.
| Property | Details |
|----------|---------|
| Provider Route | `bedrock/deepseek_r1/{model_arn}` |
| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Deepseek Bedrock Imported Model](https://aws.amazon.com/blogs/machine-learning/deploy-deepseek-r1-distilled-llama-models-with-amazon-bedrock-custom-model-import/) |
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
response = completion(
model="bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n", # bedrock/deepseek_r1/{your-model-arn}
messages=[{"role": "user", "content": "Tell me a joke"}],
)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: DeepSeek-R1-Distill-Llama-70B
litellm_params:
model: bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "DeepSeek-R1-Distill-Llama-70B", # 👈 the 'model_name' in config
"messages": [
{
"role": "user",
"content": "what llm are you"
}
    ]
}'
```
</TabItem>
</Tabs>
### Deepseek (not R1)
| Property | Details |
|----------|---------|
| Provider Route | `bedrock/llama/{model_arn}` |
| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Deepseek Bedrock Imported Model](https://aws.amazon.com/blogs/machine-learning/deploy-deepseek-r1-distilled-llama-models-with-amazon-bedrock-custom-model-import/) |
Use this route to call Bedrock Imported Models that follow the `llama` Invoke Request / Response spec
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
response = completion(
model="bedrock/llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n", # bedrock/llama/{your-model-arn}
messages=[{"role": "user", "content": "Tell me a joke"}],
)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: DeepSeek-R1-Distill-Llama-70B
litellm_params:
model: bedrock/llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "DeepSeek-R1-Distill-Llama-70B", # 👈 the 'model_name' in config
"messages": [
{
"role": "user",
"content": "what llm are you"
}
    ]
}'
```
</TabItem>
</Tabs>
## Provisioned throughput models
@ -1406,3 +1915,5 @@ curl http://0.0.0.0:4000/rerank \
</TabItem>
</Tabs>

View file

@ -23,14 +23,16 @@ import os
os.environ['CEREBRAS_API_KEY'] = ""
response = completion(
model="cerebras/meta/llama3-70b-instruct",
model="cerebras/llama3-70b-instruct",
messages=[
{
"role": "user",
"content": "What's the weather like in Boston today in Fahrenheit?",
"content": "What's the weather like in Boston today in Fahrenheit? (Write in JSON)",
}
],
max_tokens=10,
# The prompt should include JSON if 'json_object' is selected; otherwise, you will get error code 400.
response_format={ "type": "json_object" },
seed=123,
stop=["\n\n"],
@ -50,15 +52,17 @@ import os
os.environ['CEREBRAS_API_KEY'] = ""
response = completion(
model="cerebras/meta/llama3-70b-instruct",
model="cerebras/llama3-70b-instruct",
messages=[
{
"role": "user",
"content": "What's the weather like in Boston today in Fahrenheit?",
"content": "What's the weather like in Boston today in Fahrenheit? (Write in JSON)",
}
],
stream=True,
max_tokens=10,
# The prompt should include JSON if 'json_object' is selected; otherwise, you will get error code 400.
response_format={ "type": "json_object" },
seed=123,
stop=["\n\n"],

View file

@ -108,7 +108,7 @@ response = embedding(
### Usage
LiteLLM supports the v1 and v2 clients for Cohere rerank. By default, the `rerank` endpoint uses the v2 client, but you can specify the v1 client by explicitly calling `v1/rerank`
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK Usage">

View file

@ -76,7 +76,7 @@ resp = completion(
)
print(
resp.choices[0].message.provider_specific_fields["reasoning_content"]
resp.choices[0].message.reasoning_content
)
```

View file

@ -688,7 +688,9 @@ response = litellm.completion(
|-----------------------|--------------------------------------------------------|--------------------------------|
| gemini-pro | `completion(model='gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-latest | `completion(model='gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro-vision | `completion(model='gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-2.0-flash | `completion(model='gemini/gemini-2.0-flash', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-2.0-flash-exp | `completion(model='gemini/gemini-2.0-flash-exp', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-2.0-flash-lite-preview-02-05 | `completion(model='gemini/gemini-2.0-flash-lite-preview-02-05', messages)` | `os.environ['GEMINI_API_KEY']` |

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Infinity
| Property | Details |
@ -12,6 +15,9 @@
```python
from litellm import rerank
import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
response = rerank(
model="infinity/rerank",
@ -65,3 +71,114 @@ curl http://0.0.0.0:4000/rerank \
```
## Supported Cohere Rerank API Params
| Param | Type | Description |
|-------|-------|-------|
| `query` | `str` | The query to rerank the documents against |
| `documents` | `list[str]` | The documents to rerank |
| `top_n` | `int` | The number of documents to return |
| `return_documents` | `bool` | Whether to return the documents in the response |
### Usage - Return Documents
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import rerank
import os

os.environ["INFINITY_API_BASE"] = "http://localhost:8080"

response = rerank(
model="infinity/rerank",
query="What is the capital of France?",
documents=["Paris", "London", "Berlin", "Madrid"],
return_documents=True,
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/rerank \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-rerank",
"query": "What is the capital of France?",
"documents": [
"Paris",
"London",
"Berlin",
"Madrid"
],
"return_documents": True,
}'
```
</TabItem>
</Tabs>
## Pass Provider-specific Params
Any unmapped params will be passed to the provider as-is.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import rerank
import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
response = rerank(
model="infinity/rerank",
query="What is the capital of France?",
documents=["Paris", "London", "Berlin", "Madrid"],
raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: custom-infinity-rerank
litellm_params:
model: infinity/rerank
      api_base: http://localhost:8080
raw_scores: True # 👈 EITHER SET PROVIDER-SPECIFIC PARAMS HERE OR IN REQUEST BODY
```
2. Start litellm
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl http://0.0.0.0:4000/rerank \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-rerank",
"query": "What is the capital of the United States?",
"documents": [
"Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. is the capital of the United States.",
"Capital punishment has existed in the United States since before it was a country."
],
"raw_scores": True # 👈 PROVIDER-SPECIFIC PARAM
}'
```
</TabItem>
</Tabs>

View file

@ -3,13 +3,15 @@ import TabItem from '@theme/TabItem';
# LiteLLM Proxy (LLM Gateway)
:::tip
[LiteLLM provides a **self hosted** proxy server (AI Gateway)](../simple_proxy) to call all the LLMs in the OpenAI format
| Property | Details |
|-------|-------|
| Description | LiteLLM Proxy is an OpenAI-compatible gateway that allows you to interact with multiple LLM providers through a unified API. Simply use the `litellm_proxy/` prefix before the model name to route your requests through the proxy. |
| Provider Route on LiteLLM | `litellm_proxy/` (add this prefix to the model name, to route any requests to litellm_proxy - e.g. `litellm_proxy/your-model-name`) |
| Setup LiteLLM Gateway | [LiteLLM Gateway ↗](../simple_proxy) |
| Supported Endpoints |`/chat/completions`, `/completions`, `/embeddings`, `/audio/speech`, `/audio/transcriptions`, `/images`, `/rerank` |
:::
**[LiteLLM Proxy](../simple_proxy) is OpenAI compatible**; just add the `litellm_proxy/` prefix before the model name
## Required Variables
@ -83,7 +85,76 @@ for chunk in response:
print(chunk)
```
## Embeddings
```python
import litellm
response = litellm.embedding(
model="litellm_proxy/your-embedding-model",
input="Hello world",
api_base="your-litellm-proxy-url",
api_key="your-litellm-proxy-api-key"
)
```
## Image Generation
```python
import litellm
response = litellm.image_generation(
model="litellm_proxy/dall-e-3",
prompt="A beautiful sunset over mountains",
api_base="your-litellm-proxy-url",
api_key="your-litellm-proxy-api-key"
)
```
## Audio Transcription
```python
import litellm
response = litellm.transcription(
model="litellm_proxy/whisper-1",
file="your-audio-file",
api_base="your-litellm-proxy-url",
api_key="your-litellm-proxy-api-key"
)
```
## Text to Speech
```python
import litellm
response = litellm.speech(
model="litellm_proxy/tts-1",
input="Hello world",
api_base="your-litellm-proxy-url",
api_key="your-litellm-proxy-api-key"
)
```
## Rerank
```python
import litellm
response = litellm.rerank(
model="litellm_proxy/rerank-english-v2.0",
query="What is machine learning?",
documents=[
"Machine learning is a field of study in artificial intelligence",
"Biology is the study of living organisms"
],
api_base="your-litellm-proxy-url",
api_key="your-litellm-proxy-api-key"
)
```
## **Usage with Langchain, LLamaindex, OpenAI Js, Anthropic SDK, Instructor**
#### [Follow this doc to see how to use litellm proxy with langchain, llamaindex, anthropic etc](../proxy/user_keys)

View file

@ -69,7 +69,7 @@ for chunk in response:
## Usage with LiteLLM Proxy Server
Here's how to call a XAI model with the LiteLLM Proxy Server
Here's how to call an LM Studio model with the LiteLLM Proxy Server
1. Modify the config.yaml

View file

@ -238,6 +238,76 @@ Ollama supported models: https://github.com/ollama/ollama
| Nous-Hermes 13B | `completion(model='ollama/nous-hermes:13b', messages, api_base="http://localhost:11434", stream=True)` |
| Wizard Vicuna Uncensored | `completion(model='ollama/wizard-vicuna', messages, api_base="http://localhost:11434", stream=True)` |
### JSON Schema support
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
response = completion(
model="ollama_chat/deepseek-r1",
messages=[{ "content": "respond in 20 words. who are you?","role": "user"}],
response_format={"type": "json_schema", "json_schema": {"schema": {"type": "object", "properties": {"name": {"type": "string"}}}}},
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: "deepseek-r1"
litellm_params:
model: "ollama_chat/deepseek-r1"
api_base: "http://localhost:11434"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING ON http://0.0.0.0:4000
```
3. Test it!
```python
from pydantic import BaseModel
from openai import OpenAI
client = OpenAI(
api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set)
base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL
)
class Step(BaseModel):
explanation: str
output: str
class MathReasoning(BaseModel):
steps: list[Step]
final_answer: str
completion = client.beta.chat.completions.parse(
model="deepseek-r1",
messages=[
{"role": "system", "content": "You are a helpful math tutor. Guide the user through the solution step by step."},
{"role": "user", "content": "how can I solve 8x + 7 = -23"}
],
response_format=MathReasoning,
)
math_reasoning = completion.choices[0].message.parsed
```
</TabItem>
</Tabs>
## Ollama Vision Models
| Model Name | Function Call |
|------------------|--------------------------------------|

View file

@ -64,71 +64,7 @@ All models listed here https://docs.perplexity.ai/docs/model-cards are supported
## Return citations
Perplexity supports returning citations via `return_citations=True`. [Perplexity Docs](https://docs.perplexity.ai/reference/post_chat_completions). Note: Perplexity has this feature in **closed beta**, so you need them to grant you access to get citations from their API.
If perplexity returns citations, LiteLLM will pass them straight through.
:::info
For passing more provider-specific, [go here](../completion/provider_specific_params.md)
For more information about passing provider-specific parameters, [go here](../completion/provider_specific_params.md)
:::
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ['PERPLEXITYAI_API_KEY'] = ""
response = completion(
model="perplexity/mistral-7b-instruct",
messages=messages,
return_citations=True
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add perplexity to config.yaml
```yaml
model_list:
- model_name: "perplexity-model"
litellm_params:
model: "llama-3.1-sonar-small-128k-online"
api_key: os.environ/PERPLEXITY_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "perplexity-model",
"messages": [
{
"role": "user",
"content": "Who won the world cup in 2022?"
}
],
"return_citations": true
}'
```
[**Call w/ OpenAI SDK, Langchain, Instructor, etc.**](../proxy/user_keys.md#chatcompletions)
</TabItem>
</Tabs>

View file

@ -2,11 +2,11 @@ import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Sambanova
https://community.sambanova.ai/t/create-chat-completion-api/
https://cloud.sambanova.ai/
:::tip
**We support ALL Sambanova models, just set `model=sambanova/<any-model-on-sambanova>` as a prefix when sending litellm requests. For the complete supported model list, visit https://sambanova.ai/technology/models **
**We support ALL Sambanova models, just set `model=sambanova/<any-model-on-sambanova>` as a prefix when sending litellm requests. For the complete supported model list, visit https://docs.sambanova.ai/cloud/docs/get-started/supported-models **
:::
@ -27,12 +27,11 @@ response = completion(
messages=[
{
"role": "user",
"content": "What do you know about sambanova.ai",
"content": "What do you know about sambanova.ai. Give your response in json format",
}
],
max_tokens=10,
response_format={ "type": "json_object" },
seed=123,
stop=["\n\n"],
temperature=0.2,
top_p=0.9,
@ -54,13 +53,12 @@ response = completion(
messages=[
{
"role": "user",
"content": "What do you know about sambanova.ai",
"content": "What do you know about sambanova.ai. Give your response in json format",
}
],
stream=True,
max_tokens=10,
response_format={ "type": "json_object" },
seed=123,
stop=["\n\n"],
temperature=0.2,
top_p=0.9,

View file

@ -0,0 +1,27 @@
# Topaz
| Property | Details |
|-------|-------|
| Description | Professional-grade photo and video editing powered by AI. |
| Provider Route on LiteLLM | `topaz/` |
| Provider Doc | [Topaz ↗](https://www.topazlabs.com/enhance-api) |
| API Endpoint for Provider | https://api.topazlabs.com |
| Supported OpenAI Endpoints | `/image/variations` |
## Quick Start
```python
from litellm import image_variation
import os
os.environ["TOPAZ_API_KEY"] = ""
image_url = "https://example.com/your-image.jpg"  # placeholder - any accessible image URL

response = image_variation(
    model="topaz/Standard V2", image=image_url
)
```
## Supported OpenAI Params
- `response_format`
- `size` (widthxheight)
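For example, a minimal sketch passing both params through `image_variation` (the URL, `size`, and `response_format` values here are placeholders; check the Topaz API docs for the accepted options):
```python
from litellm import image_variation
import os

os.environ["TOPAZ_API_KEY"] = ""

# hypothetical values - adjust to what the Topaz API accepts
response = image_variation(
    model="topaz/Standard V2",
    image="https://example.com/your-image.jpg",  # placeholder image URL
    response_format="b64_json",                  # or "url"
    size="1024x1024",                            # widthxheight
)
```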

View file

@ -405,13 +405,15 @@ If this was your initial VertexAI Grounding code,
```python
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig, Tool, grounding
vertexai.init(project=project_id, location="us-central1")
model = GenerativeModel("gemini-1.5-flash-001")
# Use Google Search for grounding
tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval(disable_attributon=False))
tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval())
prompt = "When is the next total solar eclipse in US?"
response = model.generate_content(
@ -852,6 +854,7 @@ litellm.vertex_location = "us-central1 # Your Location
| claude-3-5-sonnet@20240620 | `completion('vertex_ai/claude-3-5-sonnet@20240620', messages)` |
| claude-3-sonnet@20240229 | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
| claude-3-haiku@20240307 | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |
| claude-3-7-sonnet@20250219 | `completion('vertex_ai/claude-3-7-sonnet@20250219', messages)` |
### Usage
@ -926,6 +929,119 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</Tabs>
### Usage - `thinking` / `reasoning_content`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
resp = completion(
model="vertex_ai/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
- model_name: claude-3-7-sonnet-20250219
litellm_params:
model: vertex_ai/claude-3-7-sonnet-20250219
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "claude-3-7-sonnet-20250219",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='claude-3-7-sonnet-20250219',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
provider_specific_fields={
'citations': None,
'thinking_blocks': [
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6...'
}
]
}
),
thinking_blocks=[
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6AGB...'
}
],
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
## Llama 3 API
| Model Name | Function Call |
@ -1572,6 +1688,14 @@ assert isinstance(
Pass any file supported by Vertex AI through LiteLLM.
LiteLLM supports the following file inputs:
```
Images with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4
Base64 Encoded Local Images
```
<Tabs>
<TabItem value="sdk" label="SDK">

View file

@ -157,6 +157,98 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
</TabItem>
</Tabs>
## Send Video URL to VLLM
Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)
There are two ways to send a video URL to VLLM:
1. Pass the video url directly
```
{"type": "video_url", "video_url": {"url": video_url}},
```
2. Pass the video data as base64
```
{"type": "video_url", "video_url": {"url": f"data:video/mp4;base64,{video_data_base64}"}}
```
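A minimal sketch of building the base64 payload for the second approach (assuming a local `video.mp4`; the content block mirrors the snippet above):
```python
import base64

# read a local file and base64-encode it (hypothetical path)
with open("video.mp4", "rb") as f:
    video_data_base64 = base64.b64encode(f.read()).decode("utf-8")

video_content = {
    "type": "video_url",
    "video_url": {"url": f"data:video/mp4;base64,{video_data_base64}"},
}
```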
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
response = completion(
model="hosted_vllm/qwen", # pass the vllm model name
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Summarize the following video"
},
{
"type": "video_url",
"video_url": {
"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
}
}
]
}
],
api_base="https://hosted-vllm-api.co")
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: my-model
litellm_params:
model: hosted_vllm/qwen # add hosted_vllm/ prefix to route as OpenAI provider
api_base: https://hosted-vllm-api.co # add api base for OpenAI compatible provider
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl -X POST http://0.0.0.0:4000/chat/completions \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "my-model",
"messages": [
{"role": "user", "content":
[
{"type": "text", "text": "Summarize the following video"},
{"type": "video_url", "video_url": {"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
]
}
]
}'
```
</TabItem>
</Tabs>
## (Deprecated) for `vllm pip package`
### Using - `litellm.completion`

View file

@ -14,7 +14,7 @@ import os
os.environ['VOYAGE_API_KEY'] = ""
response = embedding(
model="voyage/voyage-01",
model="voyage/voyage-3-large",
input=["good morning from litellm"],
)
print(response)
@ -23,13 +23,20 @@ print(response)
## Supported Models
All models listed here https://docs.voyageai.com/embeddings/#models-and-specifics are supported
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| voyage-2 | `embedding(model="voyage/voyage-2", input)` |
| voyage-large-2 | `embedding(model="voyage/voyage-large-2", input)` |
| voyage-law-2 | `embedding(model="voyage/voyage-law-2", input)` |
| voyage-code-2 | `embedding(model="voyage/voyage-code-2", input)` |
| Model Name | Function Call |
|-------------------------|------------------------------------------------------------|
| voyage-3-large | `embedding(model="voyage/voyage-3-large", input)` |
| voyage-3 | `embedding(model="voyage/voyage-3", input)` |
| voyage-3-lite | `embedding(model="voyage/voyage-3-lite", input)` |
| voyage-code-3 | `embedding(model="voyage/voyage-code-3", input)` |
| voyage-finance-2 | `embedding(model="voyage/voyage-finance-2", input)` |
| voyage-law-2 | `embedding(model="voyage/voyage-law-2", input)` |
| voyage-code-2 | `embedding(model="voyage/voyage-code-2", input)` |
| voyage-multilingual-2 | `embedding(model="voyage/voyage-multilingual-2", input)` |
| voyage-large-2-instruct | `embedding(model="voyage/voyage-large-2-instruct", input)` |
| voyage-large-2 | `embedding(model="voyage/voyage-large-2", input)` |
| voyage-2 | `embedding(model="voyage/voyage-2", input)` |
| voyage-lite-02-instruct | `embedding(model="voyage/voyage-lite-02-instruct", input)` |
| voyage-01 | `embedding(model="voyage/voyage-01", input)` |
| voyage-lite-01 | `embedding(model="voyage/voyage-lite-01", input)` |
| voyage-lite-01-instruct | `embedding(model="voyage/voyage-lite-01-instruct", input)` |

View file

@ -1,13 +1,13 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# XAI
# xAI
https://docs.x.ai/docs
:::tip
**We support ALL XAI models, just set `model=xai/<any-model-on-xai>` as a prefix when sending litellm requests**
**We support ALL xAI models, just set `model=xai/<any-model-on-xai>` as a prefix when sending litellm requests**
:::
@ -24,7 +24,7 @@ import os
os.environ['XAI_API_KEY'] = ""
response = completion(
model="xai/grok-beta",
model="xai/grok-2-latest",
messages=[
{
"role": "user",
@ -51,7 +51,7 @@ import os
os.environ['XAI_API_KEY'] = ""
response = completion(
model="xai/grok-beta",
model="xai/grok-2-latest",
messages=[
{
"role": "user",
@ -74,6 +74,35 @@ for chunk in response:
print(chunk)
```
## Sample Usage - Vision
```python
import os
from litellm import completion
os.environ["XAI_API_KEY"] = "your-api-key"
response = completion(
model="xai/grok-2-latest",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://science.nasa.gov/wp-content/uploads/2023/09/web-first-images-release.png",
"detail": "high",
},
},
{
"type": "text",
"text": "What's in this image?",
},
],
},
],
)
```
## Usage with LiteLLM Proxy Server

View file

@ -36,7 +36,7 @@ import TabItem from '@theme/TabItem';
- Virtual Key Rate Limit
- User Rate Limit
- Team Limit
- The `_PROXY_track_cost_callback` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172)
- The `_ProxyDBLogger` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172)
## Frequently Asked Questions

View file

@ -1,154 +0,0 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Logging GCS, s3 Buckets
LiteLLM Supports Logging to the following Cloud Buckets
- (Enterprise) ✨ [Google Cloud Storage Buckets](#logging-proxy-inputoutput-to-google-cloud-storage-buckets)
- (Free OSS) [Amazon s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
## Google Cloud Storage Buckets
Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en)
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
| Property | Details |
|----------|---------|
| Description | Log LLM Input/Output to cloud storage buckets |
| Load Test Benchmarks | [Benchmarks](https://docs.litellm.ai/docs/benchmarks) |
| Google Docs on Cloud Storage | [Google Cloud Storage](https://cloud.google.com/storage?hl=en) |
### Usage
1. Add `gcs_bucket` to LiteLLM Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
callbacks: ["gcs_bucket"] # 👈 KEY CHANGE # 👈 KEY CHANGE
```
2. Set required env variables
```shell
GCS_BUCKET_NAME="<your-gcs-bucket-name>"
GCS_PATH_SERVICE_ACCOUNT="/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
3. Start Proxy
```
litellm --config /path/to/config.yaml
```
4. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
### Expected Logs on GCS Buckets
<Image img={require('../../img/gcs_bucket.png')} />
### Fields Logged on GCS Buckets
[**The standard logging object is logged on GCS Bucket**](../proxy/logging)
### Getting `service_account.json` from Google Cloud Console
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Search for IAM & Admin
3. Click on Service Accounts
4. Select a Service Account
5. Click on 'Keys' -> Add Key -> Create New Key -> JSON
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
## s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to the s3 bucket
**Step 1** Set AWS Credentials in .env
```shell
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is the AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
Your logs should be available on the specified s3 Bucket

View file

@ -2,7 +2,6 @@ import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Caching
Cache LLM Responses
:::note
@ -10,14 +9,19 @@ For OpenAI/Anthropic Prompt Caching, go [here](../completion/prompt_caching.md)
:::
LiteLLM supports:
Cache LLM Responses. LiteLLM's caching system stores and reuses LLM responses to save costs and reduce latency. When you make the same request twice, the cached response is returned instead of calling the LLM API again.
### Supported Caches
- In Memory Cache
- Redis Cache
- Qdrant Semantic Cache
- Redis Semantic Cache
- s3 Bucket Cache
## Quick Start - Redis, s3 Cache, Semantic Cache
## Quick Start
<Tabs>
<TabItem value="redis" label="redis cache">
@ -369,9 +373,9 @@ $ litellm --config /path/to/config.yaml
</Tabs>
## Usage
## Using Caching - /chat/completions
### Basic
<Tabs>
<TabItem value="chat_completions" label="/chat/completions">
@ -416,6 +420,239 @@ curl --location 'http://0.0.0.0:4000/embeddings' \
</TabItem>
</Tabs>
### Dynamic Cache Controls
| Parameter | Type | Description |
|-----------|------|-------------|
| `ttl` | *Optional(int)* | Will cache the response for the user-defined amount of time (in seconds) |
| `s-maxage` | *Optional(int)* | Will only accept cached responses that are within user-defined range (in seconds) |
| `no-cache` | *Optional(bool)* | Will not return a cached response; the actual endpoint is called instead. |
| `no-store` | *Optional(bool)* | Will not store the response in cache. |
| `namespace` | *Optional(str)* | Will cache the response under a user-defined namespace |
Each cache parameter can be controlled on a per-request basis. Here are examples for each parameter:
### `ttl`
Set how long (in seconds) to cache a response.
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
from openai import OpenAI
client = OpenAI(
api_key="your-api-key",
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": "Hello"}],
model="gpt-3.5-turbo",
extra_body={
"cache": {
"ttl": 300 # Cache response for 5 minutes
}
}
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"ttl": 300},
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
</Tabs>
### `s-maxage`
Only accept cached responses that are within the specified age (in seconds).
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
from openai import OpenAI
client = OpenAI(
api_key="your-api-key",
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": "Hello"}],
model="gpt-3.5-turbo",
extra_body={
"cache": {
"s-maxage": 600 # Only use cache if less than 10 minutes old
}
}
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"s-maxage": 600},
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
</Tabs>
### `no-cache`
Force a fresh response, bypassing the cache.
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
from openai import OpenAI
client = OpenAI(
api_key="your-api-key",
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": "Hello"}],
model="gpt-3.5-turbo",
extra_body={
"cache": {
"no-cache": True # Skip cache check, get fresh response
}
}
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"no-cache": true},
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
</Tabs>
### `no-store`
Will not store the response in cache.
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
from openai import OpenAI
client = OpenAI(
api_key="your-api-key",
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": "Hello"}],
model="gpt-3.5-turbo",
extra_body={
"cache": {
"no-store": True # Don't cache this response
}
}
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"no-store": true},
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
</Tabs>
### `namespace`
Store the response under a specific cache namespace.
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
from openai import OpenAI
client = OpenAI(
api_key="your-api-key",
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": "Hello"}],
model="gpt-3.5-turbo",
extra_body={
"cache": {
"namespace": "my-custom-namespace" # Store in custom namespace
}
}
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"namespace": "my-custom-namespace"},
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
</Tabs>
## Set cache for proxy, but not on the actual llm api call
Use this if you just want to enable features like rate limiting and load balancing across multiple instances.
@ -501,253 +738,6 @@ litellm_settings:
# /chat/completions, /completions, /embeddings, /audio/transcriptions
```
### **Turn on / off caching per request**
The proxy supports 4 cache-controls:
- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds).
- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint.
- `no-store`: *Optional(bool)* Will not cache the response.
[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)
**Turn off caching**
Set `no-cache=True`; this will not return a cached response
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
import os
from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
messages=[
{
"role": "user",
"content": "Say this is a test",
}
],
model="gpt-3.5-turbo",
extra_body = { # OpenAI python accepts extra args in extra_body
"cache": {
"no-cache": True # will not return a cached response
}
}
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"no-cache": True},
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
**Turn on caching**
By default cache is always on
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
import os
from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
messages=[
{
"role": "user",
"content": "Say this is a test",
}
],
model="gpt-3.5-turbo"
)
```
</TabItem>
<TabItem value="curl on" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
**Set `ttl`**
Set `ttl=600` to cache the response for 10 minutes (600 seconds)
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
import os
from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
messages=[
{
"role": "user",
"content": "Say this is a test",
}
],
model="gpt-3.5-turbo",
extra_body = { # OpenAI python accepts extra args in extra_body
"cache": {
"ttl": 600 # caches response for 10 minutes
}
}
)
```
</TabItem>
<TabItem value="curl on" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"ttl": 600},
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
**Set `s-maxage`**
Set `s-maxage=600` to only accept responses cached within the last 10 minutes
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
import os
from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
messages=[
{
"role": "user",
"content": "Say this is a test",
}
],
model="gpt-3.5-turbo",
extra_body = { # OpenAI python accepts extra args in extra_body
"cache": {
"s-maxage": 600 # only get responses cached within last 10 minutes
}
}
)
```
</TabItem>
<TabItem value="curl on" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"s-maxage": 600},
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
### Turn on / off caching per Key.
1. Add cache params when creating a key [full list](#turn-on--off-caching-per-key)
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"user_id": "222",
"metadata": {
"cache": {
"no-cache": true
}
}
}'
```
2. Test it!
```bash
curl -X POST 'http://localhost:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer <YOUR_NEW_KEY>' \
-d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "bom dia"}]}'
```
### Deleting Cache Keys - `/cache/delete`
In order to delete a cache key, send a request to `/cache/delete` with the `keys` you want to delete
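A hedged sketch of the request shape (the key hash is a placeholder; see the cache API reference for the exact payload):
```bash
curl -X POST "http://0.0.0.0:4000/cache/delete" \
  -H "Authorization: Bearer sk-1234" \
  -d '{"keys": ["<cache-key-hash>"]}'
```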

View file

@ -139,9 +139,6 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit
#### ASYNC ####
async def async_log_stream_event(self, kwargs, response_obj, start_time, end_time):
pass
async def async_log_pre_api_call(self, model, messages, kwargs):
pass

View file

@ -139,6 +139,7 @@ general_settings:
| disable_end_user_cost_tracking_prometheus_only | boolean | If true, turns off end user cost tracking on prometheus metrics only. |
| key_generation_settings | object | Restricts who can generate keys. [Further docs](./virtual_keys.md#restricting-key-generation) |
| disable_add_transform_inline_image_block | boolean | For Fireworks AI models - if true, turns off the auto-add of `#transform=inline` to the url of the image_url, if the model is not a vision model. |
| disable_hf_tokenizer_download | boolean | If true, it defaults to using the openai tokenizer for all models (including huggingface models). |
### general_settings - Reference
@ -177,6 +178,7 @@ general_settings:
| service_account_settings | List[Dict[str, Any]] | Set `service_account_settings` if you want to create settings that only apply to service account keys [Doc on service accounts](./service_accounts.md) |
| image_generation_model | str | The default model to use for image generation - ignores model set in request |
| store_model_in_db | boolean | If true, allows `/model/new` endpoint to store model information in db. Endpoint disabled by default. [Doc on `/model/new` endpoint](./model_management.md#create-a-new-model) |
| store_prompts_in_spend_logs | boolean | If true, allows prompts and responses to be stored in the spend logs table. |
| max_request_size_mb | int | The maximum size for requests in MB. Requests above this size will be rejected. |
| max_response_size_mb | int | The maximum size for responses in MB. LLM Responses above this size will not be sent. |
| proxy_budget_rescheduler_min_time | int | The minimum time (in seconds) to wait before checking db for budget resets. **Default is 597 seconds** |
@ -366,6 +368,8 @@ router_settings:
| GCS_PATH_SERVICE_ACCOUNT | Path to the Google Cloud service account JSON file
| GCS_FLUSH_INTERVAL | Flush interval for GCS logging (in seconds). Specify how often you want a log to be sent to GCS. **Default is 20 seconds**
| GCS_BATCH_SIZE | Batch size for GCS logging. Specify after how many logs you want to flush to GCS. If `BATCH_SIZE` is set to 10, logs are flushed every 10 logs. **Default is 2048**
| GCS_PUBSUB_TOPIC_ID | PubSub Topic ID to send LiteLLM SpendLogs to.
| GCS_PUBSUB_PROJECT_ID | PubSub Project ID to send LiteLLM SpendLogs to.
| GENERIC_AUTHORIZATION_ENDPOINT | Authorization endpoint for generic OAuth providers
| GENERIC_CLIENT_ID | Client ID for generic OAuth providers
| GENERIC_CLIENT_SECRET | Client secret for generic OAuth providers
@ -462,6 +466,9 @@ router_settings:
| OTEL_SERVICE_NAME | Service name identifier for OpenTelemetry
| OTEL_TRACER_NAME | Tracer name for OpenTelemetry tracing
| PAGERDUTY_API_KEY | API key for PagerDuty Alerting
| PHOENIX_API_KEY | API key for Arize Phoenix
| PHOENIX_COLLECTOR_ENDPOINT | API endpoint for Arize Phoenix
| PHOENIX_COLLECTOR_HTTP_ENDPOINT | API http endpoint for Arize Phoenix
| POD_NAME | Pod name for the server, this will be [emitted to `datadog` logs](https://docs.litellm.ai/docs/proxy/logging#datadog) as `POD_NAME`
| PREDIBASE_API_BASE | Base URL for Predibase API
| PRESIDIO_ANALYZER_API_BASE | Base URL for Presidio Analyzer service
@ -484,12 +491,12 @@ router_settings:
| SLACK_DAILY_REPORT_FREQUENCY | Frequency of daily Slack reports (e.g., daily, weekly)
| SLACK_WEBHOOK_URL | Webhook URL for Slack integration
| SMTP_HOST | Hostname for the SMTP server
| SMTP_PASSWORD | Password for SMTP authentication
| SMTP_PASSWORD | Password for SMTP authentication (do not set if SMTP does not require auth)
| SMTP_PORT | Port number for SMTP server
| SMTP_SENDER_EMAIL | Email address used as the sender in SMTP transactions
| SMTP_SENDER_LOGO | Logo used in emails sent via SMTP
| SMTP_TLS | Flag to enable or disable TLS for SMTP connections
| SMTP_USERNAME | Username for SMTP authentication
| SMTP_USERNAME | Username for SMTP authentication (do not set if SMTP does not require auth)
| SPEND_LOGS_URL | URL for retrieving spend logs
| SSL_CERTIFICATE | Path to the SSL certificate file
| SSL_VERIFY | Flag to enable or disable SSL certificate verification

View file

@ -0,0 +1,48 @@
# Custom Auth
You can now override the default api key auth.
Here's how:
#### 1. Create a custom auth file.
Make sure the response type follows the `UserAPIKeyAuth` pydantic object. This is used for logging usage specific to that user key.
```python
from fastapi import Request

from litellm.proxy._types import UserAPIKeyAuth


async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
    try:
        modified_master_key = "sk-my-master-key"
        if api_key == modified_master_key:
            return UserAPIKeyAuth(api_key=api_key)
        raise Exception("Invalid API key")  # reject anything that isn't the expected key
    except Exception:
        raise Exception("Authentication failed")
```
#### 2. Pass the filepath (relative to the config.yaml)
Pass the filepath to the config.yaml
e.g. if they're both in the same dir - `./config.yaml` and `./custom_auth.py`, this is what it looks like:
```yaml
model_list:
- model_name: "openai-model"
litellm_params:
model: "gpt-3.5-turbo"
litellm_settings:
drop_params: True
set_verbose: True
general_settings:
custom_auth: custom_auth.user_api_key_auth
```
[**Implementation Code**](https://github.com/BerriAI/litellm/blob/caf2a6b279ddbe89ebd1d8f4499f65715d684851/litellm/proxy/utils.py#L122)
#### 3. Start the proxy
```shell
$ litellm --config /path/to/config.yaml
```

View file

@ -46,18 +46,17 @@ You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/ma
| Table Name | Description | Row Insert Frequency |
|------------|-------------|---------------------|
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request** |
| LiteLLM_ErrorLogs | Captures failed requests and errors. Stores exception details and request information. Helps with debugging and monitoring. | **Medium - on errors only** |
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request - Success or Failure** |
| LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** |
## Disable `LiteLLM_SpendLogs` & `LiteLLM_ErrorLogs`
## Disable `LiteLLM_SpendLogs`
You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` in the `general_settings` section of your proxy_config.yaml file.
```yaml
general_settings:
disable_spend_logs: True # Disable writing spend logs to DB
disable_error_logs: True # Disable writing error logs to DB
disable_error_logs: True # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
```
### What is the impact of disabling these logs?

View file

@ -14,7 +14,7 @@ Features:
- **Security**
- ✅ [SSO for Admin UI](./ui.md#✨-enterprise-features)
- ✅ [Audit Logs with retention policy](#audit-logs)
- ✅ [JWT-Auth](../docs/proxy/token_auth.md)
- ✅ [JWT-Auth](./token_auth.md)
- ✅ [Control available public, private routes (Restrict certain endpoints on proxy)](#control-available-public-private-routes)
- ✅ [Control available public, private routes](#control-available-public-private-routes)
- ✅ [Secret Managers - AWS Key Manager, Google Secret Manager, Azure Key, Hashicorp Vault](../secret)
@ -24,6 +24,7 @@ Features:
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ [Set Max Request Size / File Size on Requests](#set-max-request--response-size-on-litellm-proxy)
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](#enforce-required-params-for-llm-requests)
- ✅ [Key Rotations](./virtual_keys.md#-key-rotations)
- **Customize Logging, Guardrails, Caching per project**
- ✅ [Team Based Logging](./team_logging.md) - Allow each team to use their own Langfuse Project / custom callbacks
- ✅ [Disable Logging for a Team](./team_logging.md#disable-logging-for-a-team) - Switch off all logging for a team/project (GDPR Compliance)
@ -39,8 +40,8 @@ Features:
- **Control Guardrails per API Key**
- **Custom Branding**
- ✅ [Custom Branding + Routes on Swagger Docs](#swagger-docs---custom-routes--branding)
- ✅ [Public Model Hub](../docs/proxy/enterprise.md#public-model-hub)
- ✅ [Custom Email Branding](../docs/proxy/email.md#customizing-email-branding)
- ✅ [Public Model Hub](#public-model-hub)
- ✅ [Custom Email Branding](./email.md#customizing-email-branding)
## Audit Logs

View file

@ -37,7 +37,7 @@ guardrails:
- guardrail_name: aim-protected-app
litellm_params:
guardrail: aim
mode: pre_call
mode: pre_call # 'during_call' is also available
api_key: os.environ/AIM_API_KEY
api_base: os.environ/AIM_API_BASE # Optional, use only when using a self-hosted Aim Outpost
```

View file

@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Quick Start
# Guardrails - Quick Start
Setup Prompt Injection Detection, PII Masking on LiteLLM Proxy (AI Gateway)
@ -121,6 +121,49 @@ curl -i http://localhost:4000/v1/chat/completions \
</Tabs>
## **Default On Guardrails**
Set `default_on: true` in your guardrail config to run the guardrail on every request. This is useful if you want to run a guardrail on every request without the user having to specify it.
**Note:** These will run even if user specifies a different guardrail or empty guardrails array.
```yaml
guardrails:
- guardrail_name: "aporia-pre-guard"
litellm_params:
guardrail: aporia
mode: "pre_call"
default_on: true
```
**Test Request**
In this request, the guardrail `aporia-pre-guard` will run on every request because `default_on: true` is set.
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-npnwjPQciVRok5yNZgKmFQ" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "hi my email is ishaan@berri.ai"}
]
}'
```
**Expected response**
Your response headers will include `x-litellm-applied-guardrails` with the guardrail applied
```
x-litellm-applied-guardrails: aporia-pre-guard
```
## **Using Guardrails Client Side**
### Test yourself **(OSS)**
@ -349,7 +392,7 @@ Monitor which guardrails were executed and whether they passed or failed. e.g. g
### ✨ Control Guardrails per Project (API Key)
### ✨ Control Guardrails per API Key
:::info
@ -357,7 +400,7 @@ Monitor which guardrails were executed and whether they passed or failed. e.g. g
:::
Use this to control what guardrails run per project. In this tutorial we only want the following guardrails to run for 1 project (API Key)
Use this to control what guardrails run per API Key. In this tutorial we only want the following guardrails to run for 1 API Key
- `guardrails`: ["aporia-pre-guard", "aporia-post-guard"]
**Step 1** Create Key with guardrail settings
@ -481,9 +524,10 @@ guardrails:
- guardrail_name: string # Required: Name of the guardrail
litellm_params: # Required: Configuration parameters
guardrail: string # Required: One of "aporia", "bedrock", "guardrails_ai", "lakera", "presidio", "hide-secrets"
mode: string # Required: One of "pre_call", "post_call", "during_call", "logging_only"
mode: Union[string, List[string]] # Required: One or more of "pre_call", "post_call", "during_call", "logging_only"
api_key: string # Required: API key for the guardrail service
api_base: string # Optional: Base URL for the guardrail service
default_on: boolean # Optional: Default False. When set to True, will run on every request, does not need client to specify guardrail in request
guardrail_info: # Optional[Dict]: Additional information about the guardrail
```
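Since `mode` accepts a list, one guardrail can run at multiple stages. A minimal sketch (the `my-guard` name is hypothetical; use any supported provider and its API key variable):
```yaml
guardrails:
  - guardrail_name: "my-guard"            # hypothetical name
    litellm_params:
      guardrail: lakera                   # any supported provider
      mode: ["pre_call", "post_call"]     # run before and after the LLM call
      api_key: os.environ/LAKERA_API_KEY  # assumed env var name for this provider
```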

View file

@ -314,6 +314,17 @@ Example Response:
"I'm alive!"
```
## `/health/services`
Use this admin-only endpoint to check if a connected service (datadog/slack/langfuse/etc.) is healthy.
```bash
curl -L -X GET 'http://0.0.0.0:4000/health/services?service=datadog' -H 'Authorization: Bearer sk-1234'
```
[**API Reference**](https://litellm-api.up.railway.app/#/health/health_services_endpoint_health_services_get)
## Advanced - Call specific models
To check health of specific models, here's how to call them:

View file

@ -0,0 +1,116 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Control Model Access with OIDC (Azure AD/Keycloak/etc.)
:::info
✨ JWT Auth is on LiteLLM Enterprise
[Enterprise Pricing](https://www.litellm.ai/#pricing)
[Get free 7-day trial key](https://www.litellm.ai/#trial)
:::
<Image img={require('../../img/control_model_access_jwt.png')} style={{ width: '100%', maxWidth: '4000px' }} />
## Example Token
<Tabs>
<TabItem value="Azure AD">
```bash
{
"sub": "1234567890",
"name": "John Doe",
"email": "john.doe@example.com",
"roles": ["basic_user"] # 👈 ROLE
}
```
</TabItem>
<TabItem value="Keycloak">
```bash
{
"sub": "1234567890",
"name": "John Doe",
"email": "john.doe@example.com",
"resource_access": {
"litellm-test-client-id": {
"roles": ["basic_user"] # 👈 ROLE
}
}
}
```
</TabItem>
</Tabs>
## Proxy Configuration
<Tabs>
<TabItem value="Azure AD">
```yaml
general_settings:
enable_jwt_auth: True
litellm_jwtauth:
user_roles_jwt_field: "roles" # the field in the JWT that contains the roles
user_allowed_roles: ["basic_user"] # roles that map to an 'internal_user' role on LiteLLM
enforce_rbac: true # if true, will check if the user has the correct role to access the model
role_permissions: # control what models are allowed for each role
- role: internal_user
models: ["anthropic-claude"]
model_list:
- model: anthropic-claude
litellm_params:
model: claude-3-5-haiku-20241022
- model: openai-gpt-4o
litellm_params:
model: gpt-4o
```
</TabItem>
<TabItem value="Keycloak">
```yaml
general_settings:
enable_jwt_auth: True
litellm_jwtauth:
user_roles_jwt_field: "resource_access.litellm-test-client-id.roles" # the field in the JWT that contains the roles
user_allowed_roles: ["basic_user"] # roles that map to an 'internal_user' role on LiteLLM
enforce_rbac: true # if true, will check if the user has the correct role to access the model
role_permissions: # control what models are allowed for each role
- role: internal_user
models: ["anthropic-claude"]
model_list:
- model: anthropic-claude
litellm_params:
model: claude-3-5-haiku-20241022
- model: openai-gpt-4o
litellm_params:
model: gpt-4o
```
</TabItem>
</Tabs>
## How it works
1. Specify JWT_PUBLIC_KEY_URL - This is the public keys endpoint of your OpenID provider. For Azure AD it's `https://login.microsoftonline.com/{tenant_id}/discovery/v2.0/keys`. For Keycloak it's `{keycloak_base_url}/realms/{your-realm}/protocol/openid-connect/certs` (see the example below).
2. Map JWT roles to LiteLLM roles - Done via `user_roles_jwt_field` and `user_allowed_roles`
    - Currently just `internal_user` is supported for role mapping.
3. Specify model access:
    - `role_permissions`: control what models are allowed for each role.
        - `role`: the LiteLLM role to control access for. Allowed roles = ["internal_user", "proxy_admin", "team"]
        - `models`: list of models that the role is allowed to access.
    - `model_list`: parent list of models on the proxy. [Learn more](./configs.md#llm-configs-model_list)
4. Model Checks: The proxy will run validation checks on the received JWT. [Code](https://github.com/BerriAI/litellm/blob/3a4f5b23b5025b87b6d969f2485cc9bc741f9ba6/litellm/proxy/auth/user_api_key_auth.py#L284)
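For example, a minimal sketch of setting the public keys endpoint as an environment variable before starting the proxy (tenant/realm values are placeholders):
```bash
# Azure AD - replace <tenant_id>
export JWT_PUBLIC_KEY_URL="https://login.microsoftonline.com/<tenant_id>/discovery/v2.0/keys"

# Keycloak - replace the base URL and realm
# export JWT_PUBLIC_KEY_URL="https://keycloak.example.com/realms/<your-realm>/protocol/openid-connect/certs"
```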

View file

@ -1,3 +1,7 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Logging
Log Proxy input, output, and exceptions using:
@ -13,9 +17,7 @@ Log Proxy input, output, and exceptions using:
- DynamoDB
- etc.
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
## Getting the LiteLLM Call ID
@ -77,10 +79,13 @@ litellm_settings:
### Redact Messages, Response Content
Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to your logging provider, but request metadata will still be logged.
Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to your logging provider, but request metadata (e.g. spend) will still be tracked.
<Tabs>
Example config.yaml
<TabItem value="global" label="Global">
**1. Setup config.yaml**
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -91,9 +96,87 @@ litellm_settings:
turn_off_message_logging: True # 👈 Key Change
```
If you have this feature turned on, you can override it for specific requests by
**2. Send request**
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
</TabItem>
<TabItem value="dynamic" label="Per Request">
:::info
Dynamic request message redaction is in BETA.
:::
Pass in a request header to enable message redaction for a request.
```
x-litellm-enable-message-redaction: true
```
Example config.yaml
**1. Setup config.yaml**
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
```
**2. Setup per request header**
```shell
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-zV5HlSIm8ihj1F9C_ZbB1g' \
-H 'x-litellm-enable-message-redaction: true' \
-d '{
"model": "gpt-3.5-turbo-testing",
"messages": [
{
"role": "user",
"content": "Hey, how'\''s it going 1234?"
}
]
}'
```
</TabItem>
</Tabs>
**3. Check Logging Tool + Spend Logs**
**Logging Tool**
<Image img={require('../../img/message_redaction_logging.png')}/>
**Spend Logs**
<Image img={require('../../img/message_redaction_spend_logs.png')} />
### Disable Message Redaction
If you have `litellm.turn_off_message_logging` turned on, you can override it for specific requests by
setting a request header `LiteLLM-Disable-Message-Redaction: true`.
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -109,13 +192,21 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
Removes any field with `user_api_key_*` from metadata.
### Turn off all tracking/logging
For some use cases, you may want to turn off all tracking/logging. You can do this by passing `no-log=True` in the request body.
:::info
Disable this by setting `global_disable_no_log_param:true` in your config.yaml file.
```yaml
litellm_settings:
global_disable_no_log_param: True
```
:::
<Tabs>
<TabItem value="Curl" label="Curl Request">
@ -1025,6 +1116,74 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
## Google Cloud Storage - PubSub Topic
Log LLM Logs/SpendLogs to [Google Cloud Storage PubSub Topic](https://cloud.google.com/pubsub/docs/reference/rest)
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
| Property | Details |
|----------|---------|
| Description | Log LiteLLM `SpendLogs Table` to Google Cloud Storage PubSub Topic |
When to use `gcs_pubsub`?
- If your LiteLLM Database has crossed 1M+ spend logs and you want to send `SpendLogs` to a PubSub Topic that can be consumed by GCS BigQuery
#### Usage
1. Add `gcs_pubsub` to LiteLLM Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://exampleopenaiendpoint-production.up.railway.app/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
callbacks: ["gcs_pubsub"] # 👈 KEY CHANGE # 👈 KEY CHANGE
```
2. Set required env variables
```shell
GCS_PUBSUB_TOPIC_ID="litellmDB"
GCS_PUBSUB_PROJECT_ID="reliableKeys"
```
3. Start Proxy
```
litellm --config /path/to/config.yaml
```
4. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
## s3 Buckets
We will use the `--config` to set
@ -1301,7 +1460,7 @@ LiteLLM supports customizing the following Datadog environment variables
## Lunary
### Step1: Install dependencies and set your environment variables
#### Step 1: Install dependencies and set your environment variables
Install the dependencies
```shell
pip install litellm lunary
Get your Lunary public key from https://app.lunary.ai/settings
export LUNARY_PUBLIC_KEY="<your-public-key>"
```
### Step 2: Create a `config.yaml` and set `lunary` callbacks
#### Step 2: Create a `config.yaml` and set `lunary` callbacks
```yaml
model_list:
@ -1324,12 +1483,12 @@ litellm_settings:
failure_callback: ["lunary"]
```
### Step 3: Start the LiteLLM proxy
#### Step 3: Start the LiteLLM proxy
```shell
litellm --config config.yaml
```
### Step 4: Make a request
#### Step 4: Make a request
```shell
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
@ -1352,14 +1511,14 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
## MLflow
### Step1: Install dependencies
#### Step 1: Install dependencies
Install the dependencies.
```shell
pip install litellm mlflow
```
### Step 2: Create a `config.yaml` with `mlflow` callback
#### Step 2: Create a `config.yaml` with `mlflow` callback
```yaml
model_list:
@ -1371,12 +1530,12 @@ litellm_settings:
failure_callback: ["mlflow"]
```
### Step 3: Start the LiteLLM proxy
#### Step 3: Start the LiteLLM proxy
```shell
litellm --config config.yaml
```
### Step 4: Make a request
#### Step 4: Make a request
```shell
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
@ -1392,7 +1551,7 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
}'
```
### Step 5: Review traces
#### Step 5: Review traces
Run the following command to start MLflow UI and review recorded traces.
@ -1427,9 +1586,6 @@ class MyCustomHandler(CustomLogger):
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
print(f"Post-API Call")
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Stream")
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print("On Success")

View file

@ -78,6 +78,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
| `api_base` | `Optional[str]` | Optional API base URL |
| `response_cost` | `Optional[str]` | Optional response cost |
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
## StandardLoggingModelInformation

View file

@ -0,0 +1,53 @@
# Rotating Master Key
Here are our recommended steps for rotating your master key.
**1. Backup your DB**
In case of any errors during the encryption/de-encryption process, this will allow you to revert back to current state without issues.
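For example, if the proxy uses Postgres, a plain dump is usually enough (assuming the connection string is in `DATABASE_URL`):
```bash
pg_dump "$DATABASE_URL" > litellm_backup_$(date +%F).sql
```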
**2. Call `/key/regenerate` with the new master key**
```bash
curl -L -X POST 'http://localhost:4000/key/regenerate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"key": "sk-1234",
"new_master_key": "sk-PIp1h0RekR"
}'
```
This will re-encrypt any models in your Proxy_ModelTable with the new master key.
Expect to start seeing decryption errors in logs, as your old master key is no longer able to decrypt the new values.
```bash
raise Exception("Unable to decrypt value={}".format(v))
Exception: Unable to decrypt value=<new-encrypted-value>
```
**3. Update LITELLM_MASTER_KEY**
In your environment variables, update the value of `LITELLM_MASTER_KEY` to the `new_master_key` from Step 2.
This ensures the key used for decryption from the DB is the new key.
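For example, if the key is set directly as an environment variable (it may instead live in your secret manager or deployment config):
```bash
export LITELLM_MASTER_KEY="sk-PIp1h0RekR"  # the new_master_key from Step 2
```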
**4. Test it**
Make a test request to a model stored on the proxy with a litellm key (new master key or virtual key) and see if it works.
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4o-mini", # 👈 REPLACE with 'public model name' for any db-model
"messages": [
{
"content": "Hey, how's it going",
"role": "user"
}
]
}'
```

View file

@ -344,3 +344,6 @@ curl -i http://localhost:4000/v1/chat/completions \
</TabItem>
</Tabs>
## [Role Based Access Control (RBAC)](./jwt_auth_arch)

View file

@ -107,9 +107,9 @@ general_settings:
By default, LiteLLM writes several types of logs to the database:
- Every LLM API request to the `LiteLLM_SpendLogs` table
- LLM Exceptions to the `LiteLLM_LogsErrors` table
- LLM Exceptions to the `LiteLLM_SpendLogs` table
If you're not viewing these logs on the LiteLLM UI (most users use Prometheus for monitoring), you can disable them by setting the following flags to `True`:
If you're not viewing these logs on the LiteLLM UI, you can disable them by setting the following flags to `True`:
```yaml
general_settings:

View file

@ -57,7 +57,7 @@ http://localhost:4000/metrics
# <proxy_base_url>/metrics
```
## Virtual Keys, Teams, Internal Users Metrics
## Virtual Keys, Teams, Internal Users
Use this for tracking per [user, key, team, etc.](virtual_keys)
@ -68,6 +68,42 @@ Use this for tracking per [user, key, team, etc.](virtual_keys)
| `litellm_input_tokens` | input tokens per `"end_user", "hashed_api_key", "api_key_alias", "requested_model", "team", "team_alias", "user", "model"` |
| `litellm_output_tokens` | output tokens per `"end_user", "hashed_api_key", "api_key_alias", "requested_model", "team", "team_alias", "user", "model"` |
### Team - Budget
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_team_max_budget_metric` | Max Budget for Team Labels: `"team_id", "team_alias"`|
| `litellm_remaining_team_budget_metric` | Remaining Budget for Team (A team created on LiteLLM) Labels: `"team_id", "team_alias"`|
| `litellm_team_budget_remaining_hours_metric` | Hours before the team budget is reset Labels: `"team_id", "team_alias"`|
### Virtual Key - Budget
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_api_key_max_budget_metric` | Max Budget for API Key Labels: `"hashed_api_key", "api_key_alias"`|
| `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM) Labels: `"hashed_api_key", "api_key_alias"`|
| `litellm_api_key_budget_remaining_hours_metric` | Hours before the API Key budget is reset Labels: `"hashed_api_key", "api_key_alias"`|
### Virtual Key - Rate Limit
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_remaining_api_key_requests_for_model` | Remaining Requests for a LiteLLM virtual API key, only if a model-specific rate limit (rpm) has been set for that virtual key. Labels: `"hashed_api_key", "api_key_alias", "model"`|
| `litellm_remaining_api_key_tokens_for_model` | Remaining Tokens for a LiteLLM virtual API key, only if a model-specific token limit (tpm) has been set for that virtual key. Labels: `"hashed_api_key", "api_key_alias", "model"`|
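To sanity-check that these budget and rate-limit gauges are being emitted, you can scrape the `/metrics` endpoint directly (a minimal sketch; assumes the proxy runs locally on port 4000):
```bash
# scrape the Prometheus endpoint and filter for the budget / rate-limit gauges listed above
curl -s http://localhost:4000/metrics | grep -E 'litellm_.*budget|litellm_remaining_api_key'
```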
### Initialize Budget Metrics on Startup
If you want to initialize the key/team budget metrics on startup, set `prometheus_initialize_budget_metrics` to `true` in the `config.yaml`:
```yaml
litellm_settings:
  callbacks: ["prometheus"]
  prometheus_initialize_budget_metrics: true
```
## Proxy Level Tracking Metrics
Use this to track overall LiteLLM Proxy usage.
@ -79,12 +115,11 @@ Use this to track overall LiteLLM Proxy usage.
| `litellm_proxy_failed_requests_metric` | Total number of failed responses from proxy - the client did not get a success response from litellm proxy. Labels: `"end_user", "hashed_api_key", "api_key_alias", "requested_model", "team", "team_alias", "user", "exception_status", "exception_class"` |
| `litellm_proxy_total_requests_metric` | Total number of requests made to the proxy server - track number of client side requests. Labels: `"end_user", "hashed_api_key", "api_key_alias", "requested_model", "team", "team_alias", "user", "status_code"` |
## LLM API / Provider Metrics
## LLM Provider Metrics
Use this for LLM API Error monitoring and tracking remaining rate limits and token limits
### Labels Tracked for LLM API Metrics
### Labels Tracked
| Label | Description |
|-------|-------------|
@ -100,7 +135,7 @@ Use this for LLM API Error monitoring and tracking remaining rate limits and tok
| exception_status | The status of the exception, if any |
| exception_class | The class of the exception, if any |
### Success and Failure Metrics for LLM API
### Success and Failure
| Metric Name | Description |
|----------------------|--------------------------------------|
@ -108,15 +143,14 @@ Use this for LLM API Error monitoring and tracking remaining rate limits and tok
| `litellm_deployment_failure_responses` | Total number of failed LLM API calls for a specific LLM deployment. Labels: `"requested_model", "litellm_model_name", "model_id", "api_base", "api_provider", "hashed_api_key", "api_key_alias", "team", "team_alias", "exception_status", "exception_class"` |
| `litellm_deployment_total_requests` | Total number of LLM API calls for deployment - success + failure. Labels: `"requested_model", "litellm_model_name", "model_id", "api_base", "api_provider", "hashed_api_key", "api_key_alias", "team", "team_alias"` |
### Remaining Requests and Tokens Metrics
### Remaining Requests and Tokens
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment. Labels: `"model_group", "api_provider", "api_base", "litellm_model_name", "hashed_api_key", "api_key_alias"` |
| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` returned from LLM API Deployment. Labels: `"model_group", "api_provider", "api_base", "litellm_model_name", "hashed_api_key", "api_key_alias"` |
### Deployment State Metrics
### Deployment State
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_deployment_state` | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. Labels: `"litellm_model_name", "model_id", "api_base", "api_provider"` |
@ -139,17 +173,6 @@ Use this for LLM API Error monitoring and tracking remaining rate limits and tok
| `litellm_llm_api_latency_metric` | Latency (seconds) for just the LLM API call - tracked for labels "model", "hashed_api_key", "api_key_alias", "team", "team_alias", "requested_model", "end_user", "user" |
| `litellm_llm_api_time_to_first_token_metric` | Time to first token for LLM API call - tracked for labels `model`, `hashed_api_key`, `api_key_alias`, `team`, `team_alias` [Note: only emitted for streaming requests] |
## Virtual Key - Budget, Rate Limit Metrics
Metrics used to track LiteLLM Proxy Budgeting and Rate limiting logic
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_remaining_team_budget_metric` | Remaining Budget for Team (A team created on LiteLLM) Labels: `"team_id", "team_alias"`|
| `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM) Labels: `"hashed_api_key", "api_key_alias"`|
| `litellm_remaining_api_key_requests_for_model` | Remaining Requests for a LiteLLM virtual API key, only if a model-specific rate limit (rpm) has been set for that virtual key. Labels: `"hashed_api_key", "api_key_alias", "model"`|
| `litellm_remaining_api_key_tokens_for_model` | Remaining Tokens for a LiteLLM virtual API key, only if a model-specific token limit (tpm) has been set for that virtual key. Labels: `"hashed_api_key", "api_key_alias", "model"`|
## [BETA] Custom Metrics
Track custom metrics on Prometheus for all the events mentioned above.
@ -200,7 +223,6 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
... "metadata_foo": "hello world" ...
```
## Monitor System Health
To monitor the health of LiteLLM-adjacent services (Redis / Postgres), do:

View file

@ -0,0 +1,40 @@
# [BETA] Public Teams
Expose available teams that your users can join on signup.
<iframe width="840" height="500" src="https://www.loom.com/embed/7871ea15035a48d2a118b7486c2f7598?sid=267cd0ab-d92b-42fa-b97a-9f385ef8930c" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
## Quick Start
1. Create a team on LiteLLM
```bash
curl -X POST '<PROXY_BASE_URL>/team/new' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer <MASTER_KEY>' \
-d '{"name": "My Team", "team_id": "team_id_1"}'
```
2. Expose the team to your users
```yaml
litellm_settings:
  default_internal_user_params:
    available_teams: ["team_id_1"] # 👈 Make team available to new SSO users
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/team/member_add' \
-H 'Authorization: Bearer sk-<USER_KEY>' \
-H 'Content-Type: application/json' \
--data-raw '{
"team_id": "team_id_1",
"member": [{"role": "user", "user_id": "my-test-user"}]
}'
```

View file

@ -0,0 +1,12 @@
# Release Cycle
LiteLLM Proxy has the following release cycle:
- `v1.x.x-nightly`: These are releases which pass ci/cd.
- `v1.x.x.rc`: These are releases which pass ci/cd + [manual review](https://github.com/BerriAI/litellm/discussions/8495#discussioncomment-12180711).
- `v1.x.x` OR `v1.x.x-stable`: These are releases which pass ci/cd + manual review + 3 days of production testing.
In production, we recommend using the latest `v1.x.x` release.
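For example, if you deploy via Docker you would pin the image to a stable tag instead of `main-latest` (the exact tag name is an assumption; check the release notes linked below for the tag of the release you want):
```bash
docker pull ghcr.io/berriai/litellm:main-v1.x.x-stable   # replace v1.x.x with the release you've validated
```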
Follow our release notes [here](https://github.com/BerriAI/litellm/releases).

View file

@ -1007,7 +1007,34 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
}'
```
### Disable Fallbacks per key
### Disable Fallbacks (Per Request/Key)
<Tabs>
<TabItem value="request" label="Per Request">
You can disable fallbacks per request by setting `disable_fallbacks: true` in your request body.
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"messages": [
{
"role": "user",
"content": "List 5 important events in the XIX century"
}
],
"model": "gpt-3.5-turbo",
"disable_fallbacks": true # 👈 DISABLE FALLBACKS
}'
```
</TabItem>
<TabItem value="key" label="Per Key">
You can disable fallbacks per key by setting `disable_fallbacks: true` in your key metadata.
@ -1021,3 +1048,6 @@ curl -L -X POST 'http://0.0.0.0:4000/key/generate' \
}
}'
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,23 @@
# Request Headers
Special headers that are supported by LiteLLM.
## LiteLLM Headers
`x-litellm-timeout` Optional[float]: The timeout for the request in seconds.
`x-litellm-enable-message-redaction` Optional[bool]: Don't log the message content to logging integrations. Just track spend. [Learn More](./logging#redact-messages-response-content)
`x-litellm-tags`: Optional[str]: A comma separated list (e.g. `tag1,tag2,tag3`) of tags to use for [tag-based routing](./tag_routing) **OR** [spend-tracking](./enterprise.md#tracking-spend-for-custom-tags).
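A minimal sketch combining the LiteLLM headers above in one request (the endpoint, key, and tag values are illustrative):
```bash
curl -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer sk-1234' \
  -H 'x-litellm-timeout: 30' \
  -H 'x-litellm-tags: free,my-custom-tag' \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Hello"}]
  }'
```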
## Anthropic Headers
`anthropic-version` Optional[str]: The version of the Anthropic API to use.
`anthropic-beta` Optional[str]: The beta version of the Anthropic API to use.
## OpenAI Headers
`openai-organization` Optional[str]: The organization to use for the OpenAI API. (currently needs to be enabled via `general_settings::forward_openai_org_id: true`)

View file

@ -1,17 +1,20 @@
# Rate Limit Headers
# Response Headers
When you make a request to the proxy, the proxy will return the following [OpenAI-compatible headers](https://platform.openai.com/docs/guides/rate-limits/rate-limits-in-headers):
When you make a request to the proxy, the proxy will return the following headers:
- `x-ratelimit-remaining-requests` - Optional[int]: The remaining number of requests that are permitted before exhausting the rate limit.
- `x-ratelimit-remaining-tokens` - Optional[int]: The remaining number of tokens that are permitted before exhausting the rate limit.
- `x-ratelimit-limit-requests` - Optional[int]: The maximum number of requests that are permitted before exhausting the rate limit.
- `x-ratelimit-limit-tokens` - Optional[int]: The maximum number of tokens that are permitted before exhausting the rate limit.
- `x-ratelimit-reset-requests` - Optional[int]: The time at which the rate limit will reset.
- `x-ratelimit-reset-tokens` - Optional[int]: The time at which the rate limit will reset.
## Rate Limit Headers
[OpenAI-compatible headers](https://platform.openai.com/docs/guides/rate-limits/rate-limits-in-headers):
These headers are useful for clients to understand the current rate limit status and adjust their request rate accordingly.
| Header | Type | Description |
|--------|------|-------------|
| `x-ratelimit-remaining-requests` | Optional[int] | The remaining number of requests that are permitted before exhausting the rate limit |
| `x-ratelimit-remaining-tokens` | Optional[int] | The remaining number of tokens that are permitted before exhausting the rate limit |
| `x-ratelimit-limit-requests` | Optional[int] | The maximum number of requests that are permitted before exhausting the rate limit |
| `x-ratelimit-limit-tokens` | Optional[int] | The maximum number of tokens that are permitted before exhausting the rate limit |
| `x-ratelimit-reset-requests` | Optional[int] | The time at which the rate limit will reset |
| `x-ratelimit-reset-tokens` | Optional[int] | The time at which the rate limit will reset |
## How are these headers calculated?
### How Rate Limit Headers work
**If key has rate limits set**
@ -19,6 +22,50 @@ The proxy will return the [remaining rate limits for that key](https://github.co
**If key does not have rate limits set**
The proxy returns the remaining requests/tokens returned by the backend provider.
The proxy returns the remaining requests/tokens returned by the backend provider. (LiteLLM will standardize the backend provider's response headers to match the OpenAI format)
If the backend provider does not return these headers, the value will be `None`.
These headers are useful for clients to understand the current rate limit status and adjust their request rate accordingly.
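To see these headers for yourself, you can dump just the response headers of a request (a minimal sketch; the key and model are placeholders):
```bash
# -D - prints response headers to stdout, -o /dev/null discards the body
curl -sS -D - -o /dev/null 'http://0.0.0.0:4000/v1/chat/completions' \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer sk-1234' \
  -d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hello"}]}' \
  | grep -iE 'x-ratelimit|x-litellm'
```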
## Latency Headers
| Header | Type | Description |
|--------|------|-------------|
| `x-litellm-response-duration-ms` | float | Total duration of the API response in milliseconds |
| `x-litellm-overhead-duration-ms` | float | LiteLLM processing overhead in milliseconds |
## Retry, Fallback Headers
| Header | Type | Description |
|--------|------|-------------|
| `x-litellm-attempted-retries` | int | Number of retry attempts made |
| `x-litellm-attempted-fallbacks` | int | Number of fallback attempts made |
| `x-litellm-max-fallbacks` | int | Maximum number of fallback attempts allowed |
## Cost Tracking Headers
| Header | Type | Description |
|--------|------|-------------|
| `x-litellm-response-cost` | float | Cost of the API call |
| `x-litellm-key-spend` | float | Total spend for the API key |
## LiteLLM Specific Headers
| Header | Type | Description |
|--------|------|-------------|
| `x-litellm-call-id` | string | Unique identifier for the API call |
| `x-litellm-model-id` | string | Unique identifier for the model used |
| `x-litellm-model-api-base` | string | Base URL of the API endpoint |
| `x-litellm-version` | string | Version of LiteLLM being used |
| `x-litellm-model-group` | string | Model group identifier |
## Response headers from LLM providers
LiteLLM also returns the original response headers from the LLM provider. These headers are prefixed with `llm_provider-` to distinguish them from LiteLLM's headers.
Example response headers:
```
llm_provider-openai-processing-ms: 256
llm_provider-openai-version: 2020-10-01
llm_provider-x-ratelimit-limit-requests: 30000
llm_provider-x-ratelimit-limit-tokens: 150000000
```

View file

@ -143,6 +143,26 @@ Response
}
```
## Calling via Request Header
You can also call via request header `x-litellm-tags`
```shell
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-H 'x-litellm-tags: free,my-custom-tag' \
-d '{
"model": "gpt-4",
"messages": [
{
"role": "user",
"content": "Hey, how'\''s it going 123456?"
}
]
}'
```
## Setting Default Tags
Use this if you want all untagged requests to be routed to specific deployments

View file

@ -166,7 +166,7 @@ response = client.chat.completions.create(
{"role": "user", "content": "what color is red"}
],
logit_bias={12481: 100},
timeout=1
extra_body={"timeout": 1} # 👈 KEY CHANGE
)
print(response)

View file

@ -1,9 +1,9 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# JWT-based Auth
# OIDC - JWT-based Auth
Use JWTs to auth admins / projects into the proxy.
Use JWTs to auth admins / users / projects into the proxy.
:::info
@ -156,33 +156,115 @@ scope: ["litellm-proxy-admin",...]
scope: "litellm-proxy-admin ..."
```
## Enforce Role-Based Access Control (RBAC)
## Control model access with Teams
Reject a JWT token if it's valid but doesn't have the required scopes / fields.
Only tokens with valid Admin (`admin_jwt_scope`), User (`user_id_jwt_field`), or Team (`team_id_jwt_field`) fields are allowed.
1. Specify the JWT field that contains the team ids, that the user belongs to.
```yaml
general_settings:
  enable_jwt_auth: True
  litellm_jwtauth:
    user_id_jwt_field: "sub"
    team_ids_jwt_field: "groups"
    user_id_upsert: true # add user_id to the db if they don't exist
    enforce_team_based_model_access: true # don't allow users to access models unless the team has access
```
This is assuming your token looks like this:
```
{
...,
"sub": "my-unique-user",
"groups": ["team_id_1", "team_id_2"]
}
```
2. Create the teams on LiteLLM
```bash
curl -X POST '<PROXY_BASE_URL>/team/new' \
-H 'Authorization: Bearer <PROXY_MASTER_KEY>' \
-H 'Content-Type: application/json' \
-d '{
"team_alias": "team_1",
"team_id": "team_id_1" # 👈 MUST BE THE SAME AS THE SSO GROUP ID
}'
```
3. Test the flow
SSO for UI: [**See Walkthrough**](https://www.loom.com/share/8959be458edf41fd85937452c29a33f3?sid=7ebd6d37-569a-4023-866e-e0cde67cb23e)
OIDC Auth for API: [**See Walkthrough**](https://www.loom.com/share/00fe2deab59a426183a46b1e2b522200?sid=4ed6d497-ead6-47f9-80c0-ca1c4b6b4814)
### Flow
- Validate if user id is in the DB (LiteLLM_UserTable)
- Validate if any of the groups are in the DB (LiteLLM_TeamTable)
- Validate if any group has model access
- If all checks pass, allow the request
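Once the teams exist, API access is a normal chat completion call with the OIDC JWT as the bearer token (a sketch; the token and model name are placeholders, and the model must be one the user's team has access to):
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer <your-oidc-jwt>' \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Hello"}]
  }'
```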
## Advanced - Custom Validate
Validate a JWT Token using custom logic, if you need an extra way to verify if tokens are valid for LiteLLM Proxy.
### 1. Setup custom validate function
```python
from typing import Literal


def my_custom_validate(token: str) -> Literal[True]:
    """
    Only allow tokens with tenant-id == "my-unique-tenant", and claims == ["proxy-admin"]
    """
    allowed_tenants = ["my-unique-tenant"]
    allowed_claims = ["proxy-admin"]

    if token["tenant_id"] not in allowed_tenants:
        raise Exception("Invalid JWT token")
    if any(claim not in allowed_claims for claim in token["claims"]):
        raise Exception("Invalid JWT token")
    return True
```
### 2. Setup config.yaml
```yaml
general_settings:
  master_key: sk-1234
  enable_jwt_auth: True
  litellm_jwtauth:
    admin_jwt_scope: "litellm_proxy_endpoints_access"
    admin_allowed_routes:
      - openai_routes
      - info_routes
    public_key_ttl: 600
    enforce_rbac: true # 👈 Enforce RBAC
    user_id_jwt_field: "sub"
    team_id_jwt_field: "tenant_id"
    user_id_upsert: True
    custom_validate: custom_validate.my_custom_validate # 👈 custom validate function
```
Expected Scope in JWT:
### 3. Test the flow
**Expected JWT**
```
{
"scope": "litellm_proxy_endpoints_access"
"sub": "my-unique-user",
"tenant_id": "INVALID_TENANT",
"claims": ["proxy-admin"]
}
```
**Expected Response**
```
{
"error": "Invalid JWT token"
}
```
## Advanced - Allowed Routes
Configure which routes a JWT can access via the config.
@ -288,3 +370,128 @@ general_settings:
user_allowed_email_domain: "my-co.com" # allows user@my-co.com to call proxy
user_id_upsert: true # 👈 upserts the user to db, if valid email but not in db
```
## [BETA] Control Access with OIDC Roles
Allow JWT tokens with supported roles to access the proxy.
Let users and teams access the proxy, without needing to add them to the DB.
Very important: set `enforce_rbac: true` to ensure that the RBAC system is enabled.
**Note:** This is in beta and might change unexpectedly.
```yaml
general_settings:
  enable_jwt_auth: True
  litellm_jwtauth:
    object_id_jwt_field: "oid" # can be either user / team, inferred from the role mapping
    roles_jwt_field: "roles"
    role_mappings:
      - role: litellm.api.consumer
        internal_role: "team"
    enforce_rbac: true # 👈 VERY IMPORTANT

  role_permissions: # default model + endpoint permissions for a role.
    - role: team
      models: ["anthropic-claude"]
      routes: ["/v1/chat/completions"]

environment_variables:
  JWT_AUDIENCE: "api://LiteLLM_Proxy" # ensures audience is validated
```
- `object_id_jwt_field`: The field in the JWT token that contains the object id. This id can be either a user id or a team id. Use this instead of `user_id_jwt_field` and `team_id_jwt_field` when the same field could contain either.
- `roles_jwt_field`: The field in the JWT token that contains the roles. This field is a list of roles that the user has. To index into a nested field, use dot notation - eg. `resource_access.litellm-test-client-id.roles`.
- `role_mappings`: A list of role mappings. Map the received role in the JWT token to an internal role on LiteLLM.
- `JWT_AUDIENCE`: The audience of the JWT token. This is used to validate the audience of the JWT token. Set via an environment variable.
### Example Token
```bash
{
"aud": "api://LiteLLM_Proxy",
"oid": "eec236bd-0135-4b28-9354-8fc4032d543e",
"roles": ["litellm.api.consumer"]
}
```
### Role Mapping Spec
- `role`: The expected role in the JWT token.
- `internal_role`: The internal role on LiteLLM that will be used to control access.
Supported internal roles:
- `team`: Team object will be used for RBAC spend tracking. Use this for tracking spend for a 'use case'.
- `internal_user`: User object will be used for RBAC spend tracking. Use this for tracking spend for an 'individual user'.
- `proxy_admin`: Proxy admin will be used for RBAC spend tracking. Use this for granting admin access to a token.
### [Architecture Diagram (Control Model Access)](./jwt_auth_arch)
## [BETA] Control Model Access with Scopes
Control which models a JWT can access. Set `enforce_scope_based_access: true` to enforce scope-based access control.
### 1. Setup config.yaml with scope mappings.
```yaml
model_list:
  - model_name: anthropic-claude
    litellm_params:
      model: anthropic/claude-3-5-sonnet
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: gpt-3.5-turbo-testing
    litellm_params:
      model: gpt-3.5-turbo
      api_key: os.environ/OPENAI_API_KEY

general_settings:
  enable_jwt_auth: True
  litellm_jwtauth:
    team_id_jwt_field: "client_id" # 👈 set the field in the JWT token that contains the team id
    team_id_upsert: true # 👈 upsert the team to db, if team id is not found in db
    scope_mappings:
      - scope: litellm.api.consumer
        models: ["anthropic-claude"]
      - scope: litellm.api.gpt_3_5_turbo
        models: ["gpt-3.5-turbo-testing"]
    enforce_scope_based_access: true # 👈 enforce scope-based access control
    enforce_rbac: true # 👈 enforces only a Team/User/ProxyAdmin can access the proxy.
```
#### Scope Mapping Spec
- `scope`: The scope to be used for the JWT token.
- `models`: The models that the JWT token can access. Value is the `model_name` in `model_list`. Note: Wildcard routes are not currently supported.
### 2. Create a JWT with the correct scopes.
Expected Token:
```bash
{
"scope": ["litellm.api.consumer", "litellm.api.gpt_3_5_turbo"] # can be a list or a space-separated string
}
```
### 3. Test the flow.
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer eyJhbGci...' \
-d '{
"model": "gpt-3.5-turbo-testing",
"messages": [
{
"role": "user",
"content": "Hey, how'\''s it going 1234?"
}
]
}'
```

View file

@ -6,11 +6,6 @@ import TabItem from '@theme/TabItem';
Create keys, track spend, add models without worrying about the config / CRUD endpoints.
:::info
This is in beta, so things may change. If you have feedback, [let us know](https://discord.com/invite/wuPM9dRgDw)
:::
<Image img={require('../../img/litellm_ui_create_key.png')} />

View file

@ -1,11 +1,11 @@
import Image from '@theme/IdealImage';
# User Management Heirarchy
# User Management Hierarchy
<Image img={require('../../img/litellm_user_heirarchy.png')} style={{ width: '100%', maxWidth: '4000px' }} />
LiteLLM supports a heirarchy of users, teams, organizations, and budgets.
LiteLLM supports a hierarchy of users, teams, organizations, and budgets.
- Organizations can have multiple teams. [API Reference](https://litellm-api.up.railway.app/#/organization%20management)
- Teams can have multiple users. [API Reference](https://litellm-api.up.railway.app/#/team%20management)

View file

@ -393,55 +393,6 @@ curl -L -X POST 'http://0.0.0.0:4000/key/unblock' \
```
### Custom Auth
You can now override the default api key auth.
Here's how:
#### 1. Create a custom auth file.
Make sure the response type follows the `UserAPIKeyAuth` pydantic object. This is used for logging usage specific to that user key.
```python
from fastapi import Request

from litellm.proxy._types import UserAPIKeyAuth


async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
    try:
        modified_master_key = "sk-my-master-key"
        if api_key == modified_master_key:
            return UserAPIKeyAuth(api_key=api_key)
        raise Exception("Invalid API key")
    except Exception:
        raise Exception("Authentication failed")
```
#### 2. Pass the filepath (relative to the config.yaml)
Pass the filepath to the config.yaml
e.g. if they're both in the same dir - `./config.yaml` and `./custom_auth.py`, this is what it looks like:
```yaml
model_list:
  - model_name: "openai-model"
    litellm_params:
      model: "gpt-3.5-turbo"

litellm_settings:
  drop_params: True
  set_verbose: True

general_settings:
  custom_auth: custom_auth.user_api_key_auth
```
[**Implementation Code**](https://github.com/BerriAI/litellm/blob/caf2a6b279ddbe89ebd1d8f4499f65715d684851/litellm/proxy/utils.py#L122)
#### 3. Start the proxy
```shell
$ litellm --config /path/to/config.yaml
```
### Custom /key/generate
If you need to add custom logic before generating a Proxy API Key (e.g. validating `team_id`):
@ -568,6 +519,61 @@ litellm_settings:
team_id: "core-infra"
```
### ✨ Key Rotations
:::info
This is an Enterprise feature.
[Enterprise Pricing](https://www.litellm.ai/#pricing)
[Get free 7-day trial key](https://www.litellm.ai/#trial)
:::
Rotate an existing API Key, while optionally updating its parameters.
```bash
curl 'http://localhost:4000/key/sk-1234/regenerate' \
-X POST \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"max_budget": 100,
"metadata": {
"team": "core-infra"
},
"models": [
"gpt-4",
"gpt-3.5-turbo"
]
}'
```
**Read More**
- [Write rotated keys to secrets manager](https://docs.litellm.ai/docs/secret#aws-secret-manager)
[**👉 API REFERENCE DOCS**](https://litellm-api.up.railway.app/#/key%20management/regenerate_key_fn_key__key__regenerate_post)
### Temporary Budget Increase
Use the `/key/update` endpoint to increase the budget of an existing key.
```bash
curl -L -X POST 'http://localhost:4000/key/update' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"key": "sk-b3Z3Lqdb_detHXSUp4ol4Q", "temp_budget_increase": 100, "temp_budget_expiry": "10d"}'
```
[API Reference](https://litellm-api.up.railway.app/#/key%20management/update_key_fn_key_update_post)
### Restricting Key Generation
Use this to control who can generate keys. Useful when letting others create keys on the UI.

View file

@ -0,0 +1,357 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 'Thinking' / 'Reasoning Content'
Supported Providers:
- Deepseek (`deepseek/`)
- Anthropic API (`anthropic/`)
- Bedrock (Anthropic + Deepseek) (`bedrock/`)
- Vertex AI (Anthropic) (`vertexai/`)
```python
"message": {
...
"reasoning_content": "The capital of France is Paris.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The capital of France is Paris.",
"signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..."
}
]
}
```
## Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["ANTHROPIC_API_KEY"] = ""
response = completion(
model="anthropic/claude-3-7-sonnet-20250219",
messages=[
{"role": "user", "content": "What is the capital of France?"},
],
thinking={"type": "enabled", "budget_tokens": 1024} # 👈 REQUIRED FOR ANTHROPIC models (on `anthropic/`, `bedrock/`, `vertexai/`)
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "anthropic/claude-3-7-sonnet-20250219",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
**Expected Response**
```bash
{
"id": "3b66124d79a708e10c603496b363574c",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": " won the FIFA World Cup in 2022.",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1723323084,
"model": "deepseek/deepseek-chat",
"object": "chat.completion",
"system_fingerprint": "fp_7e0991cad4",
"usage": {
"completion_tokens": 12,
"prompt_tokens": 16,
"total_tokens": 28,
},
"service_tier": null
}
```
## Tool Calling with `thinking`
Here's how to use Anthropic `thinking` blocks with tool calling.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import json

import litellm

litellm._turn_on_debug()
litellm.modify_params = True

model = "anthropic/claude-3-7-sonnet-20250219"  # works across Anthropic, Bedrock, Vertex AI


def get_current_weather(location, unit="fahrenheit"):
    # toy implementation for this example
    return json.dumps({"location": location, "temperature": "72", "unit": unit})


# Step 1: send the conversation and available functions to the model
messages = [
    {
        "role": "user",
        "content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses",
    }
]
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state",
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["location"],
            },
        },
    }
]

response = litellm.completion(
    model=model,
    messages=messages,
    tools=tools,
    tool_choice="auto",  # auto is default, but we'll be explicit
    thinking={"type": "enabled", "budget_tokens": 1024},
)
print("Response\n", response)
response_message = response.choices[0].message
tool_calls = response_message.tool_calls

print("Expecting there to be 3 tool calls")
assert (
    len(tool_calls) > 0
)  # this has to call the function for SF, Tokyo and Paris

# Step 2: check if the model wanted to call a function
print(f"tool_calls: {tool_calls}")

if tool_calls:
    # Step 3: call the function
    # Note: the JSON response may not always be valid; be sure to handle errors
    available_functions = {
        "get_current_weather": get_current_weather,
    }  # only one function in this example, but you can have multiple
    messages.append(
        response_message
    )  # extend conversation with assistant's reply
    print("Response message\n", response_message)

    # Step 4: send the info for each function call and function response to the model
    for tool_call in tool_calls:
        function_name = tool_call.function.name
        if function_name not in available_functions:
            # the model called a function that does not exist in available_functions - skip it
            continue
        function_to_call = available_functions[function_name]
        function_args = json.loads(tool_call.function.arguments)
        function_response = function_to_call(
            location=function_args.get("location"),
            unit=function_args.get("unit"),
        )
        messages.append(
            {
                "tool_call_id": tool_call.id,
                "role": "tool",
                "name": function_name,
                "content": function_response,
            }
        )  # extend conversation with function response
    print(f"messages: {messages}")

    second_response = litellm.completion(
        model=model,
        messages=messages,
        seed=22,
        # tools=tools,
        drop_params=True,
        thinking={"type": "enabled", "budget_tokens": 1024},
    )  # get a new response from the model where it can see the function response
    print("second response\n", second_response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
  - model_name: claude-3-7-sonnet-thinking
    litellm_params:
      model: anthropic/claude-3-7-sonnet-20250219
      api_key: os.environ/ANTHROPIC_API_KEY
      thinking: {
        "type": "enabled",
        "budget_tokens": 1024
      }
```
2. Run proxy
```bash
litellm --config config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Make 1st call
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "claude-3-7-sonnet-thinking",
"messages": [
{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"},
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
],
"tool_choice": "auto"
}'
```
4. Make 2nd call with tool call results
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "claude-3-7-sonnet-thinking",
"messages": [
{
"role": "user",
"content": "What\'s the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"
},
{
"role": "assistant",
"content": "I\'ll check the current weather for these three cities for you:",
"tool_calls": [
{
"index": 2,
"function": {
"arguments": "{\"location\": \"San Francisco\"}",
"name": "get_current_weather"
},
"id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
"type": "function"
}
],
"function_call": null,
"reasoning_content": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c="
}
],
"provider_specific_fields": {
"reasoningContentBlocks": [
{
"reasoningText": {
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c=",
"text": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user."
}
}
]
}
},
{
"tool_call_id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
"role": "tool",
"name": "get_current_weather",
"content": "{\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": \"fahrenheit\"}"
}
]
}'
```
</TabItem>
</Tabs>
## Switching between Anthropic + Deepseek models
Set `drop_params=True` to drop the 'thinking' blocks when swapping from Anthropic to Deepseek models. Suggest improvements to this approach [here](https://github.com/BerriAI/litellm/discussions/8927).
```python
litellm.drop_params = True # 👈 EITHER GLOBALLY or per request
# or per request
## Anthropic
response = litellm.completion(
model="anthropic/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
drop_params=True,
)
## Deepseek
response = litellm.completion(
model="deepseek/deepseek-chat",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
drop_params=True,
)
```
## Spec
These fields can be accessed via `response.choices[0].message.reasoning_content` and `response.choices[0].message.thinking_blocks`.
- `reasoning_content` - str: The reasoning content from the model. Returned across all providers.
- `thinking_blocks` - Optional[List[Dict[str, str]]]: A list of thinking blocks from the model. Only returned for Anthropic models.
- `type` - str: The type of thinking block.
- `thinking` - str: The thinking from the model.
- `signature` - str: The signature delta from the model.
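For example, on the proxy you can pull both fields out of the response with `jq` (a minimal sketch; assumes the proxy is running with an Anthropic deployment and `jq` is installed):
```bash
curl -s http://0.0.0.0:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $LITELLM_KEY" \
  -d '{
    "model": "anthropic/claude-3-7-sonnet-20250219",
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
    "thinking": {"type": "enabled", "budget_tokens": 1024}
  }' | jq '.choices[0].message | {reasoning_content, thinking_blocks}'
```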

View file

@ -111,7 +111,7 @@ curl http://0.0.0.0:4000/rerank \
| Provider | Link to Usage |
|-------------|--------------------|
| Cohere | [Usage](#quick-start) |
| Cohere (v1 + v2 clients) | [Usage](#quick-start) |
| Together AI| [Usage](../docs/providers/togetherai) |
| Azure AI| [Usage](../docs/providers/azure_ai) |
| Jina AI| [Usage](../docs/providers/jina_ai) |

View file

@ -826,6 +826,65 @@ asyncio.run(router_acompletion())
## Basic Reliability
### Weighted Deployments
Set `weight` on a deployment to pick one deployment more often than others.
This works across **ALL** routing strategies.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import Router
model_list = [
{
"model_name": "o1",
"litellm_params": {
"model": "o1-preview",
"api_key": os.getenv("OPENAI_API_KEY"),
"weight": 1
},
},
{
"model_name": "o1",
"litellm_params": {
"model": "o1-preview",
"api_key": os.getenv("OPENAI_API_KEY"),
"weight": 2 # 👈 PICK THIS DEPLOYMENT 2x MORE OFTEN THAN o1-preview
},
},
]
router = Router(model_list=model_list, routing_strategy="cost-based-routing")
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
  - model_name: o1
    litellm_params:
      model: o1
      api_key: os.environ/OPENAI_API_KEY
      weight: 1
  - model_name: o1
    litellm_params:
      model: o1-preview
      api_key: os.environ/OPENAI_API_KEY
      weight: 2 # 👈 PICK THIS DEPLOYMENT 2x MORE OFTEN THAN o1-preview
```
</TabItem>
</Tabs>
### Max Parallel Requests (ASYNC)
Used in semaphore for async requests on router. Limit the max concurrent calls made to a deployment. Useful in high-traffic scenarios.
@ -893,8 +952,8 @@ router_settings:
```
Defaults:
- allowed_fails: 0
- cooldown_time: 60s
- allowed_fails: 3
- cooldown_time: 5s (`DEFAULT_COOLDOWN_TIME_SECONDS` in constants.py)
**Set Per Model**

View file

@ -96,6 +96,33 @@ litellm --config /path/to/config.yaml
```
### Using K/V pairs in 1 AWS Secret
You can read multiple keys from a single AWS Secret using the `primary_secret_name` parameter:
```yaml
general_settings:
  key_management_system: "aws_secret_manager"
  key_management_settings:
    hosted_keys: [
      "OPENAI_API_KEY_MODEL_1",
      "OPENAI_API_KEY_MODEL_2",
    ]
    primary_secret_name: "litellm_secrets" # 👈 Read multiple keys from one JSON secret
```
The `primary_secret_name` allows you to read multiple keys from a single AWS Secret as a JSON object. For example, the "litellm_secrets" secret would contain:
```json
{
  "OPENAI_API_KEY_MODEL_1": "sk-key1...",
  "OPENAI_API_KEY_MODEL_2": "sk-key2..."
}
```
This reduces the number of AWS Secrets you need to manage.
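For reference, a secret in this shape can be created with the AWS CLI (a sketch; assumes your CLI is configured with Secrets Manager permissions, and the key values are placeholders):
```bash
aws secretsmanager create-secret \
  --name litellm_secrets \
  --secret-string '{"OPENAI_API_KEY_MODEL_1": "sk-key1...", "OPENAI_API_KEY_MODEL_2": "sk-key2..."}'
```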
## Hashicorp Vault
@ -353,4 +380,7 @@ general_settings:
# Hosted Keys Settings
hosted_keys: ["litellm_master_key"] # OPTIONAL. Specify which env keys you stored on AWS
# K/V pairs in 1 AWS Secret Settings
primary_secret_name: "litellm_secrets" # OPTIONAL. Read multiple keys from one JSON secret on AWS Secret Manager
```

View file

@ -30,6 +30,7 @@ import os
# Set OpenAI API key
os.environ["OPENAI_API_KEY"] = "Your API Key"
os.environ["ANTHROPIC_API_KEY"] = "Your API Key"
os.environ["XAI_API_KEY"] = "Your API Key"
os.environ["REPLICATE_API_KEY"] = "Your API Key"
os.environ["TOGETHERAI_API_KEY"] = "Your API Key"
```

View file

@ -2,9 +2,9 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Use LiteLLM AI Gateway with Aporia Guardrails
# Aporia Guardrails with LiteLLM Gateway
In this tutorial we will use LiteLLM Proxy with Aporia to detect PII in requests and profanity in responses
In this tutorial we will use LiteLLM AI Gateway with Aporia to detect PII in requests and profanity in responses
## 1. Setup guardrails on Aporia

View file

@ -0,0 +1,103 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# OpenWeb UI with LiteLLM
This guide walks you through connecting OpenWeb UI to LiteLLM. Using LiteLLM with OpenWeb UI allows teams to
- Access 100+ LLMs on OpenWeb UI
- Track Spend / Usage, Set Budget Limits
- Send Request/Response Logs to logging destinations like langfuse, s3, gcs buckets, etc.
- Set access controls, e.g. control which models OpenWebUI can access.
## Quickstart
- Make sure to setup LiteLLM with the [LiteLLM Getting Started Guide](https://docs.litellm.ai/docs/proxy/docker_quick_start)
## 1. Start LiteLLM & OpenWebUI
- OpenWebUI starts running on [http://localhost:3000](http://localhost:3000)
- LiteLLM starts running on [http://localhost:4000](http://localhost:4000)
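If you run both via Docker, a minimal sketch looks like this (image tags, ports, and the mounted `config.yaml` are assumptions; follow the getting started guide above for the full setup):
```bash
# LiteLLM proxy on port 4000 (assumes a config.yaml in the current directory)
docker run -d -p 4000:4000 \
  -v $(pwd)/config.yaml:/app/config.yaml \
  -e LITELLM_MASTER_KEY=sk-1234 \
  ghcr.io/berriai/litellm:main-latest --config /app/config.yaml

# OpenWebUI on port 3000
docker run -d -p 3000:8080 ghcr.io/open-webui/open-webui:main
```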
## 2. Create a Virtual Key on LiteLLM
Virtual Keys are API Keys that allow you to authenticate to LiteLLM Proxy. We will create a Virtual Key that will allow OpenWebUI to access LiteLLM.
### 2.1 LiteLLM User Management Hierarchy
On LiteLLM, you can create Organizations, Teams, Users and Virtual Keys. For this tutorial, we will create a Team and a Virtual Key.
- `Organization` - An Organization is a group of Teams. (US Engineering, EU Developer Tools)
- `Team` - A Team is a group of Users. (OpenWeb UI Team, Data Science Team, etc.)
- `User` - A User is an individual user (employee, developer, e.g. `krrish@litellm.ai`)
- `Virtual Key` - A Virtual Key is an API Key that allows you to authenticate to LiteLLM Proxy. A Virtual Key is associated with a User or Team.
Once the Team is created, you can invite Users to the Team. You can read more about LiteLLM's User Management [here](https://docs.litellm.ai/docs/proxy/user_management_heirarchy).
### 2.2 Create a Team on LiteLLM
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new team.
<Image img={require('../../img/litellm_create_team.gif')} />
### 2.3 Create a Virtual Key on LiteLLM
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new virtual Key.
LiteLLM allows you to specify what models are available on OpenWeb UI (by specifying the models the key will have access to).
<Image img={require('../../img/create_key_in_team_oweb.gif')} />
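If you prefer the API over the UI, the same key can be created with `/key/generate` (a sketch; the master key, team id, and model names are placeholders):
```bash
curl -X POST 'http://localhost:4000/key/generate' \
  -H 'Authorization: Bearer sk-1234' \
  -H 'Content-Type: application/json' \
  -d '{
    "team_id": "my-openwebui-team",
    "models": ["gpt-4o", "claude-3-5-sonnet"]
  }'
```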
## 3. Connect OpenWeb UI to LiteLLM
On OpenWeb UI, navigate to Settings -> Connections and create a new connection to LiteLLM
Enter the following details:
- URL: `http://localhost:4000` (your litellm proxy base url)
- Key: `your-virtual-key` (the key you created in the previous step)
<Image img={require('../../img/litellm_setup_openweb.gif')} />
### 3.1 Test Request
In the top left corner, select a model. You should only see the models you gave the key access to in Step 2.
Once you have selected a model, enter your message content and click `Submit`.
<Image img={require('../../img/basic_litellm.gif')} />
### 3.2 Tracking Spend / Usage
After your request is made, navigate to `Logs` on the LiteLLM UI. You can see the Team, Key, Model, Usage and Cost.
<!-- <Image img={require('../../img/litellm_logs_openweb.gif')} /> -->
## Render `thinking` content on OpenWeb UI
OpenWebUI requires reasoning/thinking content to be rendered with `<think></think>` tags. In order to render this for specific models, you can use the `merge_reasoning_content_in_choices` litellm parameter.
Example litellm config.yaml:
```yaml
model_list:
  - model_name: thinking-anthropic-claude-3-7-sonnet
    litellm_params:
      model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
      thinking: {"type": "enabled", "budget_tokens": 1024}
      max_tokens: 1080
      merge_reasoning_content_in_choices: true
```
### Test it on OpenWeb UI
On the models dropdown select `thinking-anthropic-claude-3-7-sonnet`
<Image img={require('../../img/litellm_thinking_openweb.gif')} />

View file

@ -44,7 +44,7 @@ const config = {
path: './release_notes',
routeBasePath: 'release_notes',
blogTitle: 'Release Notes',
blogSidebarTitle: 'All Releases',
blogSidebarTitle: 'Releases',
blogSidebarCount: 'ALL',
postsPerPage: 'ALL',
showReadingTime: false,
