Compare commits

..

4 commits

Author SHA1 Message Date
Rajat Vig
320032deed
fix non root docker image (#6744)
Signed-off-by: Rajat Vig <rvig@etsy.com>
2024-11-14 08:16:18 -08:00
Ishaan Jaff
0892975434 run prisma generate as nobody 2024-11-12 11:10:54 -08:00
Ishaan Jaff
1c0ea3802f fix tmp dir non root 2024-11-07 20:08:26 -08:00
Ishaan Jaff
af9ea654f2 non root fixes 2024-11-07 19:42:12 -08:00
436 changed files with 9466 additions and 26108 deletions

View file

@ -625,90 +625,6 @@ jobs:
paths:
- llm_translation_coverage.xml
- llm_translation_coverage
pass_through_unit_testing:
docker:
- image: cimg/python:3.11
auth:
username: ${DOCKERHUB_USERNAME}
password: ${DOCKERHUB_PASSWORD}
working_directory: ~/project
steps:
- checkout
- run:
name: Install Dependencies
command: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
pip install "pytest-cov==5.0.0"
pip install "pytest-asyncio==0.21.1"
pip install "respx==0.21.1"
# Run pytest and generate JUnit XML report
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/pass_through_unit_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
- run:
name: Rename the coverage files
command: |
mv coverage.xml pass_through_unit_tests_coverage.xml
mv .coverage pass_through_unit_tests_coverage
# Store test results
- store_test_results:
path: test-results
- persist_to_workspace:
root: .
paths:
- pass_through_unit_tests_coverage.xml
- pass_through_unit_tests_coverage
image_gen_testing:
docker:
- image: cimg/python:3.11
auth:
username: ${DOCKERHUB_USERNAME}
password: ${DOCKERHUB_PASSWORD}
working_directory: ~/project
steps:
- checkout
- run:
name: Install Dependencies
command: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
pip install "pytest-cov==5.0.0"
pip install "pytest-asyncio==0.21.1"
pip install "respx==0.21.1"
# Run pytest and generate JUnit XML report
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/image_gen_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
- run:
name: Rename the coverage files
command: |
mv coverage.xml image_gen_coverage.xml
mv .coverage image_gen_coverage
# Store test results
- store_test_results:
path: test-results
- persist_to_workspace:
root: .
paths:
- image_gen_coverage.xml
- image_gen_coverage
logging_testing:
docker:
- image: cimg/python:3.11
@ -728,11 +644,7 @@ jobs:
pip install "pytest-retry==1.6.3"
pip install "pytest-cov==5.0.0"
pip install "pytest-asyncio==0.21.1"
pip install pytest-mock
pip install "respx==0.21.1"
pip install "google-generativeai==0.3.2"
pip install "google-cloud-aiplatform==1.43.0"
pip install "mlflow==2.17.2"
# Run pytest and generate JUnit XML report
- run:
name: Run tests
@ -807,14 +719,11 @@ jobs:
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- run: python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
- run: ruff check ./litellm
# - run: python ./tests/documentation_tests/test_general_setting_keys.py
- run: python ./tests/documentation_tests/test_general_setting_keys.py
- run: python ./tests/code_coverage_tests/router_code_coverage.py
- run: python ./tests/code_coverage_tests/test_router_strategy_async.py
- run: python ./tests/code_coverage_tests/litellm_logging_code_coverage.py
- run: python ./tests/documentation_tests/test_env_keys.py
- run: python ./tests/documentation_tests/test_router_settings.py
- run: python ./tests/documentation_tests/test_api_docs.py
- run: python ./tests/code_coverage_tests/ensure_async_clients_test.py
- run: helm lint ./deploy/charts/litellm-helm
db_migration_disable_update_check:
@ -966,7 +875,7 @@ jobs:
command: |
pwd
ls
python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation
no_output_timeout: 120m
# Store test results
@ -1053,7 +962,6 @@ jobs:
-e AWS_REGION_NAME=$AWS_REGION_NAME \
-e APORIA_API_KEY_1=$APORIA_API_KEY_1 \
-e COHERE_API_KEY=$COHERE_API_KEY \
-e GCS_FLUSH_INTERVAL="1" \
--name my-app \
-v $(pwd)/litellm/proxy/example_config_yaml/otel_test_config.yaml:/app/config.yaml \
-v $(pwd)/litellm/proxy/example_config_yaml/custom_guardrail.py:/app/custom_guardrail.py \
@ -1083,48 +991,6 @@ jobs:
ls
python -m pytest -vv tests/otel_tests -x --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
# Clean up first container
- run:
name: Stop and remove first container
command: |
docker stop my-app
docker rm my-app
# Second Docker Container Run with Different Config
# NOTE: We intentionally pass a "bad" license here. We need to ensure proxy starts and serves request even with bad license
- run:
name: Run Second Docker container
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e REDIS_HOST=$REDIS_HOST \
-e REDIS_PASSWORD=$REDIS_PASSWORD \
-e REDIS_PORT=$REDIS_PORT \
-e LITELLM_MASTER_KEY="sk-1234" \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
-e LITELLM_LICENSE="bad-license" \
--name my-app-3 \
-v $(pwd)/litellm/proxy/example_config_yaml/enterprise_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--detailed_debug
- run:
name: Start outputting logs for second container
command: docker logs -f my-app-2
background: true
- run:
name: Wait for second app to be ready
command: dockerize -wait http://localhost:4000 -timeout 5m
- run:
name: Run second round of tests
command: |
python -m pytest -vv tests/basic_proxy_startup_tests -x --junitxml=test-results/junit-2.xml --durations=5
no_output_timeout: 120m
# Store test results
- store_test_results:
@ -1179,8 +1045,6 @@ jobs:
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
pip install "google-cloud-aiplatform==1.59.0"
pip install anthropic
# Run pytest and generate JUnit XML report
- run:
name: Build Docker image
command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
@ -1192,8 +1056,6 @@ jobs:
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e LITELLM_MASTER_KEY="sk-1234" \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
-e GEMINI_API_KEY=$GEMINI_API_KEY \
-e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY \
-e LITELLM_LICENSE=$LITELLM_LICENSE \
--name my-app \
-v $(pwd)/litellm/proxy/example_config_yaml/pass_through_config.yaml:/app/config.yaml \
@ -1217,27 +1079,6 @@ jobs:
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 5m
# New steps to run Node.js test
- run:
name: Install Node.js
command: |
curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
sudo apt-get install -y nodejs
node --version
npm --version
- run:
name: Install Node.js dependencies
command: |
npm install @google-cloud/vertexai
npm install @google/generative-ai
npm install --save-dev jest
- run:
name: Run Vertex AI, Google AI Studio Node.js tests
command: |
npx jest tests/pass_through_tests --verbose
no_output_timeout: 30m
- run:
name: Run tests
command: |
@ -1245,6 +1086,7 @@ jobs:
ls
python -m pytest -vv tests/pass_through_tests/ -x --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
# Store test results
- store_test_results:
path: test-results
@ -1270,7 +1112,7 @@ jobs:
python -m venv venv
. venv/bin/activate
pip install coverage
coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage
coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage
coverage xml
- codecov/upload:
file: ./coverage.xml
@ -1376,7 +1218,6 @@ jobs:
name: Install Dependencies
command: |
npm install -D @playwright/test
npm install @google-cloud/vertexai
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
pip install "pytest-asyncio==0.21.1"
@ -1408,7 +1249,7 @@ jobs:
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL_2 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e LITELLM_MASTER_KEY="sk-1234" \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
-e UI_USERNAME="admin" \
@ -1438,7 +1279,7 @@ jobs:
- run:
name: Run Playwright Tests
command: |
npx playwright test e2e_ui_tests/ --reporter=html --output=test-results
npx playwright test --reporter=html --output=test-results
no_output_timeout: 120m
- store_test_results:
path: test-results
@ -1560,18 +1401,6 @@ workflows:
only:
- main
- /litellm_.*/
- pass_through_unit_testing:
filters:
branches:
only:
- main
- /litellm_.*/
- image_gen_testing:
filters:
branches:
only:
- main
- /litellm_.*/
- logging_testing:
filters:
branches:
@ -1581,8 +1410,6 @@ workflows:
- upload-coverage:
requires:
- llm_translation_testing
- pass_through_unit_testing
- image_gen_testing
- logging_testing
- litellm_router_testing
- caching_unit_tests
@ -1622,8 +1449,6 @@ workflows:
- load_testing
- test_bad_database_url
- llm_translation_testing
- pass_through_unit_testing
- image_gen_testing
- logging_testing
- litellm_router_testing
- caching_unit_tests

View file

@ -9,3 +9,4 @@ tests
.devcontainer
*.tgz
log.txt
docker/Dockerfile.*

View file

@ -113,7 +113,7 @@ for part in response:
## Logging Observability ([Docs](https://docs.litellm.ai/docs/observability/callbacks))
LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, DynamoDB, s3 Buckets, Helicone, Promptlayer, Traceloop, Athina, Slack, MLflow
LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, DynamoDB, s3 Buckets, Helicone, Promptlayer, Traceloop, Athina, Slack
```python
from litellm import completion
@ -305,36 +305,6 @@ Step 4: Submit a PR with your changes! 🚀
- push your fork to your GitHub repo
- submit a PR from there
### Building LiteLLM Docker Image
Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
Step 1: Clone the repo
```
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Build the Docker Image
Build using Dockerfile.non_root
```
docker build -f docker/Dockerfile.non_root -t litellm_test_image .
```
Step 3: Run the Docker Image
Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
```
docker run \
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
-e DATABASE_URL="postgresql://xxxxxxxx" \
-e LITELLM_MASTER_KEY="sk-1234" \
-p 4000:4000 \
litellm_test_image \
--config /app/config.yaml --detailed_debug
```
# Enterprise
For companies that need better security, user management and professional support

View file

@ -1,30 +0,0 @@
# This job runs the prisma migrations for the LiteLLM DB.
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "litellm.fullname" . }}-migrations
annotations:
argocd.argoproj.io/hook: PreSync
argocd.argoproj.io/hook-delete-policy: Never # keep this resource so we can debug status on ArgoCD
checksum/config: {{ toYaml .Values | sha256sum }}
spec:
template:
spec:
containers:
- name: prisma-migrations
image: ghcr.io/berriai/litellm-database:main-latest
command: ["python", "litellm/proxy/prisma_migration.py"]
workingDir: "/app"
env:
{{- if .Values.db.useExisting }}
- name: DATABASE_URL
value: {{ .Values.db.url | quote }}
{{- else }}
- name: DATABASE_URL
value: postgresql://{{ .Values.postgresql.auth.username }}:{{ .Values.postgresql.auth.password }}@{{ .Release.Name }}-postgresql/{{ .Values.postgresql.auth.database }}
{{- end }}
- name: DISABLE_SCHEMA_UPDATE
value: "false" # always run the migration from the Helm PreSync hook, override the value set
restartPolicy: OnFailure
backoffLimit: {{ .Values.migrationJob.backoffLimit }}

View file

@ -179,12 +179,3 @@ postgresql:
redis:
enabled: false
architecture: standalone
# Prisma migration job settings
migrationJob:
enabled: true # Enable or disable the schema migration Job
retries: 3 # Number of retries for the Job in case of failure
backoffLimit: 4 # Backoff limit for Job restarts
disableSchemaUpdate: false # Skip schema migrations for specific environments. When True, the job will exit with code 0.

View file

@ -9,13 +9,16 @@ FROM $LITELLM_BUILD_IMAGE AS builder
# Set the working directory to /app
WORKDIR /app
# Set the shell to bash
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Install build dependencies
RUN apt-get clean && apt-get update && \
apt-get install -y gcc python3-dev && \
rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade pip && \
pip install build
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir build
# Copy the current directory contents into the container at /app
COPY . .
@ -57,22 +60,35 @@ RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl
RUN pip install redisvl==0.0.7 --no-deps
# ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y
RUN pip install PyJWT==2.9.0 --no-cache-dir
RUN pip uninstall jwt -y && \
pip uninstall PyJWT -y && \
pip install PyJWT==2.9.0 --no-cache-dir
# Build Admin UI
RUN chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh
# Generate prisma client
ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
RUN mkdir -p /.cache
RUN chmod -R 777 /.cache
RUN pip install nodejs-bin
RUN pip install prisma
RUN prisma generate
### Prisma Handling for Non-Root #################################################
# Prisma allows you to specify the binary cache directory to use
ENV PRISMA_BINARY_CACHE_DIR=/nonexistent
# Set the TMPDIR environment variable, when this does not exist prisma raises "Error: ENOENT: no such file or directory, lstat '/var/folders'""
ENV TMPDIR=/tmp
RUN mkdir -p /tmp && chmod 1777 /tmp
# Make a /non-existent folder and assign chown to nobody
RUN mkdir -p /nonexistent && \
chown -R nobody:nogroup /nonexistent && \
chown -R nobody:nogroup /usr/local/lib/python3.11/site-packages/prisma/
RUN chmod +x docker/entrypoint.sh
# Run Prisma generate as user = nobody
USER nobody
RUN pip install --no-cache-dir nodejs-bin prisma
RUN prisma generate
### End of Prisma Handling for Non-Root #########################################
EXPOSE 4000/tcp
# # Set your entrypoint and command

View file

@ -18,4 +18,3 @@
npm-debug.log*
yarn-debug.log*
yarn-error.log*
yarn.lock

View file

@ -0,0 +1,54 @@
# [BETA] Anthropic `/v1/messages`
Call 100+ LLMs in the Anthropic format.
1. Setup config.yaml
```yaml
model_list:
- model_name: my-test-model
litellm_params:
model: gpt-3.5-turbo
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/v1/messages' \
-H 'x-api-key: sk-1234' \
-H 'content-type: application/json' \
-D '{
"model": "my-test-model",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
## Test with Anthropic SDK
```python
import os
from anthropic import Anthropic
client = Anthropic(api_key="sk-1234", base_url="http://0.0.0.0:4000") # 👈 CONNECT TO PROXY
message = client.messages.create(
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="my-test-model", # 👈 set 'model_name'
)
print(message.content)
```

View file

@ -1,41 +0,0 @@
# Benchmarks
Benchmarks for LiteLLM Gateway (Proxy Server)
Locust Settings:
- 2500 Users
- 100 user Ramp Up
## Basic Benchmarks
Overhead when using a Deployed Proxy vs Direct to LLM
- Latency overhead added by LiteLLM Proxy: 107ms
| Metric | Direct to Fake Endpoint | Basic Litellm Proxy |
|--------|------------------------|---------------------|
| RPS | 1196 | 1133.2 |
| Median Latency (ms) | 33 | 140 |
## Logging Callbacks
### [GCS Bucket Logging](https://docs.litellm.ai/docs/proxy/bucket)
Using GCS Bucket has **no impact on latency, RPS compared to Basic Litellm Proxy**
| Metric | Basic Litellm Proxy | LiteLLM Proxy with GCS Bucket Logging |
|--------|------------------------|---------------------|
| RPS | 1133.2 | 1137.3 |
| Median Latency (ms) | 140 | 138 |
### [LangSmith logging](https://docs.litellm.ai/docs/proxy/logging)
Using LangSmith has **no impact on latency, RPS compared to Basic Litellm Proxy**
| Metric | Basic Litellm Proxy | LiteLLM Proxy with LangSmith |
|--------|------------------------|---------------------|
| RPS | 1133.2 | 1135 |
| Median Latency (ms) | 140 | 132 |

View file

@ -41,7 +41,7 @@ Use `litellm.get_supported_openai_params()` for an updated list of params for ea
| Provider | temperature | max_completion_tokens | max_tokens | top_p | stream | stream_options | stop | n | presence_penalty | frequency_penalty | functions | function_call | logit_bias | user | response_format | seed | tools | tool_choice | logprobs | top_logprobs | extra_headers |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|Anthropic| ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | | | | | | |✅ | ✅ | | ✅ | ✅ | | | ✅ |
|Anthropic| ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | | | | | | |✅ | ✅ | | ✅ | ✅ | | | ✅ |
|OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | ✅ |
|Azure OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | | | ✅ |
|Replicate | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | |

View file

@ -75,9 +75,6 @@ Works for:
- Google AI Studio - Gemini models
- Vertex AI models (Gemini + Anthropic)
- Bedrock Models
- Anthropic API Models
- Groq Models
- Ollama Models
<Tabs>
<TabItem value="sdk" label="SDK">

View file

@ -93,7 +93,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
## Check Model Support
Call `litellm.get_model_info` to check if a model/provider supports `prefix`.
Call `litellm.get_model_info` to check if a model/provider supports `response_format`.
<Tabs>
<TabItem value="sdk" label="SDK">

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Embeddings
# Embedding Models
## Quick Start
```python

View file

@ -1,74 +0,0 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Calling Finetuned Models
## OpenAI
| Model Name | Function Call |
|---------------------------|-----------------------------------------------------------------|
| fine tuned `gpt-4-0613` | `response = completion(model="ft:gpt-4-0613", messages=messages)` |
| fine tuned `gpt-4o-2024-05-13` | `response = completion(model="ft:gpt-4o-2024-05-13", messages=messages)` |
| fine tuned `gpt-3.5-turbo-0125` | `response = completion(model="ft:gpt-3.5-turbo-0125", messages=messages)` |
| fine tuned `gpt-3.5-turbo-1106` | `response = completion(model="ft:gpt-3.5-turbo-1106", messages=messages)` |
| fine tuned `gpt-3.5-turbo-0613` | `response = completion(model="ft:gpt-3.5-turbo-0613", messages=messages)` |
## Vertex AI
Fine tuned models on vertex have a numerical model/endpoint id.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
## set ENV variables
os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
response = completion(
model="vertex_ai/<your-finetuned-model>", # e.g. vertex_ai/4965075652664360960
messages=[{ "content": "Hello, how are you?","role": "user"}],
base_model="vertex_ai/gemini-1.5-pro" # the base model - used for routing
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add Vertex Credentials to your env
```bash
!gcloud auth application-default login
```
2. Setup config.yaml
```yaml
- model_name: finetuned-gemini
litellm_params:
model: vertex_ai/<ENDPOINT_ID>
vertex_project: <PROJECT_ID>
vertex_location: <LOCATION>
model_info:
base_model: vertex_ai/gemini-1.5-pro # IMPORTANT
```
3. Test it!
```bash
curl --location 'https://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: <LITELLM_KEY>' \
--data '{"model": "finetuned-gemini" ,"messages":[{"role": "user", "content":[{"type": "text", "text": "hi"}]}]}'
```
</TabItem>
</Tabs>

View file

@ -1,4 +1,4 @@
# Images
# Image Generation
## Quick Start

View file

@ -1,135 +0,0 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Moderation
### Usage
<Tabs>
<TabItem value="python" label="LiteLLM Python SDK">
```python
from litellm import moderation
response = moderation(
input="hello from litellm",
model="text-moderation-stable"
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy Server">
For `/moderations` endpoint, there is **no need to specify `model` in the request or on the litellm config.yaml**
Start litellm proxy server
```
litellm
```
<Tabs>
<TabItem value="python" label="OpenAI Python SDK">
```python
from openai import OpenAI
# set base_url to your proxy server
# set api_key to send to proxy server
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
response = client.moderations.create(
input="hello from litellm",
model="text-moderation-stable" # optional, defaults to `omni-moderation-latest`
)
print(response)
```
</TabItem>
<TabItem value="curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/moderations' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{"input": "Sample text goes here", "model": "text-moderation-stable"}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Input Params
LiteLLM accepts and translates the [OpenAI Moderation params](https://platform.openai.com/docs/api-reference/moderations) across all supported providers.
### Required Fields
- `input`: *string or array* - Input (or inputs) to classify. Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
- If string: A string of text to classify for moderation
- If array of strings: An array of strings to classify for moderation
- If array of objects: An array of multi-modal inputs to the moderation model, where each object can be:
- An object describing an image to classify with:
- `type`: *string, required* - Always `image_url`
- `image_url`: *object, required* - Contains either an image URL or a data URL for a base64 encoded image
- An object describing text to classify with:
- `type`: *string, required* - Always `text`
- `text`: *string, required* - A string of text to classify
### Optional Fields
- `model`: *string (optional)* - The moderation model to use. Defaults to `omni-moderation-latest`.
## Output Format
Here's the exact json output and type you can expect from all moderation calls:
[**LiteLLM follows OpenAI's output format**](https://platform.openai.com/docs/api-reference/moderations/object)
```python
{
"id": "modr-AB8CjOTu2jiq12hp1AQPfeqFWaORR",
"model": "text-moderation-007",
"results": [
{
"flagged": true,
"categories": {
"sexual": false,
"hate": false,
"harassment": true,
"self-harm": false,
"sexual/minors": false,
"hate/threatening": false,
"violence/graphic": false,
"self-harm/intent": false,
"self-harm/instructions": false,
"harassment/threatening": true,
"violence": true
},
"category_scores": {
"sexual": 0.000011726012417057063,
"hate": 0.22706663608551025,
"harassment": 0.5215635299682617,
"self-harm": 2.227119921371923e-6,
"sexual/minors": 7.107352217872176e-8,
"hate/threatening": 0.023547329008579254,
"violence/graphic": 0.00003391829886822961,
"self-harm/intent": 1.646940972932498e-6,
"self-harm/instructions": 1.1198755256458526e-9,
"harassment/threatening": 0.5694745779037476,
"violence": 0.9971134662628174
}
}
]
}
```
## **Supported Providers**
| Provider |
|-------------|
| OpenAI |

View file

@ -4,48 +4,9 @@ import TabItem from '@theme/TabItem';
# Argilla
Argilla is a collaborative annotation tool for AI engineers and domain experts who need to build high-quality datasets for their projects.
Argilla is a tool for annotating datasets.
## Getting Started
To log the data to Argilla, first you need to deploy the Argilla server. If you have not deployed the Argilla server, please follow the instructions [here](https://docs.argilla.io/latest/getting_started/quickstart/).
Next, you will need to configure and create the Argilla dataset.
```python
import argilla as rg
client = rg.Argilla(api_url="<api_url>", api_key="<api_key>")
settings = rg.Settings(
guidelines="These are some guidelines.",
fields=[
rg.ChatField(
name="user_input",
),
rg.TextField(
name="llm_output",
),
],
questions=[
rg.RatingQuestion(
name="rating",
values=[1, 2, 3, 4, 5, 6, 7],
),
],
)
dataset = rg.Dataset(
name="my_first_dataset",
settings=settings,
)
dataset.create()
```
For further configuration, please refer to the [Argilla documentation](https://docs.argilla.io/latest/how_to_guides/dataset/).
## Usage
@ -53,14 +14,14 @@ For further configuration, please refer to the [Argilla documentation](https://d
<Tab value="sdk" label="SDK">
```python
import os
import litellm
from litellm import completion
import litellm
import os
# add env vars
os.environ["ARGILLA_API_KEY"]="argilla.apikey"
os.environ["ARGILLA_BASE_URL"]="http://localhost:6900"
os.environ["ARGILLA_DATASET_NAME"]="my_first_dataset"
os.environ["ARGILLA_DATASET_NAME"]="my_second_dataset"
os.environ["OPENAI_API_KEY"]="sk-proj-..."
litellm.callbacks = ["argilla"]

View file

@ -1,108 +0,0 @@
# MLflow
## What is MLflow?
**MLflow** is an end-to-end open source MLOps platform for [experiment tracking](https://www.mlflow.org/docs/latest/tracking.html), [model management](https://www.mlflow.org/docs/latest/models.html), [evaluation](https://www.mlflow.org/docs/latest/llms/llm-evaluate/index.html), [observability (tracing)](https://www.mlflow.org/docs/latest/llms/tracing/index.html), and [deployment](https://www.mlflow.org/docs/latest/deployment/index.html). MLflow empowers teams to collaboratively develop and refine LLM applications efficiently.
MLflows integration with LiteLLM supports advanced observability compatible with OpenTelemetry.
<Image img={require('../../img/mlflow_tracing.png')} />
## Getting Started
Install MLflow:
```shell
pip install mlflow
```
To enable LiteLLM tracing:
```python
import mlflow
mlflow.litellm.autolog()
# Alternative, you can set the callback manually in LiteLLM
# litellm.callbacks = ["mlflow"]
```
Since MLflow is open-source, no sign-up or API key is needed to log traces!
```
import litellm
import os
# Set your LLM provider's API key
os.environ["OPENAI_API_KEY"] = ""
# Call LiteLLM as usual
response = litellm.completion(
model="gpt-4o-mini",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
Open the MLflow UI and go to the `Traces` tab to view logged traces:
```bash
mlflow ui
```
## Exporting Traces to OpenTelemetry collectors
MLflow traces are compatible with OpenTelemetry. You can export traces to any OpenTelemetry collector (e.g., Jaeger, Zipkin, Datadog, New Relic) by setting the endpoint URL in the environment variables.
```
# Set the endpoint of the OpenTelemetry Collector
os.environ["OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"] = "http://localhost:4317/v1/traces"
# Optionally, set the service name to group traces
os.environ["OTEL_SERVICE_NAME"] = "<your-service-name>"
```
See [MLflow documentation](https://mlflow.org/docs/latest/llms/tracing/index.html#using-opentelemetry-collector-for-exporting-traces) for more details.
## Combine LiteLLM Trace with Your Application Trace
LiteLLM is often part of larger LLM applications, such as agentic models. MLflow Tracing allows you to instrument custom Python code, which can then be combined with LiteLLM traces.
```python
import litellm
import mlflow
from mlflow.entities import SpanType
# Enable LiteLLM tracing
mlflow.litellm.autolog()
class CustomAgent:
# Use @mlflow.trace to instrument Python functions.
@mlflow.trace(span_type=SpanType.AGENT)
def run(self, query: str):
# do something
while i < self.max_turns:
response = litellm.completion(
model="gpt-4o-mini",
messages=messages,
)
action = self.get_action(response)
...
@mlflow.trace
def get_action(llm_response):
...
```
This approach generates a unified trace, combining your custom Python code with LiteLLM calls.
## Support
* For advanced usage and integrations of tracing, visit the [MLflow Tracing documentation](https://mlflow.org/docs/latest/llms/tracing/index.html).
* For any question or issue with this integration, please [submit an issue](https://github.com/mlflow/mlflow/issues/new/choose) on our [Github](https://github.com/mlflow/mlflow) repository!

View file

@ -49,19 +49,9 @@ OTEL_ENDPOINT="http://0.0.0.0:4317"
</TabItem>
<TabItem value="laminar" label="Log to Laminar">
```shell
OTEL_EXPORTER="otlp_grpc"
OTEL_ENDPOINT="https://api.lmnr.ai:8443"
OTEL_HEADERS="authorization=Bearer <project-api-key>"
```
</TabItem>
</Tabs>
Use just 1 line of code, to instantly log your LLM responses **across all providers** with OpenTelemetry:
Use just 2 lines of code, to instantly log your LLM responses **across all providers** with OpenTelemetry:
```python
litellm.callbacks = ["otel"]
@ -86,20 +76,3 @@ Be aware that if you are continuing an existing trace, and you set `update_trace
## Support
For any question or issue with the integration you can reach out to the OpenLLMetry maintainers on [Slack](https://traceloop.com/slack) or via [email](mailto:dev@traceloop.com).
## Troubleshooting
### Trace LiteLLM Proxy user/key/org/team information on failed requests
LiteLLM emits the user_api_key_metadata
- key hash
- key_alias
- org_id
- user_id
- team_id
for successful + failed requests
click under `litellm_request` in the trace
<Image img={require('../../img/otel_debug_trace.png')} />

View file

@ -1,371 +0,0 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Anthropic SDK
Pass-through endpoints for Anthropic - call provider-specific endpoint, in native format (no translation).
Just replace `https://api.anthropic.com` with `LITELLM_PROXY_BASE_URL/anthropic`
#### **Example Usage**
<Tabs>
<TabItem value="curl" label="curl">
```bash
curl --request POST \
--url http://0.0.0.0:4000/anthropic/v1/messages \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"model": "claude-3-5-sonnet-20241022",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
</TabItem>
<TabItem value="python" label="Anthropic Python SDK">
```python
from anthropic import Anthropic
# Initialize client with proxy base URL
client = Anthropic(
base_url="http://0.0.0.0:4000/anthropic", # <proxy-base-url>/anthropic
api_key="sk-anything" # proxy virtual key
)
# Make a completion request
response = client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=1024,
messages=[
{"role": "user", "content": "Hello, world"}
]
)
print(response)
```
</TabItem>
</Tabs>
Supports **ALL** Anthropic Endpoints (including streaming).
[**See All Anthropic Endpoints**](https://docs.anthropic.com/en/api/messages)
## Quick Start
Let's call the Anthropic [`/messages` endpoint](https://docs.anthropic.com/en/api/messages)
1. Add Anthropic API Key to your environment
```bash
export ANTHROPIC_API_KEY=""
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the Anthropic /messages endpoint
```bash
curl http://0.0.0.0:4000/anthropic/v1/messages \
--header "x-api-key: $LITELLM_API_KEY" \
--header "anthropic-version: 2023-06-01" \
--header "content-type: application/json" \
--data \
'{
"model": "claude-3-5-sonnet-20241022",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
## Examples
Anything after `http://0.0.0.0:4000/anthropic` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://api.anthropic.com` | `http://0.0.0.0:4000/anthropic` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `bearer $ANTHROPIC_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: Messages endpoint**
#### LiteLLM Proxy Call
```bash
curl --request POST \
--url http://0.0.0.0:4000/anthropic/v1/messages \
--header "x-api-key: $LITELLM_API_KEY" \
--header "anthropic-version: 2023-06-01" \
--header "content-type: application/json" \
--data '{
"model": "claude-3-5-sonnet-20241022",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
#### Direct Anthropic API Call
```bash
curl https://api.anthropic.com/v1/messages \
--header "x-api-key: $ANTHROPIC_API_KEY" \
--header "anthropic-version: 2023-06-01" \
--header "content-type: application/json" \
--data \
'{
"model": "claude-3-5-sonnet-20241022",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
### **Example 2: Token Counting API**
#### LiteLLM Proxy Call
```bash
curl --request POST \
--url http://0.0.0.0:4000/anthropic/v1/messages/count_tokens \
--header "x-api-key: $LITELLM_API_KEY" \
--header "anthropic-version: 2023-06-01" \
--header "anthropic-beta: token-counting-2024-11-01" \
--header "content-type: application/json" \
--data \
'{
"model": "claude-3-5-sonnet-20241022",
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
#### Direct Anthropic API Call
```bash
curl https://api.anthropic.com/v1/messages/count_tokens \
--header "x-api-key: $ANTHROPIC_API_KEY" \
--header "anthropic-version: 2023-06-01" \
--header "anthropic-beta: token-counting-2024-11-01" \
--header "content-type: application/json" \
--data \
'{
"model": "claude-3-5-sonnet-20241022",
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
### **Example 3: Batch Messages**
#### LiteLLM Proxy Call
```bash
curl --request POST \
--url http://0.0.0.0:4000/anthropic/v1/messages/batches \
--header "x-api-key: $LITELLM_API_KEY" \
--header "anthropic-version: 2023-06-01" \
--header "anthropic-beta: message-batches-2024-09-24" \
--header "content-type: application/json" \
--data \
'{
"requests": [
{
"custom_id": "my-first-request",
"params": {
"model": "claude-3-5-sonnet-20241022",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}
},
{
"custom_id": "my-second-request",
"params": {
"model": "claude-3-5-sonnet-20241022",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hi again, friend"}
]
}
}
]
}'
```
#### Direct Anthropic API Call
```bash
curl https://api.anthropic.com/v1/messages/batches \
--header "x-api-key: $ANTHROPIC_API_KEY" \
--header "anthropic-version: 2023-06-01" \
--header "anthropic-beta: message-batches-2024-09-24" \
--header "content-type: application/json" \
--data \
'{
"requests": [
{
"custom_id": "my-first-request",
"params": {
"model": "claude-3-5-sonnet-20241022",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}
},
{
"custom_id": "my-second-request",
"params": {
"model": "claude-3-5-sonnet-20241022",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hi again, friend"}
]
}
}
]
}'
```
## Advanced
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Anthropic API key, but still letting them use Anthropic endpoints.
### Use with Virtual Keys
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export COHERE_API_KEY=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl --request POST \
--url http://0.0.0.0:4000/anthropic/v1/messages \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-1234ewknldferwedojwojw" \
--data '{
"model": "claude-3-5-sonnet-20241022",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
### Send `litellm_metadata` (tags)
<Tabs>
<TabItem value="curl" label="curl">
```bash
curl --request POST \
--url http://0.0.0.0:4000/anthropic/v1/messages \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"model": "claude-3-5-sonnet-20241022",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
],
"litellm_metadata": {
"tags": ["test-tag-1", "test-tag-2"]
}
}'
```
</TabItem>
<TabItem value="python" label="Anthropic Python SDK">
```python
from anthropic import Anthropic
client = Anthropic(
base_url="http://0.0.0.0:4000/anthropic",
api_key="sk-anything"
)
response = client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=1024,
messages=[
{"role": "user", "content": "Hello, world"}
],
extra_body={
"litellm_metadata": {
"tags": ["test-tag-1", "test-tag-2"]
}
}
)
print(response)
```
</TabItem>
</Tabs>

View file

@ -1,4 +1,4 @@
# Cohere SDK
# Cohere API
Pass-through endpoints for Cohere - call provider-specific endpoint, in native format (no translation).

View file

@ -1,21 +1,12 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Google AI Studio SDK
# Google AI Studio
Pass-through endpoints for Google AI Studio - call provider-specific endpoint, in native format (no translation).
Just replace `https://generativelanguage.googleapis.com` with `LITELLM_PROXY_BASE_URL/gemini`
Just replace `https://generativelanguage.googleapis.com` with `LITELLM_PROXY_BASE_URL/gemini` 🚀
#### **Example Usage**
<Tabs>
<TabItem value="curl" label="curl">
```bash
curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-anything' \
http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-anything' \
-H 'Content-Type: application/json' \
-d '{
"contents": [{
@ -26,53 +17,6 @@ curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=
}'
```
</TabItem>
<TabItem value="js" label="Google AI Node.js SDK">
```javascript
const { GoogleGenerativeAI } = require("@google/generative-ai");
const modelParams = {
model: 'gemini-pro',
};
const requestOptions = {
baseUrl: 'http://localhost:4000/gemini', // http://<proxy-base-url>/gemini
};
const genAI = new GoogleGenerativeAI("sk-1234"); // litellm proxy API key
const model = genAI.getGenerativeModel(modelParams, requestOptions);
async function main() {
try {
const result = await model.generateContent("Explain how AI works");
console.log(result.response.text());
} catch (error) {
console.error('Error:', error);
}
}
// For streaming responses
async function main_streaming() {
try {
const streamingResult = await model.generateContentStream("Explain how AI works");
for await (const chunk of streamingResult.stream) {
console.log('Stream chunk:', JSON.stringify(chunk));
}
const aggregatedResponse = await streamingResult.response;
console.log('Aggregated response:', JSON.stringify(aggregatedResponse));
} catch (error) {
console.error('Error:', error);
}
}
main();
// main_streaming();
```
</TabItem>
</Tabs>
Supports **ALL** Google AI Studio Endpoints (including streaming).
[**See All Google AI Studio Endpoints**](https://ai.google.dev/api)
@ -222,14 +166,14 @@ curl -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5
```
## Advanced
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Google AI Studio key, but still letting them use Google AI Studio endpoints.
### Use with Virtual Keys
### Usage
1. Setup environment
@ -277,65 +221,3 @@ http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-123
}]
}'
```
### Send `tags` in request headers
Use this if you want `tags` to be tracked in the LiteLLM DB and on logging callbacks.
Pass tags in request headers as a comma separated list. In the example below the following tags will be tracked
```
tags: ["gemini-js-sdk", "pass-through-endpoint"]
```
<Tabs>
<TabItem value="curl" label="curl">
```bash
curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:generateContent?key=sk-anything' \
-H 'Content-Type: application/json' \
-H 'tags: gemini-js-sdk,pass-through-endpoint' \
-d '{
"contents": [{
"parts":[{
"text": "The quick brown fox jumps over the lazy dog."
}]
}]
}'
```
</TabItem>
<TabItem value="js" label="Google AI Node.js SDK">
```javascript
const { GoogleGenerativeAI } = require("@google/generative-ai");
const modelParams = {
model: 'gemini-pro',
};
const requestOptions = {
baseUrl: 'http://localhost:4000/gemini', // http://<proxy-base-url>/gemini
customHeaders: {
"tags": "gemini-js-sdk,pass-through-endpoint"
}
};
const genAI = new GoogleGenerativeAI("sk-1234");
const model = genAI.getGenerativeModel(modelParams, requestOptions);
async function main() {
try {
const result = await model.generateContent("Explain how AI works");
console.log(result.response.text());
} catch (error) {
console.error('Error:', error);
}
}
main();
```
</TabItem>
</Tabs>

View file

@ -1,4 +1,4 @@
# Langfuse SDK
# Langfuse Endpoints
Pass-through endpoints for Langfuse - call langfuse endpoints with LiteLLM Virtual Key.

File diff suppressed because it is too large Load diff

View file

@ -10,35 +10,6 @@ LiteLLM supports all anthropic models.
- `claude-2.1`
- `claude-instant-1.2`
| Property | Details |
|-------|-------|
| Description | Claude is a highly performant, trustworthy, and intelligent AI platform built by Anthropic. Claude excels at tasks involving language, reasoning, analysis, coding, and more. |
| Provider Route on LiteLLM | `anthropic/` (add this prefix to the model name, to route any requests to Anthropic - e.g. `anthropic/claude-3-5-sonnet-20240620`) |
| Provider Doc | [Anthropic ↗](https://docs.anthropic.com/en/docs/build-with-claude/overview) |
| API Endpoint for Provider | https://api.anthropic.com |
| Supported Endpoints | `/chat/completions` |
## Supported OpenAI Parameters
Check this in code, [here](../completion/input.md#translated-openai-params)
```
"stream",
"stop",
"temperature",
"top_p",
"max_tokens",
"max_completion_tokens",
"tools",
"tool_choice",
"extra_headers",
"parallel_tool_calls",
"response_format",
"user"
```
:::info
Anthropic API fails requests when `max_tokens` are not passed. Due to this litellm passes `max_tokens=4096` when no `max_tokens` are passed.
@ -893,145 +864,3 @@ Human: How do I boil water?
Assistant:
```
## Usage - PDF
Pass base64 encoded PDF files to Anthropic models using the `image_url` field.
<Tabs>
<TabItem value="sdk" label="SDK">
### **using base64**
```python
from litellm import completion, supports_pdf_input
import base64
import requests
# URL of the file
url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
# Download the file
response = requests.get(url)
file_data = response.content
encoded_file = base64.b64encode(file_data).decode("utf-8")
## check if model supports pdf input - (2024/11/11) only claude-3-5-haiku-20241022 supports it
supports_pdf_input("anthropic/claude-3-5-haiku-20241022") # True
response = completion(
model="anthropic/claude-3-5-haiku-20241022",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
{
"type": "image_url",
"image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
},
],
}
],
max_tokens=300,
)
print(response.choices[0])
```
</TabItem>
<TabItem value="proxy" lable="PROXY">
1. Add model to config
```yaml
- model_name: claude-3-5-haiku-20241022
litellm_params:
model: anthropic/claude-3-5-haiku-20241022
api_key: os.environ/ANTHROPIC_API_KEY
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "claude-3-5-haiku-20241022",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "You are a very professional document summarization specialist. Please summarize the given document"
},
{
"type": "image_url",
"image_url": "data:application/pdf;base64,{encoded_file}" # 👈 PDF
}
}
]
}
],
"max_tokens": 300
}'
```
</TabItem>
</Tabs>
## Usage - passing 'user_id' to Anthropic
LiteLLM translates the OpenAI `user` param to Anthropic's `metadata[user_id]` param.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = completion(
model="claude-3-5-sonnet-20240620",
messages=messages,
user="user_123",
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: claude-3-5-sonnet-20240620
litellm_params:
model: anthropic/claude-3-5-sonnet-20240620
api_key: os.environ/ANTHROPIC_API_KEY
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "claude-3-5-sonnet-20240620",
"messages": [{"role": "user", "content": "What is Anthropic?"}],
"user": "user_123"
}'
```
</TabItem>
</Tabs>

View file

@ -1082,6 +1082,5 @@ print(f"response: {response}")
| Model Name | Function Call |
|----------------------|---------------------------------------------|
| Stable Diffusion 3 - v0 | `embedding(model="bedrock/stability.stability.sd3-large-v1:0", prompt=prompt)` |
| Stable Diffusion - v0 | `embedding(model="bedrock/stability.stable-diffusion-xl-v0", prompt=prompt)` |
| Stable Diffusion - v0 | `embedding(model="bedrock/stability.stable-diffusion-xl-v1", prompt=prompt)` |

View file

@ -10,7 +10,6 @@ import TabItem from '@theme/TabItem';
| Provider Route on LiteLLM | `gemini/` |
| Provider Doc | [Google AI Studio ↗](https://ai.google.dev/aistudio) |
| API Endpoint for Provider | https://generativelanguage.googleapis.com |
| Supported Endpoints | `/chat/completions`, `/embeddings` |
<br />

View file

@ -37,7 +37,7 @@ os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
messages = [{ "content": "There's a llama in my garden 😱 What should I do?","role": "user"}]
# e.g. Call 'https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct' from Serverless Inference API
response = completion(
response = litellm.completion(
model="huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct",
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True
@ -165,14 +165,14 @@ Steps to use
```python
import os
from litellm import completion
import litellm
os.environ["HUGGINGFACE_API_KEY"] = ""
# TGI model: Call https://huggingface.co/glaiveai/glaive-coder-7b
# add the 'huggingface/' prefix to the model to set huggingface as the provider
# set api base to your deployed api endpoint from hugging face
response = completion(
response = litellm.completion(
model="huggingface/glaiveai/glaive-coder-7b",
messages=[{ "content": "Hello, how are you?","role": "user"}],
api_base="https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud"
@ -383,8 +383,6 @@ def default_pt(messages):
#### Custom prompt templates
```python
import litellm
# Create your own custom prompt template works
litellm.register_prompt_template(
model="togethercomputer/LLaMA-2-7B-32K",

View file

@ -1,13 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Jina AI
https://jina.ai/embeddings/
Supported endpoints:
- /embeddings
- /rerank
## API Key
```python
# env variable
@ -15,10 +8,6 @@ os.environ['JINA_AI_API_KEY']
```
## Sample Usage - Embedding
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import embedding
import os
@ -30,142 +19,6 @@ response = embedding(
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add to config.yaml
```yaml
model_list:
- model_name: embedding-model
litellm_params:
model: jina_ai/jina-embeddings-v3
api_key: os.environ/JINA_AI_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000/
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"input": ["hello world"], "model": "embedding-model"}'
```
</TabItem>
</Tabs>
## Sample Usage - Rerank
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import rerank
import os
os.environ["JINA_AI_API_KEY"] = "sk-..."
query = "What is the capital of the United States?"
documents = [
"Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. is the capital of the United States.",
"Capital punishment has existed in the United States since before it was a country.",
]
response = rerank(
model="jina_ai/jina-reranker-v2-base-multilingual",
query=query,
documents=documents,
top_n=3,
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add to config.yaml
```yaml
model_list:
- model_name: rerank-model
litellm_params:
model: jina_ai/jina-reranker-v2-base-multilingual
api_key: os.environ/JINA_AI_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/rerank' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"model": "rerank-model",
"query": "What is the capital of the United States?",
"documents": [
"Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. is the capital of the United States.",
"Capital punishment has existed in the United States since before it was a country."
],
"top_n": 3
}'
```
</TabItem>
</Tabs>
## Supported Models
All models listed here https://jina.ai/embeddings/ are supported
## Supported Optional Rerank Parameters
All cohere rerank parameters are supported.
## Supported Optional Embeddings Parameters
```
dimensions
```
## Provider-specific parameters
Pass any jina ai specific parameters as a keyword argument to the `embedding` or `rerank` function, e.g.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = embedding(
model="jina_ai/jina-embeddings-v3",
input=["good morning from litellm"],
dimensions=1536,
my_custom_param="my_custom_value", # any other jina ai specific parameters
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"input": ["good morning from litellm"], "model": "jina_ai/jina-embeddings-v3", "dimensions": 1536, "my_custom_param": "my_custom_value"}'
```
</TabItem>
</Tabs>

View file

@ -572,96 +572,6 @@ Here's how to use Vertex AI with the LiteLLM Proxy Server
</Tabs>
## Authentication - vertex_project, vertex_location, etc.
Set your vertex credentials via:
- dynamic params
OR
- env vars
### **Dynamic Params**
You can set:
- `vertex_credentials` (str) - can be a json string or filepath to your vertex ai service account.json
- `vertex_location` (str) - place where vertex model is deployed (us-central1, asia-southeast1, etc.)
- `vertex_project` Optional[str] - use if vertex project different from the one in vertex_credentials
as dynamic params for a `litellm.completion` call.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
## GET CREDENTIALS
file_path = 'path/to/vertex_ai_service_account.json'
# Load the JSON file
with open(file_path, 'r') as file:
vertex_credentials = json.load(file)
# Convert to JSON string
vertex_credentials_json = json.dumps(vertex_credentials)
response = completion(
model="vertex_ai/gemini-pro",
messages=[{"content": "You are a good bot.","role": "system"}, {"content": "Hello, how are you?","role": "user"}],
vertex_credentials=vertex_credentials_json,
vertex_project="my-special-project",
vertex_location="my-special-location"
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: gemini-1.5-pro
litellm_params:
model: gemini-1.5-pro
vertex_credentials: os.environ/VERTEX_FILE_PATH_ENV_VAR # os.environ["VERTEX_FILE_PATH_ENV_VAR"] = "/path/to/service_account.json"
vertex_project: "my-special-project"
vertex_location: "my-special-location:
```
</TabItem>
</Tabs>
### **Environment Variables**
You can set:
- `GOOGLE_APPLICATION_CREDENTIALS` - store the filepath for your service_account.json in here (used by vertex sdk directly).
- VERTEXAI_LOCATION - place where vertex model is deployed (us-central1, asia-southeast1, etc.)
- VERTEXAI_PROJECT - Optional[str] - use if vertex project different from the one in vertex_credentials
1. GOOGLE_APPLICATION_CREDENTIALS
```bash
export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service_account.json"
```
2. VERTEXAI_LOCATION
```bash
export VERTEXAI_LOCATION="us-central1" # can be any vertex location
```
3. VERTEXAI_PROJECT
```bash
export VERTEXAI_PROJECT="my-test-project" # ONLY use if model project is different from service account project
```
## Specifying Safety Settings
In certain use-cases you may need to make calls to the models and pass [safety settigns](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simple pass the `safety_settings` argument to `completion` or `acompletion`. For example:
@ -1251,96 +1161,12 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
## Model Garden
:::tip
All OpenAI compatible models from Vertex Model Garden are supported.
:::
| Model Name | Function Call |
|------------------|--------------------------------------|
| llama2 | `completion('vertex_ai/<endpoint_id>', messages)` |
#### Using Model Garden
**Almost all Vertex Model Garden models are OpenAI compatible.**
<Tabs>
<TabItem value="openai" label="OpenAI Compatible Models">
| Property | Details |
|----------|---------|
| Provider Route | `vertex_ai/openai/{MODEL_ID}` |
| Vertex Documentation | [Vertex Model Garden - OpenAI Chat Completions](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gradio_streaming_chat_completions.ipynb), [Vertex Model Garden](https://cloud.google.com/model-garden?hl=en) |
| Supported Operations | `/chat/completions`, `/embeddings` |
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
## set ENV variables
os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
response = completion(
model="vertex_ai/openai/<your-endpoint-id>",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: llama3-1-8b-instruct
litellm_params:
model: vertex_ai/openai/5464397967697903616
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "llama3-1-8b-instruct", # 👈 the 'model_name' in config
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="non-openai" label="Non-OpenAI Compatible Models">
```python
from litellm import completion
import os
@ -1355,11 +1181,6 @@ response = completion(
)
```
</TabItem>
</Tabs>
## Gemini Pro
| Model Name | Function Call |
|------------------|--------------------------------------|
@ -1741,10 +1562,6 @@ curl http://0.0.0.0:4000/v1/chat/completions \
## **Embedding Models**
#### Usage - Embedding
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
from litellm import embedding
@ -1757,49 +1574,6 @@ response = embedding(
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: snowflake-arctic-embed-m-long-1731622468876
litellm_params:
model: vertex_ai/<your-model-id>
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: adroit-crow-413218-a956eef1a2a8.json
litellm_settings:
drop_params: True
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request using OpenAI Python SDK, Langchain Python SDK
```python
import openai
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
response = client.embeddings.create(
model="snowflake-arctic-embed-m-long-1731622468876",
input = ["good morning from litellm", "this is another item"],
)
print(response)
```
</TabItem>
</Tabs>
#### Supported Embedding Models
All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a0249f630a6792d49dffc2c5d9b7/model_prices_and_context_window.json#L835) are supported
@ -1815,7 +1589,6 @@ All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a02
| textembedding-gecko@003 | `embedding(model="vertex_ai/textembedding-gecko@003", input)` |
| text-embedding-preview-0409 | `embedding(model="vertex_ai/text-embedding-preview-0409", input)` |
| text-multilingual-embedding-preview-0409 | `embedding(model="vertex_ai/text-multilingual-embedding-preview-0409", input)` |
| Fine-tuned OR Custom Embedding models | `embedding(model="vertex_ai/<your-model-id>", input)` |
### Supported OpenAI (Unified) Params
@ -2393,6 +2166,97 @@ print("response from proxy", response)
</TabItem>
</Tabs>
## Authentication - vertex_project, vertex_location, etc.
Set your vertex credentials via:
- dynamic params
OR
- env vars
### **Dynamic Params**
You can set:
- `vertex_credentials` (str) - can be a json string or filepath to your vertex ai service account.json
- `vertex_location` (str) - place where vertex model is deployed (us-central1, asia-southeast1, etc.)
- `vertex_project` Optional[str] - use if vertex project different from the one in vertex_credentials
as dynamic params for a `litellm.completion` call.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
## GET CREDENTIALS
file_path = 'path/to/vertex_ai_service_account.json'
# Load the JSON file
with open(file_path, 'r') as file:
vertex_credentials = json.load(file)
# Convert to JSON string
vertex_credentials_json = json.dumps(vertex_credentials)
response = completion(
model="vertex_ai/gemini-pro",
messages=[{"content": "You are a good bot.","role": "system"}, {"content": "Hello, how are you?","role": "user"}],
vertex_credentials=vertex_credentials_json,
vertex_project="my-special-project",
vertex_location="my-special-location"
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: gemini-1.5-pro
litellm_params:
model: gemini-1.5-pro
vertex_credentials: os.environ/VERTEX_FILE_PATH_ENV_VAR # os.environ["VERTEX_FILE_PATH_ENV_VAR"] = "/path/to/service_account.json"
vertex_project: "my-special-project"
vertex_location: "my-special-location:
```
</TabItem>
</Tabs>
### **Environment Variables**
You can set:
- `GOOGLE_APPLICATION_CREDENTIALS` - store the filepath for your service_account.json in here (used by vertex sdk directly).
- VERTEXAI_LOCATION - place where vertex model is deployed (us-central1, asia-southeast1, etc.)
- VERTEXAI_PROJECT - Optional[str] - use if vertex project different from the one in vertex_credentials
1. GOOGLE_APPLICATION_CREDENTIALS
```bash
export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service_account.json"
```
2. VERTEXAI_LOCATION
```bash
export VERTEXAI_LOCATION="us-central1" # can be any vertex location
```
3. VERTEXAI_PROJECT
```bash
export VERTEXAI_PROJECT="my-test-project" # ONLY use if model project is different from service account project
```
## Extra
### Using `GOOGLE_APPLICATION_CREDENTIALS`

View file

@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Alerting / Webhooks
# 🚨 Alerting / Webhooks
Get alerts for:

View file

@ -9,7 +9,7 @@ LiteLLM Supports Logging to the following Cloud Buckets
- (Enterprise) ✨ [Google Cloud Storage Buckets](#logging-proxy-inputoutput-to-google-cloud-storage-buckets)
- (Free OSS) [Amazon s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
## Google Cloud Storage Buckets
## Logging Proxy Input/Output to Google Cloud Storage Buckets
Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en)
@ -20,14 +20,6 @@ Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?
:::
| Property | Details |
|----------|---------|
| Description | Log LLM Input/Output to cloud storage buckets |
| Load Test Benchmarks | [Benchmarks](https://docs.litellm.ai/docs/benchmarks) |
| Google Docs on Cloud Storage | [Google Cloud Storage](https://cloud.google.com/storage?hl=en) |
### Usage
1. Add `gcs_bucket` to LiteLLM Config.yaml
@ -93,7 +85,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
## s3 Buckets
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set

View file

@ -136,7 +136,6 @@ litellm_settings:
type: "redis"
service_name: "mymaster"
sentinel_nodes: [["localhost", 26379]]
sentinel_password: "password" # [OPTIONAL]
```
</TabItem>
@ -150,7 +149,6 @@ You can configure redis sentinel in your .env by setting `REDIS_SENTINEL_NODES`
```env
REDIS_SENTINEL_NODES='[["localhost", 26379]]'
REDIS_SERVICE_NAME = "mymaster"
REDIS_SENTINEL_PASSWORD = "password"
```
:::note

View file

@ -1,59 +0,0 @@
# File Management
## `include` external YAML files in a config.yaml
You can use `include` to include external YAML files in a config.yaml.
**Quick Start Usage:**
To include a config file, use `include` with either a single file or a list of files.
Contents of `parent_config.yaml`:
```yaml
include:
- model_config.yaml # 👈 Key change, will include the contents of model_config.yaml
litellm_settings:
callbacks: ["prometheus"]
```
Contents of `model_config.yaml`:
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/gpt-4o
api_base: https://exampleopenaiendpoint-production.up.railway.app/
- model_name: fake-anthropic-endpoint
litellm_params:
model: anthropic/fake
api_base: https://exampleanthropicendpoint-production.up.railway.app/
```
Start proxy server
This will start the proxy server with config `parent_config.yaml`. Since the `include` directive is used, the server will also include the contents of `model_config.yaml`.
```
litellm --config parent_config.yaml --detailed_debug
```
## Examples using `include`
Include a single file:
```yaml
include:
- model_config.yaml
```
Include multiple files:
```yaml
include:
- model_config.yaml
- another_config.yaml
```

View file

@ -1,507 +0,0 @@
# All settings
```yaml
environment_variables: {}
model_list:
- model_name: string
litellm_params: {}
model_info:
id: string
mode: embedding
input_cost_per_token: 0
output_cost_per_token: 0
max_tokens: 2048
base_model: gpt-4-1106-preview
additionalProp1: {}
litellm_settings:
# Logging/Callback settings
success_callback: ["langfuse"] # list of success callbacks
failure_callback: ["sentry"] # list of failure callbacks
callbacks: ["otel"] # list of callbacks - runs on success and failure
service_callbacks: ["datadog", "prometheus"] # logs redis, postgres failures on datadog, prometheus
turn_off_message_logging: boolean # prevent the messages and responses from being logged to on your callbacks, but request metadata will still be logged.
redact_user_api_key_info: boolean # Redact information about the user api key (hashed token, user_id, team id, etc.), from logs. Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging.
langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"] # default tags for Langfuse Logging
# Networking settings
request_timeout: 10 # (int) llm requesttimeout in seconds. Raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
force_ipv4: boolean # If true, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6 + Anthropic API
set_verbose: boolean # sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION
json_logs: boolean # if true, logs will be in json format
# Fallbacks, reliability
default_fallbacks: ["claude-opus"] # set default_fallbacks, in case a specific model group is misconfigured / bad.
content_policy_fallbacks: [{"gpt-3.5-turbo-small": ["claude-opus"]}] # fallbacks for ContentPolicyErrors
context_window_fallbacks: [{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}] # fallbacks for ContextWindowExceededErrors
# Caching settings
cache: true
cache_params: # set cache params for redis
type: redis # type of cache to initialize
# Optional - Redis Settings
host: "localhost" # The host address for the Redis cache. Required if type is "redis".
port: 6379 # The port number for the Redis cache. Required if type is "redis".
password: "your_password" # The password for the Redis cache. Required if type is "redis".
namespace: "litellm.caching.caching" # namespace for redis cache
# Optional - Redis Cluster Settings
redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}]
# Optional - Redis Sentinel Settings
service_name: "mymaster"
sentinel_nodes: [["localhost", 26379]]
# Optional - Qdrant Semantic Cache Settings
qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
qdrant_collection_name: test_collection
qdrant_quantization_config: binary
similarity_threshold: 0.8 # similarity threshold for semantic cache
# Optional - S3 Cache Settings
s3_bucket_name: cache-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 bucket
# Common Cache settings
# Optional - Supported call types for caching
supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
mode: default_off # if default_off, you need to opt in to caching on a per call basis
ttl: 600 # ttl for caching
callback_settings:
otel:
message_logging: boolean # OTEL logging callback specific settings
general_settings:
completion_model: string
disable_spend_logs: boolean # turn off writing each transaction to the db
disable_master_key_return: boolean # turn off returning master key on UI (checked on '/user/info' endpoint)
disable_retry_on_max_parallel_request_limit_error: boolean # turn off retries when max parallel request limit is reached
disable_reset_budget: boolean # turn off reset budget scheduled task
disable_adding_master_key_hash_to_db: boolean # turn off storing master key hash in db, for spend tracking
enable_jwt_auth: boolean # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
enforce_user_param: boolean # requires all openai endpoint requests to have a 'user' param
allowed_routes: ["route1", "route2"] # list of allowed proxy API routes - a user can access. (currently JWT-Auth only)
key_management_system: google_kms # either google_kms or azure_kms
master_key: string
# Database Settings
database_url: string
database_connection_pool_limit: 0 # default 100
database_connection_timeout: 0 # default 60s
allow_requests_on_db_unavailable: boolean # if true, will allow requests that can not connect to the DB to verify Virtual Key to still work
custom_auth: string
max_parallel_requests: 0 # the max parallel requests allowed per deployment
global_max_parallel_requests: 0 # the max parallel requests allowed on the proxy all up
infer_model_from_keys: true
background_health_checks: true
health_check_interval: 300
alerting: ["slack", "email"]
alerting_threshold: 0
use_client_credentials_pass_through_routes: boolean # use client credentials for all pass through routes like "/vertex-ai", /bedrock/. When this is True Virtual Key auth will not be applied on these endpoints
```
### litellm_settings - Reference
| Name | Type | Description |
|------|------|-------------|
| success_callback | array of strings | List of success callbacks. [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
| failure_callback | array of strings | List of failure callbacks [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
| callbacks | array of strings | List of callbacks - runs on success and failure [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
| service_callbacks | array of strings | System health monitoring - Logs redis, postgres failures on specified services (e.g. datadog, prometheus) [Doc Metrics](prometheus) |
| turn_off_message_logging | boolean | If true, prevents messages and responses from being logged to callbacks, but request metadata will still be logged [Proxy Logging](logging) |
| modify_params | boolean | If true, allows modifying the parameters of the request before it is sent to the LLM provider |
| enable_preview_features | boolean | If true, enables preview features - e.g. Azure O1 Models with streaming support.|
| redact_user_api_key_info | boolean | If true, redacts information about the user api key from logs [Proxy Logging](logging#redacting-userapikeyinfo) |
| langfuse_default_tags | array of strings | Default tags for Langfuse Logging. Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields as tags. [Further docs](./logging#litellm-specific-tags-on-langfuse---cache_hit-cache_key) |
| set_verbose | boolean | If true, sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION |
| json_logs | boolean | If true, logs will be in json format. If you need to store the logs as JSON, just set the `litellm.json_logs = True`. We currently just log the raw POST request from litellm as a JSON [Further docs](./debugging) |
| default_fallbacks | array of strings | List of fallback models to use if a specific model group is misconfigured / bad. [Further docs](./reliability#default-fallbacks) |
| request_timeout | integer | The timeout for requests in seconds. If not set, the default value is `6000 seconds`. [For reference OpenAI Python SDK defaults to `600 seconds`.](https://github.com/openai/openai-python/blob/main/src/openai/_constants.py) |
| force_ipv4 | boolean | If true, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6 + Anthropic API |
| content_policy_fallbacks | array of objects | Fallbacks to use when a ContentPolicyViolationError is encountered. [Further docs](./reliability#content-policy-fallbacks) |
| context_window_fallbacks | array of objects | Fallbacks to use when a ContextWindowExceededError is encountered. [Further docs](./reliability#context-window-fallbacks) |
| cache | boolean | If true, enables caching. [Further docs](./caching) |
| cache_params | object | Parameters for the cache. [Further docs](./caching) |
| cache_params.type | string | The type of cache to initialize. Can be one of ["local", "redis", "redis-semantic", "s3", "disk", "qdrant-semantic"]. Defaults to "redis". [Furher docs](./caching) |
| cache_params.host | string | The host address for the Redis cache. Required if type is "redis". |
| cache_params.port | integer | The port number for the Redis cache. Required if type is "redis". |
| cache_params.password | string | The password for the Redis cache. Required if type is "redis". |
| cache_params.namespace | string | The namespace for the Redis cache. |
| cache_params.redis_startup_nodes | array of objects | Redis Cluster Settings. [Further docs](./caching) |
| cache_params.service_name | string | Redis Sentinel Settings. [Further docs](./caching) |
| cache_params.sentinel_nodes | array of arrays | Redis Sentinel Settings. [Further docs](./caching) |
| cache_params.ttl | integer | The time (in seconds) to store entries in cache. |
| cache_params.qdrant_semantic_cache_embedding_model | string | The embedding model to use for qdrant semantic cache. |
| cache_params.qdrant_collection_name | string | The name of the collection to use for qdrant semantic cache. |
| cache_params.qdrant_quantization_config | string | The quantization configuration for the qdrant semantic cache. |
| cache_params.similarity_threshold | float | The similarity threshold for the semantic cache. |
| cache_params.s3_bucket_name | string | The name of the S3 bucket to use for the semantic cache. |
| cache_params.s3_region_name | string | The region name for the S3 bucket. |
| cache_params.s3_aws_access_key_id | string | The AWS access key ID for the S3 bucket. |
| cache_params.s3_aws_secret_access_key | string | The AWS secret access key for the S3 bucket. |
| cache_params.s3_endpoint_url | string | Optional - The endpoint URL for the S3 bucket. |
| cache_params.supported_call_types | array of strings | The types of calls to cache. [Further docs](./caching) |
| cache_params.mode | string | The mode of the cache. [Further docs](./caching) |
| disable_end_user_cost_tracking | boolean | If true, turns off end user cost tracking on prometheus metrics + litellm spend logs table on proxy. |
| key_generation_settings | object | Restricts who can generate keys. [Further docs](./virtual_keys.md#restricting-key-generation) |
### general_settings - Reference
| Name | Type | Description |
|------|------|-------------|
| completion_model | string | The default model to use for completions when `model` is not specified in the request |
| disable_spend_logs | boolean | If true, turns off writing each transaction to the database |
| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) |
| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached |
| disable_reset_budget | boolean | If true, turns off reset budget scheduled task |
| disable_adding_master_key_hash_to_db | boolean | If true, turns off storing master key hash in db |
| enable_jwt_auth | boolean | allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims. [Doc on JWT Tokens](token_auth) |
| enforce_user_param | boolean | If true, requires all OpenAI endpoint requests to have a 'user' param. [Doc on call hooks](call_hooks)|
| allowed_routes | array of strings | List of allowed proxy API routes a user can access [Doc on controlling allowed routes](enterprise#control-available-public-private-routes)|
| key_management_system | string | Specifies the key management system. [Doc Secret Managers](../secret) |
| master_key | string | The master key for the proxy [Set up Virtual Keys](virtual_keys) |
| database_url | string | The URL for the database connection [Set up Virtual Keys](virtual_keys) |
| database_connection_pool_limit | integer | The limit for database connection pool [Setting DB Connection Pool limit](#configure-db-pool-limits--connection-timeouts) |
| database_connection_timeout | integer | The timeout for database connections in seconds [Setting DB Connection Pool limit, timeout](#configure-db-pool-limits--connection-timeouts) |
| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key |
| custom_auth | string | Write your own custom authentication logic [Doc Custom Auth](virtual_keys#custom-auth) |
| max_parallel_requests | integer | The max parallel requests allowed per deployment |
| global_max_parallel_requests | integer | The max parallel requests allowed on the proxy overall |
| infer_model_from_keys | boolean | If true, infers the model from the provided keys |
| background_health_checks | boolean | If true, enables background health checks. [Doc on health checks](health) |
| health_check_interval | integer | The interval for health checks in seconds [Doc on health checks](health) |
| alerting | array of strings | List of alerting methods [Doc on Slack Alerting](alerting) |
| alerting_threshold | integer | The threshold for triggering alerts [Doc on Slack Alerting](alerting) |
| use_client_credentials_pass_through_routes | boolean | If true, uses client credentials for all pass-through routes. [Doc on pass through routes](pass_through) |
| health_check_details | boolean | If false, hides health check details (e.g. remaining rate limit). [Doc on health checks](health) |
| public_routes | List[str] | (Enterprise Feature) Control list of public routes |
| alert_types | List[str] | Control list of alert types to send to slack (Doc on alert types)[./alerting.md] |
| enforced_params | List[str] | (Enterprise Feature) List of params that must be included in all requests to the proxy |
| enable_oauth2_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
| use_x_forwarded_for | str | If true, uses the X-Forwarded-For header to get the client IP address |
| service_account_settings | List[Dict[str, Any]] | Set `service_account_settings` if you want to create settings that only apply to service account keys (Doc on service accounts)[./service_accounts.md] |
| image_generation_model | str | The default model to use for image generation - ignores model set in request |
| store_model_in_db | boolean | If true, allows `/model/new` endpoint to store model information in db. Endpoint disabled by default. [Doc on `/model/new` endpoint](./model_management.md#create-a-new-model) |
| max_request_size_mb | int | The maximum size for requests in MB. Requests above this size will be rejected. |
| max_response_size_mb | int | The maximum size for responses in MB. LLM Responses above this size will not be sent. |
| proxy_budget_rescheduler_min_time | int | The minimum time (in seconds) to wait before checking db for budget resets. **Default is 597 seconds** |
| proxy_budget_rescheduler_max_time | int | The maximum time (in seconds) to wait before checking db for budget resets. **Default is 605 seconds** |
| proxy_batch_write_at | int | Time (in seconds) to wait before batch writing spend logs to the db. **Default is 10 seconds** |
| alerting_args | dict | Args for Slack Alerting [Doc on Slack Alerting](./alerting.md) |
| custom_key_generate | str | Custom function for key generation [Doc on custom key generation](./virtual_keys.md#custom--key-generate) |
| allowed_ips | List[str] | List of IPs allowed to access the proxy. If not set, all IPs are allowed. |
| embedding_model | str | The default model to use for embeddings - ignores model set in request |
| default_team_disabled | boolean | If true, users cannot create 'personal' keys (keys with no team_id). |
| alert_to_webhook_url | Dict[str] | [Specify a webhook url for each alert type.](./alerting.md#set-specific-slack-channels-per-alert-type) |
| key_management_settings | List[Dict[str, Any]] | Settings for key management system (e.g. AWS KMS, Azure Key Vault) [Doc on key management](../secret.md) |
| allow_user_auth | boolean | (Deprecated) old approach for user authentication. |
| user_api_key_cache_ttl | int | The time (in seconds) to cache user api keys in memory. |
| disable_prisma_schema_update | boolean | If true, turns off automatic schema updates to DB |
| litellm_key_header_name | str | If set, allows passing LiteLLM keys as a custom header. [Doc on custom headers](./virtual_keys.md#custom-headers) |
| moderation_model | str | The default model to use for moderation. |
| custom_sso | str | Path to a python file that implements custom SSO logic. [Doc on custom SSO](./custom_sso.md) |
| allow_client_side_credentials | boolean | If true, allows passing client side credentials to the proxy. (Useful when testing finetuning models) [Doc on client side credentials](./virtual_keys.md#client-side-credentials) |
| admin_only_routes | List[str] | (Enterprise Feature) List of routes that are only accessible to admin users. [Doc on admin only routes](./enterprise#control-available-public-private-routes) |
| use_azure_key_vault | boolean | If true, load keys from azure key vault |
| use_google_kms | boolean | If true, load keys from google kms |
| spend_report_frequency | str | Specify how often you want a Spend Report to be sent (e.g. "1d", "2d", "30d") [More on this](./alerting.md#spend-report-frequency) |
| ui_access_mode | Literal["admin_only"] | If set, restricts access to the UI to admin users only. [Docs](./ui.md#restrict-ui-access) |
| litellm_jwtauth | Dict[str, Any] | Settings for JWT authentication. [Docs](./token_auth.md) |
| litellm_license | str | The license key for the proxy. [Docs](../enterprise.md#how-does-deployment-with-enterprise-license-work) |
| oauth2_config_mappings | Dict[str, str] | Define the OAuth2 config mappings |
| pass_through_endpoints | List[Dict[str, Any]] | Define the pass through endpoints. [Docs](./pass_through) |
| enable_oauth2_proxy_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
| forward_openai_org_id | boolean | If true, forwards the OpenAI Organization ID to the backend LLM call (if it's OpenAI). |
| forward_client_headers_to_llm_api | boolean | If true, forwards the client headers (any `x-` headers) to the backend LLM call |
### router_settings - Reference
:::info
Most values can also be set via `litellm_settings`. If you see overlapping values, settings on `router_settings` will override those on `litellm_settings`.
:::
```yaml
router_settings:
routing_strategy: usage-based-routing-v2 # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
redis_host: <your-redis-host> # string
redis_password: <your-redis-password> # string
redis_port: <your-redis-port> # string
enable_pre_call_check: true # bool - Before call is made check if a call is within model context window
allowed_fails: 3 # cooldown model if it fails > 1 call in a minute.
cooldown_time: 30 # (in seconds) how long to cooldown model if fails/min > allowed_fails
disable_cooldowns: True # bool - Disable cooldowns for all models
enable_tag_filtering: True # bool - Use tag based routing for requests
retry_policy: { # Dict[str, int]: retry policy for different types of exceptions
"AuthenticationErrorRetries": 3,
"TimeoutErrorRetries": 3,
"RateLimitErrorRetries": 3,
"ContentPolicyViolationErrorRetries": 4,
"InternalServerErrorRetries": 4
}
allowed_fails_policy: {
"BadRequestErrorAllowedFails": 1000, # Allow 1000 BadRequestErrors before cooling down a deployment
"AuthenticationErrorAllowedFails": 10, # int
"TimeoutErrorAllowedFails": 12, # int
"RateLimitErrorAllowedFails": 10000, # int
"ContentPolicyViolationErrorAllowedFails": 15, # int
"InternalServerErrorAllowedFails": 20, # int
}
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for content policy violations
fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for all errors
```
| Name | Type | Description |
|------|------|-------------|
| routing_strategy | string | The strategy used for routing requests. Options: "simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing". Default is "simple-shuffle". [More information here](../routing) |
| redis_host | string | The host address for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** |
| redis_password | string | The password for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** |
| redis_port | string | The port number for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them**|
| enable_pre_call_check | boolean | If true, checks if a call is within the model's context window before making the call. [More information here](reliability) |
| content_policy_fallbacks | array of objects | Specifies fallback models for content policy violations. [More information here](reliability) |
| fallbacks | array of objects | Specifies fallback models for all types of errors. [More information here](reliability) |
| enable_tag_filtering | boolean | If true, uses tag based routing for requests [Tag Based Routing](tag_routing) |
| cooldown_time | integer | The duration (in seconds) to cooldown a model if it exceeds the allowed failures. |
| disable_cooldowns | boolean | If true, disables cooldowns for all models. [More information here](reliability) |
| retry_policy | object | Specifies the number of retries for different types of exceptions. [More information here](reliability) |
| allowed_fails | integer | The number of failures allowed before cooling down a model. [More information here](reliability) |
| allowed_fails_policy | object | Specifies the number of allowed failures for different error types before cooling down a deployment. [More information here](reliability) |
| default_max_parallel_requests | Optional[int] | The default maximum number of parallel requests for a deployment. |
| default_priority | (Optional[int]) | The default priority for a request. Only for '.scheduler_acompletion()'. Default is None. |
| polling_interval | (Optional[float]) | frequency of polling queue. Only for '.scheduler_acompletion()'. Default is 3ms. |
| max_fallbacks | Optional[int] | The maximum number of fallbacks to try before exiting the call. Defaults to 5. |
| default_litellm_params | Optional[dict] | The default litellm parameters to add to all requests (e.g. `temperature`, `max_tokens`). |
| timeout | Optional[float] | The default timeout for a request. |
| debug_level | Literal["DEBUG", "INFO"] | The debug level for the logging library in the router. Defaults to "INFO". |
| client_ttl | int | Time-to-live for cached clients in seconds. Defaults to 3600. |
| cache_kwargs | dict | Additional keyword arguments for the cache initialization. |
| routing_strategy_args | dict | Additional keyword arguments for the routing strategy - e.g. lowest latency routing default ttl |
| model_group_alias | dict | Model group alias mapping. E.g. `{"claude-3-haiku": "claude-3-haiku-20240229"}` |
| num_retries | int | Number of retries for a request. Defaults to 3. |
| default_fallbacks | Optional[List[str]] | Fallbacks to try if no model group-specific fallbacks are defined. |
| caching_groups | Optional[List[tuple]] | List of model groups for caching across model groups. Defaults to None. - e.g. caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")]|
| alerting_config | AlertingConfig | [SDK-only arg] Slack alerting configuration. Defaults to None. [Further Docs](../routing.md#alerting-) |
| assistants_config | AssistantsConfig | Set on proxy via `assistant_settings`. [Further docs](../assistants.md) |
| set_verbose | boolean | [DEPRECATED PARAM - see debug docs](./debugging.md) If true, sets the logging level to verbose. |
| retry_after | int | Time to wait before retrying a request in seconds. Defaults to 0. If `x-retry-after` is received from LLM API, this value is overridden. |
| provider_budget_config | ProviderBudgetConfig | Provider budget configuration. Use this to set llm_provider budget limits. example $100/day to OpenAI, $100/day to Azure, etc. Defaults to None. [Further Docs](./provider_budget_routing.md) |
| enable_pre_call_checks | boolean | If true, checks if a call is within the model's context window before making the call. [More information here](reliability) |
| model_group_retry_policy | Dict[str, RetryPolicy] | [SDK-only arg] Set retry policy for model groups. |
| context_window_fallbacks | List[Dict[str, List[str]]] | Fallback models for context window violations. |
| redis_url | str | URL for Redis server. **Known performance issue with Redis URL.** |
| cache_responses | boolean | Flag to enable caching LLM Responses, if cache set under `router_settings`. If true, caches responses. Defaults to False. |
| router_general_settings | RouterGeneralSettings | [SDK-Only] Router general settings - contains optimizations like 'async_only_mode'. [Docs](../routing.md#router-general-settings) |
### environment variables - Reference
| Name | Description |
|------|-------------|
| ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token for requesting ID in GitHub Actions
| ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions
| AISPEND_ACCOUNT_ID | Account ID for AI Spend
| AISPEND_API_KEY | API Key for AI Spend
| ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access
| ARIZE_API_KEY | API key for Arize platform integration
| ARIZE_SPACE_KEY | Space key for Arize platform
| ARGILLA_BATCH_SIZE | Batch size for Argilla logging
| ARGILLA_API_KEY | API key for Argilla platform
| ARGILLA_SAMPLING_RATE | Sampling rate for Argilla logging
| ARGILLA_DATASET_NAME | Dataset name for Argilla logging
| ARGILLA_BASE_URL | Base URL for Argilla service
| ATHINA_API_KEY | API key for Athina service
| AUTH_STRATEGY | Strategy used for authentication (e.g., OAuth, API key)
| AWS_ACCESS_KEY_ID | Access Key ID for AWS services
| AWS_PROFILE_NAME | AWS CLI profile name to be used
| AWS_REGION_NAME | Default AWS region for service interactions
| AWS_ROLE_NAME | Role name for AWS IAM usage
| AWS_SECRET_ACCESS_KEY | Secret Access Key for AWS services
| AWS_SESSION_NAME | Name for AWS session
| AWS_WEB_IDENTITY_TOKEN | Web identity token for AWS
| AZURE_API_VERSION | Version of the Azure API being used
| AZURE_AUTHORITY_HOST | Azure authority host URL
| AZURE_CLIENT_ID | Client ID for Azure services
| AZURE_CLIENT_SECRET | Client secret for Azure services
| AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token
| AZURE_KEY_VAULT_URI | URI for Azure Key Vault
| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
| BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service
| BRAINTRUST_API_KEY | API key for Braintrust integration
| CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI
| CIRCLE_OIDC_TOKEN_V2 | Version 2 of the OpenID Connect token for CircleCI
| CONFIG_FILE_PATH | File path for configuration file
| CUSTOM_TIKTOKEN_CACHE_DIR | Custom directory for Tiktoken cache
| DATABASE_HOST | Hostname for the database server
| DATABASE_NAME | Name of the database
| DATABASE_PASSWORD | Password for the database user
| DATABASE_PORT | Port number for database connection
| DATABASE_SCHEMA | Schema name used in the database
| DATABASE_URL | Connection URL for the database
| DATABASE_USER | Username for database connection
| DATABASE_USERNAME | Alias for database user
| DATABRICKS_API_BASE | Base URL for Databricks API
| DD_BASE_URL | Base URL for Datadog integration
| DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration
| _DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration
| DD_API_KEY | API key for Datadog integration
| DD_SITE | Site URL for Datadog (e.g., datadoghq.com)
| DD_SOURCE | Source identifier for Datadog logs
| DD_ENV | Environment identifier for Datadog logs. Only supported for `datadog_llm_observability` callback
| DD_SERVICE | Service identifier for Datadog logs. Defaults to "litellm-server"
| DD_VERSION | Version identifier for Datadog logs. Defaults to "unknown"
| DEBUG_OTEL | Enable debug mode for OpenTelemetry
| DIRECT_URL | Direct URL for service endpoint
| DISABLE_ADMIN_UI | Toggle to disable the admin UI
| DISABLE_SCHEMA_UPDATE | Toggle to disable schema updates
| DOCS_DESCRIPTION | Description text for documentation pages
| DOCS_FILTERED | Flag indicating filtered documentation
| DOCS_TITLE | Title of the documentation pages
| DOCS_URL | The path to the Swagger API documentation. **By default this is "/"**
| EMAIL_SUPPORT_CONTACT | Support contact email address
| GCS_BUCKET_NAME | Name of the Google Cloud Storage bucket
| GCS_PATH_SERVICE_ACCOUNT | Path to the Google Cloud service account JSON file
| GCS_FLUSH_INTERVAL | Flush interval for GCS logging (in seconds). Specify how often you want a log to be sent to GCS. **Default is 20 seconds**
| GCS_BATCH_SIZE | Batch size for GCS logging. Specify after how many logs you want to flush to GCS. If `BATCH_SIZE` is set to 10, logs are flushed every 10 logs. **Default is 2048**
| GENERIC_AUTHORIZATION_ENDPOINT | Authorization endpoint for generic OAuth providers
| GENERIC_CLIENT_ID | Client ID for generic OAuth providers
| GENERIC_CLIENT_SECRET | Client secret for generic OAuth providers
| GENERIC_CLIENT_STATE | State parameter for generic client authentication
| GENERIC_INCLUDE_CLIENT_ID | Include client ID in requests for OAuth
| GENERIC_SCOPE | Scope settings for generic OAuth providers
| GENERIC_TOKEN_ENDPOINT | Token endpoint for generic OAuth providers
| GENERIC_USER_DISPLAY_NAME_ATTRIBUTE | Attribute for user's display name in generic auth
| GENERIC_USER_EMAIL_ATTRIBUTE | Attribute for user's email in generic auth
| GENERIC_USER_FIRST_NAME_ATTRIBUTE | Attribute for user's first name in generic auth
| GENERIC_USER_ID_ATTRIBUTE | Attribute for user ID in generic auth
| GENERIC_USER_LAST_NAME_ATTRIBUTE | Attribute for user's last name in generic auth
| GENERIC_USER_PROVIDER_ATTRIBUTE | Attribute specifying the user's provider
| GENERIC_USER_ROLE_ATTRIBUTE | Attribute specifying the user's role
| GENERIC_USERINFO_ENDPOINT | Endpoint to fetch user information in generic OAuth
| GALILEO_BASE_URL | Base URL for Galileo platform
| GALILEO_PASSWORD | Password for Galileo authentication
| GALILEO_PROJECT_ID | Project ID for Galileo usage
| GALILEO_USERNAME | Username for Galileo authentication
| GREENSCALE_API_KEY | API key for Greenscale service
| GREENSCALE_ENDPOINT | Endpoint URL for Greenscale service
| GOOGLE_APPLICATION_CREDENTIALS | Path to Google Cloud credentials JSON file
| GOOGLE_CLIENT_ID | Client ID for Google OAuth
| GOOGLE_CLIENT_SECRET | Client secret for Google OAuth
| GOOGLE_KMS_RESOURCE_NAME | Name of the resource in Google KMS
| HF_API_BASE | Base URL for Hugging Face API
| HELICONE_API_KEY | API key for Helicone service
| HUGGINGFACE_API_BASE | Base URL for Hugging Face API
| IAM_TOKEN_DB_AUTH | IAM token for database authentication
| JSON_LOGS | Enable JSON formatted logging
| JWT_AUDIENCE | Expected audience for JWT tokens
| JWT_PUBLIC_KEY_URL | URL to fetch public key for JWT verification
| LAGO_API_BASE | Base URL for Lago API
| LAGO_API_CHARGE_BY | Parameter to determine charge basis in Lago
| LAGO_API_EVENT_CODE | Event code for Lago API events
| LAGO_API_KEY | API key for accessing Lago services
| LANGFUSE_DEBUG | Toggle debug mode for Langfuse
| LANGFUSE_FLUSH_INTERVAL | Interval for flushing Langfuse logs
| LANGFUSE_HOST | Host URL for Langfuse service
| LANGFUSE_PUBLIC_KEY | Public key for Langfuse authentication
| LANGFUSE_RELEASE | Release version of Langfuse integration
| LANGFUSE_SECRET_KEY | Secret key for Langfuse authentication
| LANGSMITH_API_KEY | API key for Langsmith platform
| LANGSMITH_BASE_URL | Base URL for Langsmith service
| LANGSMITH_BATCH_SIZE | Batch size for operations in Langsmith
| LANGSMITH_DEFAULT_RUN_NAME | Default name for Langsmith run
| LANGSMITH_PROJECT | Project name for Langsmith integration
| LANGSMITH_SAMPLING_RATE | Sampling rate for Langsmith logging
| LANGTRACE_API_KEY | API key for Langtrace service
| LITERAL_API_KEY | API key for Literal integration
| LITERAL_API_URL | API URL for Literal service
| LITERAL_BATCH_SIZE | Batch size for Literal operations
| LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI
| LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests
| LITELLM_EMAIL | Email associated with LiteLLM account
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM
| LITELLM_HOSTED_UI | URL of the hosted UI for LiteLLM
| LITELLM_LICENSE | License key for LiteLLM usage
| LITELLM_LOCAL_MODEL_COST_MAP | Local configuration for model cost mapping in LiteLLM
| LITELLM_LOG | Enable detailed logging for LiteLLM
| LITELLM_MODE | Operating mode for LiteLLM (e.g., production, development)
| LITELLM_SALT_KEY | Salt key for encryption in LiteLLM
| LITELLM_SECRET_AWS_KMS_LITELLM_LICENSE | AWS KMS encrypted license for LiteLLM
| LITELLM_TOKEN | Access token for LiteLLM integration
| LOGFIRE_TOKEN | Token for Logfire logging service
| MICROSOFT_CLIENT_ID | Client ID for Microsoft services
| MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services
| MICROSOFT_TENANT | Tenant ID for Microsoft Azure
| NO_DOCS | Flag to disable documentation generation
| NO_PROXY | List of addresses to bypass proxy
| OAUTH_TOKEN_INFO_ENDPOINT | Endpoint for OAuth token info retrieval
| OPENAI_API_BASE | Base URL for OpenAI API
| OPENAI_API_KEY | API key for OpenAI services
| OPENAI_ORGANIZATION | Organization identifier for OpenAI
| OPENID_BASE_URL | Base URL for OpenID Connect services
| OPENID_CLIENT_ID | Client ID for OpenID Connect authentication
| OPENID_CLIENT_SECRET | Client secret for OpenID Connect authentication
| OPENMETER_API_ENDPOINT | API endpoint for OpenMeter integration
| OPENMETER_API_KEY | API key for OpenMeter services
| OPENMETER_EVENT_TYPE | Type of events sent to OpenMeter
| OTEL_ENDPOINT | OpenTelemetry endpoint for traces
| OTEL_ENVIRONMENT_NAME | Environment name for OpenTelemetry
| OTEL_EXPORTER | Exporter type for OpenTelemetry
| OTEL_HEADERS | Headers for OpenTelemetry requests
| OTEL_SERVICE_NAME | Service name identifier for OpenTelemetry
| OTEL_TRACER_NAME | Tracer name for OpenTelemetry tracing
| PREDIBASE_API_BASE | Base URL for Predibase API
| PRESIDIO_ANALYZER_API_BASE | Base URL for Presidio Analyzer service
| PRESIDIO_ANONYMIZER_API_BASE | Base URL for Presidio Anonymizer service
| PROMETHEUS_URL | URL for Prometheus service
| PROMPTLAYER_API_KEY | API key for PromptLayer integration
| PROXY_ADMIN_ID | Admin identifier for proxy server
| PROXY_BASE_URL | Base URL for proxy service
| PROXY_LOGOUT_URL | URL for logging out of the proxy service
| PROXY_MASTER_KEY | Master key for proxy authentication
| QDRANT_API_BASE | Base URL for Qdrant API
| QDRANT_API_KEY | API key for Qdrant service
| QDRANT_URL | Connection URL for Qdrant database
| REDIS_HOST | Hostname for Redis server
| REDIS_PASSWORD | Password for Redis service
| REDIS_PORT | Port number for Redis server
| REDOC_URL | The path to the Redoc Fast API documentation. **By default this is "/redoc"**
| SERVER_ROOT_PATH | Root path for the server application
| SET_VERBOSE | Flag to enable verbose logging
| SLACK_DAILY_REPORT_FREQUENCY | Frequency of daily Slack reports (e.g., daily, weekly)
| SLACK_WEBHOOK_URL | Webhook URL for Slack integration
| SMTP_HOST | Hostname for the SMTP server
| SMTP_PASSWORD | Password for SMTP authentication
| SMTP_PORT | Port number for SMTP server
| SMTP_SENDER_EMAIL | Email address used as the sender in SMTP transactions
| SMTP_SENDER_LOGO | Logo used in emails sent via SMTP
| SMTP_TLS | Flag to enable or disable TLS for SMTP connections
| SMTP_USERNAME | Username for SMTP authentication
| SPEND_LOGS_URL | URL for retrieving spend logs
| SSL_CERTIFICATE | Path to the SSL certificate file
| SSL_VERIFY | Flag to enable or disable SSL certificate verification
| SUPABASE_KEY | API key for Supabase service
| SUPABASE_URL | Base URL for Supabase instance
| TEST_EMAIL_ADDRESS | Email address used for testing purposes
| UI_LOGO_PATH | Path to the logo image used in the UI
| UI_PASSWORD | Password for accessing the UI
| UI_USERNAME | Username for accessing the UI
| UPSTREAM_LANGFUSE_DEBUG | Flag to enable debugging for upstream Langfuse
| UPSTREAM_LANGFUSE_HOST | Host URL for upstream Langfuse service
| UPSTREAM_LANGFUSE_PUBLIC_KEY | Public key for upstream Langfuse authentication
| UPSTREAM_LANGFUSE_RELEASE | Release version identifier for upstream Langfuse
| UPSTREAM_LANGFUSE_SECRET_KEY | Secret key for upstream Langfuse authentication
| USE_AWS_KMS | Flag to enable AWS Key Management Service for encryption
| WEBHOOK_URL | URL for receiving webhooks from external services

View file

@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Overview
# Proxy Config.yaml
Set model list, `api_base`, `api_key`, `temperature` & proxy server settings (`master-key`) on the config.yaml.
| Param Name | Description |
@ -357,6 +357,77 @@ curl --location 'http://0.0.0.0:4000/v1/model/info' \
--data ''
```
### Provider specific wildcard routing
**Proxy all models from a provider**
Use this if you want to **proxy all models from a specific provider without defining them on the config.yaml**
**Step 1** - define provider specific routing on config.yaml
```yaml
model_list:
# provider specific wildcard routing
- model_name: "anthropic/*"
litellm_params:
model: "anthropic/*"
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: "groq/*"
litellm_params:
model: "groq/*"
api_key: os.environ/GROQ_API_KEY
- model_name: "fo::*:static::*" # all requests matching this pattern will be routed to this deployment, example: model="fo::hi::static::hi" will be routed to deployment: "openai/fo::*:static::*"
litellm_params:
model: "openai/fo::*:static::*"
api_key: os.environ/OPENAI_API_KEY
```
Step 2 - Run litellm proxy
```shell
$ litellm --config /path/to/config.yaml
```
Step 3 Test it
Test with `anthropic/` - all models with `anthropic/` prefix will get routed to `anthropic/*`
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "anthropic/claude-3-sonnet-20240229",
"messages": [
{"role": "user", "content": "Hello, Claude!"}
]
}'
```
Test with `groq/` - all models with `groq/` prefix will get routed to `groq/*`
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "groq/llama3-8b-8192",
"messages": [
{"role": "user", "content": "Hello, Claude!"}
]
}'
```
Test with `fo::*::static::*` - all requests matching this pattern will be routed to `openai/fo::*:static::*`
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "fo::hi::static::hi",
"messages": [
{"role": "user", "content": "Hello, Claude!"}
]
}'
```
### Load Balancing
:::info
@ -526,6 +597,472 @@ general_settings:
database_connection_timeout: 60 # sets a 60s timeout for any connection call to the db
```
## **All settings**
```yaml
environment_variables: {}
model_list:
- model_name: string
litellm_params: {}
model_info:
id: string
mode: embedding
input_cost_per_token: 0
output_cost_per_token: 0
max_tokens: 2048
base_model: gpt-4-1106-preview
additionalProp1: {}
litellm_settings:
# Logging/Callback settings
success_callback: ["langfuse"] # list of success callbacks
failure_callback: ["sentry"] # list of failure callbacks
callbacks: ["otel"] # list of callbacks - runs on success and failure
service_callbacks: ["datadog", "prometheus"] # logs redis, postgres failures on datadog, prometheus
turn_off_message_logging: boolean # prevent the messages and responses from being logged to on your callbacks, but request metadata will still be logged.
redact_user_api_key_info: boolean # Redact information about the user api key (hashed token, user_id, team id, etc.), from logs. Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging.
langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"] # default tags for Langfuse Logging
request_timeout: 10 # (int) llm requesttimeout in seconds. Raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
set_verbose: boolean # sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION
json_logs: boolean # if true, logs will be in json format
# Fallbacks, reliability
default_fallbacks: ["claude-opus"] # set default_fallbacks, in case a specific model group is misconfigured / bad.
content_policy_fallbacks: [{"gpt-3.5-turbo-small": ["claude-opus"]}] # fallbacks for ContentPolicyErrors
context_window_fallbacks: [{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}] # fallbacks for ContextWindowExceededErrors
# Caching settings
cache: true
cache_params: # set cache params for redis
type: redis # type of cache to initialize
# Optional - Redis Settings
host: "localhost" # The host address for the Redis cache. Required if type is "redis".
port: 6379 # The port number for the Redis cache. Required if type is "redis".
password: "your_password" # The password for the Redis cache. Required if type is "redis".
namespace: "litellm.caching.caching" # namespace for redis cache
# Optional - Redis Cluster Settings
redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}]
# Optional - Redis Sentinel Settings
service_name: "mymaster"
sentinel_nodes: [["localhost", 26379]]
# Optional - Qdrant Semantic Cache Settings
qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
qdrant_collection_name: test_collection
qdrant_quantization_config: binary
similarity_threshold: 0.8 # similarity threshold for semantic cache
# Optional - S3 Cache Settings
s3_bucket_name: cache-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 bucket
# Common Cache settings
# Optional - Supported call types for caching
supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
mode: default_off # if default_off, you need to opt in to caching on a per call basis
ttl: 600 # ttl for caching
callback_settings:
otel:
message_logging: boolean # OTEL logging callback specific settings
general_settings:
completion_model: string
disable_spend_logs: boolean # turn off writing each transaction to the db
disable_master_key_return: boolean # turn off returning master key on UI (checked on '/user/info' endpoint)
disable_retry_on_max_parallel_request_limit_error: boolean # turn off retries when max parallel request limit is reached
disable_reset_budget: boolean # turn off reset budget scheduled task
disable_adding_master_key_hash_to_db: boolean # turn off storing master key hash in db, for spend tracking
enable_jwt_auth: boolean # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
enforce_user_param: boolean # requires all openai endpoint requests to have a 'user' param
allowed_routes: ["route1", "route2"] # list of allowed proxy API routes - a user can access. (currently JWT-Auth only)
key_management_system: google_kms # either google_kms or azure_kms
master_key: string
# Database Settings
database_url: string
database_connection_pool_limit: 0 # default 100
database_connection_timeout: 0 # default 60s
allow_requests_on_db_unavailable: boolean # if true, will allow requests that can not connect to the DB to verify Virtual Key to still work
custom_auth: string
max_parallel_requests: 0 # the max parallel requests allowed per deployment
global_max_parallel_requests: 0 # the max parallel requests allowed on the proxy all up
infer_model_from_keys: true
background_health_checks: true
health_check_interval: 300
alerting: ["slack", "email"]
alerting_threshold: 0
use_client_credentials_pass_through_routes: boolean # use client credentials for all pass through routes like "/vertex-ai", /bedrock/. When this is True Virtual Key auth will not be applied on these endpoints
```
### litellm_settings - Reference
| Name | Type | Description |
|------|------|-------------|
| success_callback | array of strings | List of success callbacks. [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
| failure_callback | array of strings | List of failure callbacks [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
| callbacks | array of strings | List of callbacks - runs on success and failure [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
| service_callbacks | array of strings | System health monitoring - Logs redis, postgres failures on specified services (e.g. datadog, prometheus) [Doc Metrics](prometheus) |
| turn_off_message_logging | boolean | If true, prevents messages and responses from being logged to callbacks, but request metadata will still be logged [Proxy Logging](logging) |
| modify_params | boolean | If true, allows modifying the parameters of the request before it is sent to the LLM provider |
| enable_preview_features | boolean | If true, enables preview features - e.g. Azure O1 Models with streaming support.|
| redact_user_api_key_info | boolean | If true, redacts information about the user api key from logs [Proxy Logging](logging#redacting-userapikeyinfo) |
| langfuse_default_tags | array of strings | Default tags for Langfuse Logging. Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields as tags. [Further docs](./logging#litellm-specific-tags-on-langfuse---cache_hit-cache_key) |
| set_verbose | boolean | If true, sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION |
| json_logs | boolean | If true, logs will be in json format. If you need to store the logs as JSON, just set the `litellm.json_logs = True`. We currently just log the raw POST request from litellm as a JSON [Further docs](./debugging) |
| default_fallbacks | array of strings | List of fallback models to use if a specific model group is misconfigured / bad. [Further docs](./reliability#default-fallbacks) |
| request_timeout | integer | The timeout for requests in seconds. If not set, the default value is `6000 seconds`. [For reference OpenAI Python SDK defaults to `600 seconds`.](https://github.com/openai/openai-python/blob/main/src/openai/_constants.py) |
| content_policy_fallbacks | array of objects | Fallbacks to use when a ContentPolicyViolationError is encountered. [Further docs](./reliability#content-policy-fallbacks) |
| context_window_fallbacks | array of objects | Fallbacks to use when a ContextWindowExceededError is encountered. [Further docs](./reliability#context-window-fallbacks) |
| cache | boolean | If true, enables caching. [Further docs](./caching) |
| cache_params | object | Parameters for the cache. [Further docs](./caching) |
| cache_params.type | string | The type of cache to initialize. Can be one of ["local", "redis", "redis-semantic", "s3", "disk", "qdrant-semantic"]. Defaults to "redis". [Furher docs](./caching) |
| cache_params.host | string | The host address for the Redis cache. Required if type is "redis". |
| cache_params.port | integer | The port number for the Redis cache. Required if type is "redis". |
| cache_params.password | string | The password for the Redis cache. Required if type is "redis". |
| cache_params.namespace | string | The namespace for the Redis cache. |
| cache_params.redis_startup_nodes | array of objects | Redis Cluster Settings. [Further docs](./caching) |
| cache_params.service_name | string | Redis Sentinel Settings. [Further docs](./caching) |
| cache_params.sentinel_nodes | array of arrays | Redis Sentinel Settings. [Further docs](./caching) |
| cache_params.ttl | integer | The time (in seconds) to store entries in cache. |
| cache_params.qdrant_semantic_cache_embedding_model | string | The embedding model to use for qdrant semantic cache. |
| cache_params.qdrant_collection_name | string | The name of the collection to use for qdrant semantic cache. |
| cache_params.qdrant_quantization_config | string | The quantization configuration for the qdrant semantic cache. |
| cache_params.similarity_threshold | float | The similarity threshold for the semantic cache. |
| cache_params.s3_bucket_name | string | The name of the S3 bucket to use for the semantic cache. |
| cache_params.s3_region_name | string | The region name for the S3 bucket. |
| cache_params.s3_aws_access_key_id | string | The AWS access key ID for the S3 bucket. |
| cache_params.s3_aws_secret_access_key | string | The AWS secret access key for the S3 bucket. |
| cache_params.s3_endpoint_url | string | Optional - The endpoint URL for the S3 bucket. |
| cache_params.supported_call_types | array of strings | The types of calls to cache. [Further docs](./caching) |
| cache_params.mode | string | The mode of the cache. [Further docs](./caching) |
### general_settings - Reference
| Name | Type | Description |
|------|------|-------------|
| completion_model | string | The default model to use for completions when `model` is not specified in the request |
| disable_spend_logs | boolean | If true, turns off writing each transaction to the database |
| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) |
| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached |
| disable_reset_budget | boolean | If true, turns off reset budget scheduled task |
| disable_adding_master_key_hash_to_db | boolean | If true, turns off storing master key hash in db |
| enable_jwt_auth | boolean | allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims. [Doc on JWT Tokens](token_auth) |
| enforce_user_param | boolean | If true, requires all OpenAI endpoint requests to have a 'user' param. [Doc on call hooks](call_hooks)|
| allowed_routes | array of strings | List of allowed proxy API routes a user can access [Doc on controlling allowed routes](enterprise#control-available-public-private-routes)|
| key_management_system | string | Specifies the key management system. [Doc Secret Managers](../secret) |
| master_key | string | The master key for the proxy [Set up Virtual Keys](virtual_keys) |
| database_url | string | The URL for the database connection [Set up Virtual Keys](virtual_keys) |
| database_connection_pool_limit | integer | The limit for database connection pool [Setting DB Connection Pool limit](#configure-db-pool-limits--connection-timeouts) |
| database_connection_timeout | integer | The timeout for database connections in seconds [Setting DB Connection Pool limit, timeout](#configure-db-pool-limits--connection-timeouts) |
| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key |
| custom_auth | string | Write your own custom authentication logic [Doc Custom Auth](virtual_keys#custom-auth) |
| max_parallel_requests | integer | The max parallel requests allowed per deployment |
| global_max_parallel_requests | integer | The max parallel requests allowed on the proxy overall |
| infer_model_from_keys | boolean | If true, infers the model from the provided keys |
| background_health_checks | boolean | If true, enables background health checks. [Doc on health checks](health) |
| health_check_interval | integer | The interval for health checks in seconds [Doc on health checks](health) |
| alerting | array of strings | List of alerting methods [Doc on Slack Alerting](alerting) |
| alerting_threshold | integer | The threshold for triggering alerts [Doc on Slack Alerting](alerting) |
| use_client_credentials_pass_through_routes | boolean | If true, uses client credentials for all pass-through routes. [Doc on pass through routes](pass_through) |
| health_check_details | boolean | If false, hides health check details (e.g. remaining rate limit). [Doc on health checks](health) |
| public_routes | List[str] | (Enterprise Feature) Control list of public routes |
| alert_types | List[str] | Control list of alert types to send to slack (Doc on alert types)[./alerting.md] |
| enforced_params | List[str] | (Enterprise Feature) List of params that must be included in all requests to the proxy |
| enable_oauth2_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
| use_x_forwarded_for | str | If true, uses the X-Forwarded-For header to get the client IP address |
| service_account_settings | List[Dict[str, Any]] | Set `service_account_settings` if you want to create settings that only apply to service account keys (Doc on service accounts)[./service_accounts.md] |
| image_generation_model | str | The default model to use for image generation - ignores model set in request |
| store_model_in_db | boolean | If true, allows `/model/new` endpoint to store model information in db. Endpoint disabled by default. [Doc on `/model/new` endpoint](./model_management.md#create-a-new-model) |
| max_request_size_mb | int | The maximum size for requests in MB. Requests above this size will be rejected. |
| max_response_size_mb | int | The maximum size for responses in MB. LLM Responses above this size will not be sent. |
| proxy_budget_rescheduler_min_time | int | The minimum time (in seconds) to wait before checking db for budget resets. |
| proxy_budget_rescheduler_max_time | int | The maximum time (in seconds) to wait before checking db for budget resets. |
| proxy_batch_write_at | int | Time (in seconds) to wait before batch writing spend logs to the db. |
| alerting_args | dict | Args for Slack Alerting [Doc on Slack Alerting](./alerting.md) |
| custom_key_generate | str | Custom function for key generation [Doc on custom key generation](./virtual_keys.md#custom--key-generate) |
| allowed_ips | List[str] | List of IPs allowed to access the proxy. If not set, all IPs are allowed. |
| embedding_model | str | The default model to use for embeddings - ignores model set in request |
| default_team_disabled | boolean | If true, users cannot create 'personal' keys (keys with no team_id). |
| alert_to_webhook_url | Dict[str] | [Specify a webhook url for each alert type.](./alerting.md#set-specific-slack-channels-per-alert-type) |
| key_management_settings | List[Dict[str, Any]] | Settings for key management system (e.g. AWS KMS, Azure Key Vault) [Doc on key management](../secret.md) |
| allow_user_auth | boolean | (Deprecated) old approach for user authentication. |
| user_api_key_cache_ttl | int | The time (in seconds) to cache user api keys in memory. |
| disable_prisma_schema_update | boolean | If true, turns off automatic schema updates to DB |
| litellm_key_header_name | str | If set, allows passing LiteLLM keys as a custom header. [Doc on custom headers](./virtual_keys.md#custom-headers) |
| moderation_model | str | The default model to use for moderation. |
| custom_sso | str | Path to a python file that implements custom SSO logic. [Doc on custom SSO](./custom_sso.md) |
| allow_client_side_credentials | boolean | If true, allows passing client side credentials to the proxy. (Useful when testing finetuning models) [Doc on client side credentials](./virtual_keys.md#client-side-credentials) |
| admin_only_routes | List[str] | (Enterprise Feature) List of routes that are only accessible to admin users. [Doc on admin only routes](./enterprise#control-available-public-private-routes) |
| use_azure_key_vault | boolean | If true, load keys from azure key vault |
| use_google_kms | boolean | If true, load keys from google kms |
| spend_report_frequency | str | Specify how often you want a Spend Report to be sent (e.g. "1d", "2d", "30d") [More on this](./alerting.md#spend-report-frequency) |
| ui_access_mode | Literal["admin_only"] | If set, restricts access to the UI to admin users only. [Docs](./ui.md#restrict-ui-access) |
| litellm_jwtauth | Dict[str, Any] | Settings for JWT authentication. [Docs](./token_auth.md) |
| litellm_license | str | The license key for the proxy. [Docs](../enterprise.md#how-does-deployment-with-enterprise-license-work) |
| oauth2_config_mappings | Dict[str, str] | Define the OAuth2 config mappings |
| pass_through_endpoints | List[Dict[str, Any]] | Define the pass through endpoints. [Docs](./pass_through) |
| enable_oauth2_proxy_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
| forward_openai_org_id | boolean | If true, forwards the OpenAI Organization ID to the backend LLM call (if it's OpenAI). |
| forward_client_headers_to_llm_api | boolean | If true, forwards the client headers (any `x-` headers) to the backend LLM call |
### router_settings - Reference
```yaml
router_settings:
routing_strategy: usage-based-routing-v2 # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
redis_host: <your-redis-host> # string
redis_password: <your-redis-password> # string
redis_port: <your-redis-port> # string
enable_pre_call_check: true # bool - Before call is made check if a call is within model context window
allowed_fails: 3 # cooldown model if it fails > 1 call in a minute.
cooldown_time: 30 # (in seconds) how long to cooldown model if fails/min > allowed_fails
disable_cooldowns: True # bool - Disable cooldowns for all models
enable_tag_filtering: True # bool - Use tag based routing for requests
retry_policy: { # Dict[str, int]: retry policy for different types of exceptions
"AuthenticationErrorRetries": 3,
"TimeoutErrorRetries": 3,
"RateLimitErrorRetries": 3,
"ContentPolicyViolationErrorRetries": 4,
"InternalServerErrorRetries": 4
}
allowed_fails_policy: {
"BadRequestErrorAllowedFails": 1000, # Allow 1000 BadRequestErrors before cooling down a deployment
"AuthenticationErrorAllowedFails": 10, # int
"TimeoutErrorAllowedFails": 12, # int
"RateLimitErrorAllowedFails": 10000, # int
"ContentPolicyViolationErrorAllowedFails": 15, # int
"InternalServerErrorAllowedFails": 20, # int
}
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for content policy violations
fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for all errors
```
| Name | Type | Description |
|------|------|-------------|
| routing_strategy | string | The strategy used for routing requests. Options: "simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing". Default is "simple-shuffle". [More information here](../routing) |
| redis_host | string | The host address for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** |
| redis_password | string | The password for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** |
| redis_port | string | The port number for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them**|
| enable_pre_call_check | boolean | If true, checks if a call is within the model's context window before making the call. [More information here](reliability) |
| content_policy_fallbacks | array of objects | Specifies fallback models for content policy violations. [More information here](reliability) |
| fallbacks | array of objects | Specifies fallback models for all types of errors. [More information here](reliability) |
| enable_tag_filtering | boolean | If true, uses tag based routing for requests [Tag Based Routing](tag_routing) |
| cooldown_time | integer | The duration (in seconds) to cooldown a model if it exceeds the allowed failures. |
| disable_cooldowns | boolean | If true, disables cooldowns for all models. [More information here](reliability) |
| retry_policy | object | Specifies the number of retries for different types of exceptions. [More information here](reliability) |
| allowed_fails | integer | The number of failures allowed before cooling down a model. [More information here](reliability) |
| allowed_fails_policy | object | Specifies the number of allowed failures for different error types before cooling down a deployment. [More information here](reliability) |
### environment variables - Reference
| Name | Description |
|------|-------------|
| ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token for requesting ID in GitHub Actions
| ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions
| AISPEND_ACCOUNT_ID | Account ID for AI Spend
| AISPEND_API_KEY | API Key for AI Spend
| ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access
| ARIZE_API_KEY | API key for Arize platform integration
| ARIZE_SPACE_KEY | Space key for Arize platform
| ARGILLA_BATCH_SIZE | Batch size for Argilla logging
| ARGILLA_API_KEY | API key for Argilla platform
| ARGILLA_SAMPLING_RATE | Sampling rate for Argilla logging
| ARGILLA_DATASET_NAME | Dataset name for Argilla logging
| ARGILLA_BASE_URL | Base URL for Argilla service
| ATHINA_API_KEY | API key for Athina service
| AUTH_STRATEGY | Strategy used for authentication (e.g., OAuth, API key)
| AWS_ACCESS_KEY_ID | Access Key ID for AWS services
| AWS_PROFILE_NAME | AWS CLI profile name to be used
| AWS_REGION_NAME | Default AWS region for service interactions
| AWS_ROLE_NAME | Role name for AWS IAM usage
| AWS_SECRET_ACCESS_KEY | Secret Access Key for AWS services
| AWS_SESSION_NAME | Name for AWS session
| AWS_WEB_IDENTITY_TOKEN | Web identity token for AWS
| AZURE_API_VERSION | Version of the Azure API being used
| AZURE_AUTHORITY_HOST | Azure authority host URL
| AZURE_CLIENT_ID | Client ID for Azure services
| AZURE_CLIENT_SECRET | Client secret for Azure services
| AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token
| AZURE_KEY_VAULT_URI | URI for Azure Key Vault
| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
| BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service
| BRAINTRUST_API_KEY | API key for Braintrust integration
| CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI
| CIRCLE_OIDC_TOKEN_V2 | Version 2 of the OpenID Connect token for CircleCI
| CONFIG_FILE_PATH | File path for configuration file
| CUSTOM_TIKTOKEN_CACHE_DIR | Custom directory for Tiktoken cache
| DATABASE_HOST | Hostname for the database server
| DATABASE_NAME | Name of the database
| DATABASE_PASSWORD | Password for the database user
| DATABASE_PORT | Port number for database connection
| DATABASE_SCHEMA | Schema name used in the database
| DATABASE_URL | Connection URL for the database
| DATABASE_USER | Username for database connection
| DATABASE_USERNAME | Alias for database user
| DATABRICKS_API_BASE | Base URL for Databricks API
| DD_BASE_URL | Base URL for Datadog integration
| DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration
| _DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration
| DD_API_KEY | API key for Datadog integration
| DD_SITE | Site URL for Datadog (e.g., datadoghq.com)
| DD_SOURCE | Source identifier for Datadog logs
| DD_ENV | Environment identifier for Datadog logs. Only supported for `datadog_llm_observability` callback
| DEBUG_OTEL | Enable debug mode for OpenTelemetry
| DIRECT_URL | Direct URL for service endpoint
| DISABLE_ADMIN_UI | Toggle to disable the admin UI
| DISABLE_SCHEMA_UPDATE | Toggle to disable schema updates
| DOCS_DESCRIPTION | Description text for documentation pages
| DOCS_FILTERED | Flag indicating filtered documentation
| DOCS_TITLE | Title of the documentation pages
| EMAIL_SUPPORT_CONTACT | Support contact email address
| GCS_BUCKET_NAME | Name of the Google Cloud Storage bucket
| GCS_PATH_SERVICE_ACCOUNT | Path to the Google Cloud service account JSON file
| GENERIC_AUTHORIZATION_ENDPOINT | Authorization endpoint for generic OAuth providers
| GENERIC_CLIENT_ID | Client ID for generic OAuth providers
| GENERIC_CLIENT_SECRET | Client secret for generic OAuth providers
| GENERIC_CLIENT_STATE | State parameter for generic client authentication
| GENERIC_INCLUDE_CLIENT_ID | Include client ID in requests for OAuth
| GENERIC_SCOPE | Scope settings for generic OAuth providers
| GENERIC_TOKEN_ENDPOINT | Token endpoint for generic OAuth providers
| GENERIC_USER_DISPLAY_NAME_ATTRIBUTE | Attribute for user's display name in generic auth
| GENERIC_USER_EMAIL_ATTRIBUTE | Attribute for user's email in generic auth
| GENERIC_USER_FIRST_NAME_ATTRIBUTE | Attribute for user's first name in generic auth
| GENERIC_USER_ID_ATTRIBUTE | Attribute for user ID in generic auth
| GENERIC_USER_LAST_NAME_ATTRIBUTE | Attribute for user's last name in generic auth
| GENERIC_USER_PROVIDER_ATTRIBUTE | Attribute specifying the user's provider
| GENERIC_USER_ROLE_ATTRIBUTE | Attribute specifying the user's role
| GENERIC_USERINFO_ENDPOINT | Endpoint to fetch user information in generic OAuth
| GALILEO_BASE_URL | Base URL for Galileo platform
| GALILEO_PASSWORD | Password for Galileo authentication
| GALILEO_PROJECT_ID | Project ID for Galileo usage
| GALILEO_USERNAME | Username for Galileo authentication
| GREENSCALE_API_KEY | API key for Greenscale service
| GREENSCALE_ENDPOINT | Endpoint URL for Greenscale service
| GOOGLE_APPLICATION_CREDENTIALS | Path to Google Cloud credentials JSON file
| GOOGLE_CLIENT_ID | Client ID for Google OAuth
| GOOGLE_CLIENT_SECRET | Client secret for Google OAuth
| GOOGLE_KMS_RESOURCE_NAME | Name of the resource in Google KMS
| HF_API_BASE | Base URL for Hugging Face API
| HELICONE_API_KEY | API key for Helicone service
| HUGGINGFACE_API_BASE | Base URL for Hugging Face API
| IAM_TOKEN_DB_AUTH | IAM token for database authentication
| JSON_LOGS | Enable JSON formatted logging
| JWT_AUDIENCE | Expected audience for JWT tokens
| JWT_PUBLIC_KEY_URL | URL to fetch public key for JWT verification
| LAGO_API_BASE | Base URL for Lago API
| LAGO_API_CHARGE_BY | Parameter to determine charge basis in Lago
| LAGO_API_EVENT_CODE | Event code for Lago API events
| LAGO_API_KEY | API key for accessing Lago services
| LANGFUSE_DEBUG | Toggle debug mode for Langfuse
| LANGFUSE_FLUSH_INTERVAL | Interval for flushing Langfuse logs
| LANGFUSE_HOST | Host URL for Langfuse service
| LANGFUSE_PUBLIC_KEY | Public key for Langfuse authentication
| LANGFUSE_RELEASE | Release version of Langfuse integration
| LANGFUSE_SECRET_KEY | Secret key for Langfuse authentication
| LANGSMITH_API_KEY | API key for Langsmith platform
| LANGSMITH_BASE_URL | Base URL for Langsmith service
| LANGSMITH_BATCH_SIZE | Batch size for operations in Langsmith
| LANGSMITH_DEFAULT_RUN_NAME | Default name for Langsmith run
| LANGSMITH_PROJECT | Project name for Langsmith integration
| LANGSMITH_SAMPLING_RATE | Sampling rate for Langsmith logging
| LANGTRACE_API_KEY | API key for Langtrace service
| LITERAL_API_KEY | API key for Literal integration
| LITERAL_API_URL | API URL for Literal service
| LITERAL_BATCH_SIZE | Batch size for Literal operations
| LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI
| LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests
| LITELLM_EMAIL | Email associated with LiteLLM account
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM
| LITELLM_HOSTED_UI | URL of the hosted UI for LiteLLM
| LITELLM_LICENSE | License key for LiteLLM usage
| LITELLM_LOCAL_MODEL_COST_MAP | Local configuration for model cost mapping in LiteLLM
| LITELLM_LOG | Enable detailed logging for LiteLLM
| LITELLM_MODE | Operating mode for LiteLLM (e.g., production, development)
| LITELLM_SALT_KEY | Salt key for encryption in LiteLLM
| LITELLM_SECRET_AWS_KMS_LITELLM_LICENSE | AWS KMS encrypted license for LiteLLM
| LITELLM_TOKEN | Access token for LiteLLM integration
| LOGFIRE_TOKEN | Token for Logfire logging service
| MICROSOFT_CLIENT_ID | Client ID for Microsoft services
| MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services
| MICROSOFT_TENANT | Tenant ID for Microsoft Azure
| NO_DOCS | Flag to disable documentation generation
| NO_PROXY | List of addresses to bypass proxy
| OAUTH_TOKEN_INFO_ENDPOINT | Endpoint for OAuth token info retrieval
| OPENAI_API_BASE | Base URL for OpenAI API
| OPENAI_API_KEY | API key for OpenAI services
| OPENAI_ORGANIZATION | Organization identifier for OpenAI
| OPENID_BASE_URL | Base URL for OpenID Connect services
| OPENID_CLIENT_ID | Client ID for OpenID Connect authentication
| OPENID_CLIENT_SECRET | Client secret for OpenID Connect authentication
| OPENMETER_API_ENDPOINT | API endpoint for OpenMeter integration
| OPENMETER_API_KEY | API key for OpenMeter services
| OPENMETER_EVENT_TYPE | Type of events sent to OpenMeter
| OTEL_ENDPOINT | OpenTelemetry endpoint for traces
| OTEL_ENVIRONMENT_NAME | Environment name for OpenTelemetry
| OTEL_EXPORTER | Exporter type for OpenTelemetry
| OTEL_HEADERS | Headers for OpenTelemetry requests
| OTEL_SERVICE_NAME | Service name identifier for OpenTelemetry
| OTEL_TRACER_NAME | Tracer name for OpenTelemetry tracing
| PREDIBASE_API_BASE | Base URL for Predibase API
| PRESIDIO_ANALYZER_API_BASE | Base URL for Presidio Analyzer service
| PRESIDIO_ANONYMIZER_API_BASE | Base URL for Presidio Anonymizer service
| PROMETHEUS_URL | URL for Prometheus service
| PROMPTLAYER_API_KEY | API key for PromptLayer integration
| PROXY_ADMIN_ID | Admin identifier for proxy server
| PROXY_BASE_URL | Base URL for proxy service
| PROXY_LOGOUT_URL | URL for logging out of the proxy service
| PROXY_MASTER_KEY | Master key for proxy authentication
| QDRANT_API_BASE | Base URL for Qdrant API
| QDRANT_API_KEY | API key for Qdrant service
| QDRANT_URL | Connection URL for Qdrant database
| REDIS_HOST | Hostname for Redis server
| REDIS_PASSWORD | Password for Redis service
| REDIS_PORT | Port number for Redis server
| SERVER_ROOT_PATH | Root path for the server application
| SET_VERBOSE | Flag to enable verbose logging
| SLACK_DAILY_REPORT_FREQUENCY | Frequency of daily Slack reports (e.g., daily, weekly)
| SLACK_WEBHOOK_URL | Webhook URL for Slack integration
| SMTP_HOST | Hostname for the SMTP server
| SMTP_PASSWORD | Password for SMTP authentication
| SMTP_PORT | Port number for SMTP server
| SMTP_SENDER_EMAIL | Email address used as the sender in SMTP transactions
| SMTP_SENDER_LOGO | Logo used in emails sent via SMTP
| SMTP_TLS | Flag to enable or disable TLS for SMTP connections
| SMTP_USERNAME | Username for SMTP authentication
| SPEND_LOGS_URL | URL for retrieving spend logs
| SSL_CERTIFICATE | Path to the SSL certificate file
| SSL_VERIFY | Flag to enable or disable SSL certificate verification
| SUPABASE_KEY | API key for Supabase service
| SUPABASE_URL | Base URL for Supabase instance
| TEST_EMAIL_ADDRESS | Email address used for testing purposes
| UI_LOGO_PATH | Path to the logo image used in the UI
| UI_PASSWORD | Password for accessing the UI
| UI_USERNAME | Username for accessing the UI
| UPSTREAM_LANGFUSE_DEBUG | Flag to enable debugging for upstream Langfuse
| UPSTREAM_LANGFUSE_HOST | Host URL for upstream Langfuse service
| UPSTREAM_LANGFUSE_PUBLIC_KEY | Public key for upstream Langfuse authentication
| UPSTREAM_LANGFUSE_RELEASE | Release version identifier for upstream Langfuse
| UPSTREAM_LANGFUSE_SECRET_KEY | Secret key for upstream Langfuse authentication
| USE_AWS_KMS | Flag to enable AWS Key Management Service for encryption
| WEBHOOK_URL | URL for receiving webhooks from external services
## Extras

View file

@ -1,71 +0,0 @@
# What is stored in the DB
The LiteLLM Proxy uses a PostgreSQL database to store various information. Here's are the main features the DB is used for:
- Virtual Keys, Organizations, Teams, Users, Budgets, and more.
- Per request Usage Tracking
## Link to DB Schema
You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/main/schema.prisma)
## DB Tables
### Organizations, Teams, Users, End Users
| Table Name | Description | Row Insert Frequency |
|------------|-------------|---------------------|
| LiteLLM_OrganizationTable | Manages organization-level configurations. Tracks organization spend, model access, and metadata. Links to budget configurations and teams. | Low |
| LiteLLM_TeamTable | Handles team-level settings within organizations. Manages team members, admins, and their roles. Controls team-specific budgets, rate limits, and model access. | Low |
| LiteLLM_UserTable | Stores user information and their settings. Tracks individual user spend, model access, and rate limits. Manages user roles and team memberships. | Low |
| LiteLLM_EndUserTable | Manages end-user configurations. Controls model access and regional requirements. Tracks end-user spend. | Low |
| LiteLLM_TeamMembership | Tracks user participation in teams. Manages team-specific user budgets and spend. | Low |
| LiteLLM_OrganizationMembership | Manages user roles within organizations. Tracks organization-specific user permissions and spend. | Low |
| LiteLLM_InvitationLink | Handles user invitations. Manages invitation status and expiration. Tracks who created and accepted invitations. | Low |
| LiteLLM_UserNotifications | Handles model access requests. Tracks user requests for model access. Manages approval status. | Low |
### Authentication
| Table Name | Description | Row Insert Frequency |
|------------|-------------|---------------------|
| LiteLLM_VerificationToken | Manages Virtual Keys and their permissions. Controls token-specific budgets, rate limits, and model access. Tracks key-specific spend and metadata. | **Medium** - stores all Virtual Keys |
### Model (LLM) Management
| Table Name | Description | Row Insert Frequency |
|------------|-------------|---------------------|
| LiteLLM_ProxyModelTable | Stores model configurations. Defines available models and their parameters. Contains model-specific information and settings. | Low - Configuration only |
### Budget Management
| Table Name | Description | Row Insert Frequency |
|------------|-------------|---------------------|
| LiteLLM_BudgetTable | Stores budget and rate limit configurations for organizations, keys, and end users. Tracks max budgets, soft budgets, TPM/RPM limits, and model-specific budgets. Handles budget duration and reset timing. | Low - Configuration only |
### Tracking & Logging
| Table Name | Description | Row Insert Frequency |
|------------|-------------|---------------------|
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request** |
| LiteLLM_ErrorLogs | Captures failed requests and errors. Stores exception details and request information. Helps with debugging and monitoring. | **Medium - on errors only** |
| LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** |
## Disable `LiteLLM_SpendLogs` & `LiteLLM_ErrorLogs`
You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` on the `general_settings` section of your proxy_config.yaml file.
```yaml
general_settings:
disable_spend_logs: True # Disable writing spend logs to DB
disable_error_logs: True # Disable writing error logs to DB
```
### What is the impact of disabling these logs?
When disabling spend logs (`disable_spend_logs: True`):
- You **will not** be able to view Usage on the LiteLLM UI
- You **will** continue seeing cost metrics on s3, Prometheus, Langfuse (any other Logging integration you are using)
When disabling error logs (`disable_error_logs: True`):
- You **will not** be able to view Errors on the LiteLLM UI
- You **will** continue seeing error logs in your application logs and any other logging integrations you are using

View file

@ -2,7 +2,7 @@ import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import Image from '@theme/IdealImage';
# Docker, Deployment
# 🐳 Docker, Deployment
You can find the Dockerfile to build litellm proxy [here](https://github.com/BerriAI/litellm/blob/main/Dockerfile)
@ -688,35 +688,8 @@ Provide an ssl certificate when starting litellm proxy server
Use this if you want to run the proxy with hypercorn to support http/2
Step 1. Build your custom docker image with hypercorn
```shell
# Use the provided base image
FROM ghcr.io/berriai/litellm:main-latest
# Set the working directory to /app
WORKDIR /app
# Copy the configuration file into the container at /app
COPY config.yaml .
# Make sure your docker/entrypoint.sh is executable
RUN chmod +x ./docker/entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp
# 👉 Key Change: Install hypercorn
RUN pip install hypercorn
# Override the CMD instruction with your desired command and arguments
# WARNING: FOR PROD DO NOT USE `--detailed_debug` it slows down response times, instead use the following CMD
# CMD ["--port", "4000", "--config", "config.yaml"]
CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"]
```
Step 2. Pass the `--run_hypercorn` flag when starting the proxy
**Usage**
Pass the `--run_hypercorn` flag when starting the proxy
```shell
docker run \
@ -726,7 +699,7 @@ docker run \
-e SERVER_ROOT_PATH="/api/v1"\
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-e LITELLM_MASTER_KEY="sk-1234"\
your_custom_docker_image \
ghcr.io/berriai/litellm:main-latest \
--config /app/config.yaml
--run_hypercorn
```

View file

@ -1,7 +1,3 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Getting Started - E2E Tutorial
End-to-End tutorial for LiteLLM Proxy to:
@ -13,11 +9,7 @@ End-to-End tutorial for LiteLLM Proxy to:
## Pre-Requisites
- Install LiteLLM Docker Image ** OR ** LiteLLM CLI (pip package)
<Tabs>
<TabItem value="docker" label="Docker">
- Install LiteLLM Docker Image
```
docker pull ghcr.io/berriai/litellm:main-latest
@ -25,18 +17,6 @@ docker pull ghcr.io/berriai/litellm:main-latest
[**See all docker images**](https://github.com/orgs/BerriAI/packages)
</TabItem>
<TabItem value="pip" label="LiteLLM CLI (pip package)">
```shell
$ pip install 'litellm[proxy]'
```
</TabItem>
</Tabs>
## 1. Add a model
Control LiteLLM Proxy with a config.yaml file.
@ -78,11 +58,6 @@ LiteLLM Proxy is 100% OpenAI-compatible. Test your azure model via the `/chat/co
Save your config.yaml from step 1. as `litellm_config.yaml`.
<Tabs>
<TabItem value="docker" label="Docker">
```bash
docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
@ -95,20 +70,6 @@ docker run \
# RUNNING on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="pip" label="LiteLLM CLI (pip package)">
```shell
$ litellm --config /app/config.yaml --detailed_debug
```
</TabItem>
</Tabs>
Confirm your config.yaml got mounted correctly
```bash

View file

@ -1,5 +1,5 @@
# IP Address Filtering
# IP Address Filtering
:::info

View file

@ -1,4 +1,4 @@
# Proxy - Load Balancing
# Multiple Instances
Load balance multiple instances of the same model
The proxy will handle routing requests (using LiteLLM's Router). **Set `rpm` in the config if you want maximize throughput**

View file

@ -4,7 +4,6 @@ Log Proxy input, output, and exceptions using:
- Langfuse
- OpenTelemetry
- GCS and s3 Buckets
- Custom Callbacks
- Langsmith
- DataDog
@ -48,19 +47,7 @@ A number of these headers could be useful for troubleshooting, but the
`x-litellm-call-id` is the one that is most useful for tracking a request across
components in your system, including in logging tools.
## Logging Features
### Conditional Logging by Virtual Keys, Teams
Use this to:
1. Conditionally enable logging for some virtual keys/teams
2. Set different logging providers for different virtual keys/teams
[👉 **Get Started** - Team/Key Based Logging](team_logging)
### Redacting UserAPIKeyInfo
## Redacting UserAPIKeyInfo
Redact information about the user api key (hashed token, user_id, team id, etc.), from logs.
@ -72,58 +59,17 @@ litellm_settings:
redact_user_api_key_info: true
```
### Redact Messages, Response Content
Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to your logging provider, but request metadata will still be logged.
Example config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["langfuse"]
turn_off_message_logging: True # 👈 Key Change
```
If you have this feature turned on, you can override it for specific requests by
setting a request header `LiteLLM-Disable-Message-Redaction: true`.
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'LiteLLM-Disable-Message-Redaction: true' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
Removes any field with `user_api_key_*` from metadata.
## What gets logged?
## What gets logged? StandardLoggingPayload
Found under `kwargs["standard_logging_object"]`. This is a standard payload, logged for every response.
```python
class StandardLoggingPayload(TypedDict):
id: str
trace_id: str # Trace multiple LLM calls belonging to same overall request (e.g. fallbacks/retries)
call_type: str
response_cost: float
response_cost_failure_debug_info: Optional[
StandardLoggingModelCostFailureDebugInformation
]
status: StandardLoggingPayloadStatus
total_tokens: int
prompt_tokens: int
completion_tokens: int
@ -138,13 +84,13 @@ class StandardLoggingPayload(TypedDict):
metadata: StandardLoggingMetadata
cache_hit: Optional[bool]
cache_key: Optional[str]
saved_cache_cost: float
saved_cache_cost: Optional[float]
request_tags: list
end_user: Optional[str]
requester_ip_address: Optional[str]
requester_ip_address: Optional[str] # IP address of requester
requester_metadata: Optional[dict] # metadata passed in request in the "metadata" field
messages: Optional[Union[str, list, dict]]
response: Optional[Union[str, list, dict]]
error_str: Optional[str]
model_parameters: dict
hidden_params: StandardLoggingHiddenParams
@ -153,51 +99,15 @@ class StandardLoggingHiddenParams(TypedDict):
cache_key: Optional[str]
api_base: Optional[str]
response_cost: Optional[str]
additional_headers: Optional[StandardLoggingAdditionalHeaders]
additional_headers: Optional[dict]
class StandardLoggingAdditionalHeaders(TypedDict, total=False):
x_ratelimit_limit_requests: int
x_ratelimit_limit_tokens: int
x_ratelimit_remaining_requests: int
x_ratelimit_remaining_tokens: int
class StandardLoggingMetadata(StandardLoggingUserAPIKeyMetadata):
"""
Specific metadata k,v pairs logged to integration for easier cost tracking
"""
spend_logs_metadata: Optional[
dict
] # special param to log k,v pairs to spendlogs for a call
requester_ip_address: Optional[str]
requester_metadata: Optional[dict]
class StandardLoggingModelInformation(TypedDict):
model_map_key: str
model_map_value: Optional[ModelInfo]
StandardLoggingPayloadStatus = Literal["success", "failure"]
class StandardLoggingModelCostFailureDebugInformation(TypedDict, total=False):
"""
Debug information, if cost tracking fails.
Avoid logging sensitive information like response or optional params
"""
error_str: Required[str]
traceback_str: Required[str]
model: str
cache_hit: Optional[bool]
custom_llm_provider: Optional[str]
base_model: Optional[str]
call_type: str
custom_pricing: Optional[bool]
```
## Langfuse
## Logging Proxy Input/Output - Langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successfull LLM calls to langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment
@ -349,8 +259,73 @@ print(response)
</TabItem>
</Tabs>
### Team based Logging to Langfuse
### LiteLLM Tags - `cache_hit`, `cache_key`
[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging)
<!--
**Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id
```yaml
litellm_settings:
default_team_settings:
- team_id: my-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_1 # Project 1
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_1 # Project 1
- team_id: ishaans-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2 # Project 2
langfuse_secret: os.environ/LANGFUSE_SECRET_2 # Project 2
```
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging. -->
### Redacting Messages, Response Content from Langfuse Logging
Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged.
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["langfuse"]
turn_off_message_logging: True
```
If you have this feature turned on, you can override it for specific requests by
setting a request header `LiteLLM-Disable-Message-Redaction: true`.
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'LiteLLM-Disable-Message-Redaction: true' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
### LiteLLM-specific Tags on Langfuse - `cache_hit`, `cache_key`
Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields
@ -385,7 +360,7 @@ litellm_settings:
langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"]
```
### View POST sent from LiteLLM to provider
### 🔧 Debugging - Viewing RAW CURL sent from LiteLLM to provider
Use this when you want to view the RAW curl request sent from LiteLLM to the LLM API
@ -488,7 +463,7 @@ You will see `raw_request` in your Langfuse Metadata. This is the RAW CURL comma
<Image img={require('../../img/debug_langfuse.png')} />
## OpenTelemetry
## Logging Proxy Input/Output in OpenTelemetry format
:::info
@ -770,7 +745,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
** 🎉 Expect to see this trace logged in your OTEL collector**
### Redacting Messages, Response Content
### Redacting Messages, Response Content from OTEL Logging
Set `message_logging=False` for `otel`, no messages / response will be logged
@ -784,8 +759,7 @@ callback_settings:
message_logging: False
```
### Traceparent Header
##### Context propagation across Services `Traceparent HTTP Header`
### Context propagation across Services `Traceparent HTTP Header`
❓ Use this when you want to **pass information about the incoming request in a distributed tracing system**
@ -835,7 +809,7 @@ Search for Trace=`80e1afed08e019fc1110464cfa66635c` on your OTEL Collector
<Image img={require('../../img/otel_parent.png')} />
##### Forwarding `Traceparent HTTP Header` to LLM APIs
### Forwarding `Traceparent HTTP Header` to LLM APIs
Use this if you want to forward the traceparent headers to your self hosted LLMs like vLLM
@ -852,151 +826,6 @@ litellm_settings:
forward_traceparent_to_llm_provider: True
```
## Google Cloud Storage Buckets
Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en)
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
| Property | Details |
|----------|---------|
| Description | Log LLM Input/Output to cloud storage buckets |
| Load Test Benchmarks | [Benchmarks](https://docs.litellm.ai/docs/benchmarks) |
| Google Docs on Cloud Storage | [Google Cloud Storage](https://cloud.google.com/storage?hl=en) |
#### Usage
1. Add `gcs_bucket` to LiteLLM Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
callbacks: ["gcs_bucket"] # 👈 KEY CHANGE # 👈 KEY CHANGE
```
2. Set required env variables
```shell
GCS_BUCKET_NAME="<your-gcs-bucket-name>"
GCS_PATH_SERVICE_ACCOUNT="/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
3. Start Proxy
```
litellm --config /path/to/config.yaml
```
4. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}
'
```
#### Expected Logs on GCS Buckets
<Image img={require('../../img/gcs_bucket.png')} />
#### Fields Logged on GCS Buckets
[**The standard logging object is logged on GCS Bucket**](../proxy/logging)
#### Getting `service_account.json` from Google Cloud Console
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Search for IAM & Admin
3. Click on Service Accounts
4. Select a Service Account
5. Click on 'Keys' -> Add Key -> Create New Key -> JSON
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
## s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successfull LLM calls to s3 Bucket
**Step 1** Set AWS Credentials in .env
```shell
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
Your logs should be available on the specified s3 Bucket
## Custom Callback Class [Async]
Use this when you want to run custom callbacks in `python`
@ -1225,7 +1054,7 @@ class MyCustomHandler(CustomLogger):
{'mode': 'embedding', 'input_cost_per_token': 0.002}
```
##### Logging responses from proxy
### Logging responses from proxy
Both `/chat/completions` and `/embeddings` responses are available as `response_obj`
@ -1387,7 +1216,7 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
## Langsmith
## Logging LLM IO to Langsmith
1. Set `success_callback: ["langsmith"]` on litellm config.yaml
@ -1432,7 +1261,7 @@ Expect to see your log on Langfuse
<Image img={require('../../img/langsmith_new.png')} />
## Arize AI
## Logging LLM IO to Arize AI
1. Set `success_callback: ["arize"]` on litellm config.yaml
@ -1480,7 +1309,7 @@ Expect to see your log on Langfuse
<Image img={require('../../img/langsmith_new.png')} />
## Langtrace
## Logging LLM IO to Langtrace
1. Set `success_callback: ["langtrace"]` on litellm config.yaml
@ -1522,7 +1351,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
'
```
## Galileo
## Logging LLM IO to Galileo
[BETA]
@ -1543,7 +1372,7 @@ export GALILEO_USERNAME=""
export GALILEO_PASSWORD=""
```
#### Quick Start
### Quick Start
1. Add to Config.yaml
@ -1584,7 +1413,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
🎉 That's it - Expect to see your Logs on your Galileo Dashboard
## OpenMeter
## Logging Proxy Cost + Usage - OpenMeter
Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md)
@ -1596,7 +1425,7 @@ export OPENMETER_API_ENDPOINT="" # defaults to https://openmeter.cloud
export OPENMETER_API_KEY=""
```
##### Quick Start
### Quick Start
1. Add to Config.yaml
@ -1637,7 +1466,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
<Image img={require('../../img/openmeter_img_2.png')} />
## DataDog
## Logging Proxy Input/Output - DataDog
LiteLLM Supports logging to the following Datdog Integrations:
- `datadog` [Datadog Logs](https://docs.datadoghq.com/logs/)
@ -1714,7 +1543,7 @@ Expected output on Datadog
<Image img={require('../../img/dd_small1.png')} />
## DynamoDB
## Logging Proxy Input/Output - DynamoDB
We will use the `--config` to set
@ -1840,7 +1669,7 @@ Your logs should be available on DynamoDB
}
```
## Sentry
## Logging Proxy Input/Output - Sentry
If api calls fail (llm/database) you can log those to Sentry:
@ -1882,7 +1711,7 @@ Test Request
litellm --test
```
## Athina
## Logging Proxy Input/Output Athina
[Athina](https://athina.ai/) allows you to log LLM Input/Output for monitoring, analytics, and observability.
@ -1929,10 +1758,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
<!-- ## (BETA) Moderation with Azure Content Safety
Note: This page is for logging callbacks and this is a moderation service. Commenting until we found a better location for this.
## (BETA) Moderation with Azure Content Safety
[Azure Content-Safety](https://azure.microsoft.com/en-us/products/ai-services/ai-content-safety) is a Microsoft Azure service that provides content moderation APIs to detect potential offensive, harmful, or risky content in text.
@ -2017,4 +1843,4 @@ litellm_settings:
:::info
`thresholds` are not required by default, but you can tune the values to your needs.
Default values is `4` for all categories
::: -->
:::

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';
# Create Pass Through Endpoints
# ➡️ Create Pass Through Endpoints
Add pass through routes to LiteLLM Proxy

View file

@ -1,6 +1,5 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import Image from '@theme/IdealImage';
# ⚡ Best Practices for Production
@ -23,7 +22,6 @@ general_settings:
# OPTIONAL Best Practices
disable_spend_logs: True # turn off writing each transaction to the db. We recommend doing this is you don't need to see Usage on the LiteLLM UI and are tracking metrics via Prometheus
disable_error_logs: True # turn off writing LLM Exceptions to DB
allow_requests_on_db_unavailable: True # Only USE when running LiteLLM on your VPC. Allow requests to still be processed even if the DB is unavailable. We recommend doing this if you're running LiteLLM on VPC that cannot be accessed from the public internet.
litellm_settings:
@ -103,51 +101,18 @@ general_settings:
allow_requests_on_db_unavailable: True
```
## 6. Disable spend_logs & error_logs if not using the LiteLLM UI
## 6. Disable spend_logs if you're not using the LiteLLM UI
By default, LiteLLM writes several types of logs to the database:
- Every LLM API request to the `LiteLLM_SpendLogs` table
- LLM Exceptions to the `LiteLLM_LogsErrors` table
By default LiteLLM will write every request to the `LiteLLM_SpendLogs` table. This is used for viewing Usage on the LiteLLM UI.
If you're not viewing these logs on the LiteLLM UI (most users use Prometheus for monitoring), you can disable them by setting the following flags to `True`:
If you're not viewing Usage on the LiteLLM UI (most users use Prometheus when this is disabled), you can disable spend_logs by setting `disable_spend_logs` to `True`.
```yaml
general_settings:
disable_spend_logs: True # Disable writing spend logs to DB
disable_error_logs: True # Disable writing error logs to DB
disable_spend_logs: True
```
[More information about what the Database is used for here](db_info)
## 7. Use Helm PreSync Hook for Database Migrations [BETA]
To ensure only one service manages database migrations, use our [Helm PreSync hook for Database Migrations](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm-helm/templates/migrations-job.yaml). This ensures migrations are handled during `helm upgrade` or `helm install`, while LiteLLM pods explicitly disable migrations.
1. **Helm PreSync Hook**:
- The Helm PreSync hook is configured in the chart to run database migrations during deployments.
- The hook always sets `DISABLE_SCHEMA_UPDATE=false`, ensuring migrations are executed reliably.
Reference Settings to set on ArgoCD for `values.yaml`
```yaml
db:
useExisting: true # use existing Postgres DB
url: postgresql://ishaanjaffer0324:3rnwpOBau6hT@ep-withered-mud-a5dkdpke.us-east-2.aws.neon.tech/test-argo-cd?sslmode=require # url of existing Postgres DB
```
2. **LiteLLM Pods**:
- Set `DISABLE_SCHEMA_UPDATE=true` in LiteLLM pod configurations to prevent them from running migrations.
Example configuration for LiteLLM pod:
```yaml
env:
- name: DISABLE_SCHEMA_UPDATE
value: "true"
```
## 8. Set LiteLLM Salt Key
## 7. Set LiteLLM Salt Key
If you plan on using the DB, set a salt key for encrypting/decrypting variables in the DB.

View file

@ -192,13 +192,3 @@ Here is a screenshot of the metrics you can monitor with the LiteLLM Grafana Das
|----------------------|--------------------------------------|
| `litellm_llm_api_failed_requests_metric` | **deprecated** use `litellm_proxy_failed_requests_metric` |
| `litellm_requests_metric` | **deprecated** use `litellm_proxy_total_requests_metric` |
## FAQ
### What are `_created` vs. `_total` metrics?
- `_created` metrics are metrics that are created when the proxy starts
- `_total` metrics are metrics that are incremented for each request
You should consume the `_total` metrics for your counting purposes

View file

@ -1,191 +0,0 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Provider Budget Routing
Use this to set budgets for LLM Providers - example $100/day for OpenAI, $100/day for Azure.
## Quick Start
Set provider budgets in your `proxy_config.yaml` file
### Proxy Config setup
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: openai/gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
router_settings:
provider_budget_config:
openai:
budget_limit: 0.000000000001 # float of $ value budget for time period
time_period: 1d # can be 1d, 2d, 30d, 1mo, 2mo
azure:
budget_limit: 100
time_period: 1d
anthropic:
budget_limit: 100
time_period: 10d
vertex_ai:
budget_limit: 100
time_period: 12d
gemini:
budget_limit: 100
time_period: 12d
# OPTIONAL: Set Redis Host, Port, and Password if using multiple instance of LiteLLM
redis_host: os.environ/REDIS_HOST
redis_port: os.environ/REDIS_PORT
redis_password: os.environ/REDIS_PASSWORD
general_settings:
master_key: sk-1234
```
### Make a test request
We expect the first request to succeed, and the second request to fail since we cross the budget for `openai`
**[Langchain, OpenAI SDK Usage Examples](../proxy/user_keys#request-format)**
<Tabs>
<TabItem label="Successful Call " value = "allowed">
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4o",
"messages": [
{"role": "user", "content": "hi my name is test request"}
]
}'
```
</TabItem>
<TabItem label="Unsuccessful call" value = "not-allowed">
Expect this to fail since since `ishaan@berri.ai` in the request is PII
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4o",
"messages": [
{"role": "user", "content": "hi my name is test request"}
]
}'
```
Expected response on failure
```json
{
"error": {
"message": "No deployments available - crossed budget for provider: Exceeded budget for provider openai: 0.0007350000000000001 >= 1e-12",
"type": "None",
"param": "None",
"code": "429"
}
}
```
</TabItem>
</Tabs>
## How provider budget routing works
1. **Budget Tracking**:
- Uses Redis to track spend for each provider
- Tracks spend over specified time periods (e.g., "1d", "30d")
- Automatically resets spend after time period expires
2. **Routing Logic**:
- Routes requests to providers under their budget limits
- Skips providers that have exceeded their budget
- If all providers exceed budget, raises an error
3. **Supported Time Periods**:
- Seconds: "Xs" (e.g., "30s")
- Minutes: "Xm" (e.g., "10m")
- Hours: "Xh" (e.g., "24h")
- Days: "Xd" (e.g., "1d", "30d")
- Months: "Xmo" (e.g., "1mo", "2mo")
4. **Requirements**:
- Redis required for tracking spend across instances
- Provider names must be litellm provider names. See [Supported Providers](https://docs.litellm.ai/docs/providers)
## Monitoring Provider Remaining Budget
LiteLLM will emit the following metric on Prometheus to track the remaining budget for each provider
This metric indicates the remaining budget for a provider in dollars (USD)
```
litellm_provider_remaining_budget_metric{api_provider="openai"} 10
```
## Multi-instance setup
If you are using a multi-instance setup, you will need to set the Redis host, port, and password in the `proxy_config.yaml` file. Redis is used to sync the spend across LiteLLM instances.
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: openai/gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
router_settings:
provider_budget_config:
openai:
budget_limit: 0.000000000001 # float of $ value budget for time period
time_period: 1d # can be 1d, 2d, 30d, 1mo, 2mo
# 👇 Add this: Set Redis Host, Port, and Password if using multiple instance of LiteLLM
redis_host: os.environ/REDIS_HOST
redis_port: os.environ/REDIS_PORT
redis_password: os.environ/REDIS_PASSWORD
general_settings:
master_key: sk-1234
```
## Spec for provider_budget_config
The `provider_budget_config` is a dictionary where:
- **Key**: Provider name (string) - Must be a valid [LiteLLM provider name](https://docs.litellm.ai/docs/providers)
- **Value**: Budget configuration object with the following parameters:
- `budget_limit`: Float value representing the budget in USD
- `time_period`: Duration string in one of the following formats:
- Seconds: `"Xs"` (e.g., "30s")
- Minutes: `"Xm"` (e.g., "10m")
- Hours: `"Xh"` (e.g., "24h")
- Days: `"Xd"` (e.g., "1d", "30d")
- Months: `"Xmo"` (e.g., "1mo", "2mo")
Example structure:
```yaml
provider_budget_config:
openai:
budget_limit: 100.0 # $100 USD
time_period: "1d" # 1 day period
azure:
budget_limit: 500.0 # $500 USD
time_period: "30d" # 30 day period
anthropic:
budget_limit: 200.0 # $200 USD
time_period: "1mo" # 1 month period
gemini:
budget_limit: 50.0 # $50 USD
time_period: "24h" # 24 hour period
```

View file

@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Proxy - Fallbacks, Retries
# Fallbacks, Load Balancing, Retries
- Quick Start [load balancing](#test---load-balancing)
- Quick Start [client side fallbacks](#test---client-side-fallbacks)
@ -749,18 +749,3 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
"mock_testing_fallbacks": true
}'
```
### Disable Fallbacks per key
You can disable fallbacks per key by setting `disable_fallbacks: true` in your key metadata.
```bash
curl -L -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"metadata": {
"disable_fallbacks": true
}
}'
```

View file

@ -217,10 +217,4 @@ litellm_settings:
max_parallel_requests: 1000 # (Optional[int], optional): Max number of requests that can be made in parallel. Defaults to None.
tpm_limit: 1000 #(Optional[int], optional): Tpm limit. Defaults to None.
rpm_limit: 1000 #(Optional[int], optional): Rpm limit. Defaults to None.
key_generation_settings: # Restricts who can generate keys. [Further docs](./virtual_keys.md#restricting-key-generation)
team_key_generation:
allowed_team_member_roles: ["admin"]
personal_key_generation: # maps to 'Default Team' on UI
allowed_user_roles: ["proxy_admin"]
```

View file

@ -1,4 +1,4 @@
# Team-based Routing
# 👥 Team-based Routing
## Routing
Route calls to different model groups based on the team-id

View file

@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Team/Key Based Logging
# 👥📊 Team/Key Based Logging
Allow each key/team to use their own Langfuse Project / custom callbacks
@ -11,13 +11,15 @@ Allow each key/team to use their own Langfuse Project / custom callbacks
Team 1 -> Logs to Langfuse Project 1
Team 2 -> Logs to Langfuse Project 2
Team 3 -> Disabled Logging (for GDPR compliance)
```
## Team Based Logging
[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging.md)
### Setting Team Logging via `config.yaml`
## Logging / Caching
Turn on/off logging and caching for a specific team id.
@ -279,51 +281,6 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \
}'
```
</TabItem>
<TabItem label="Langsmith" value="langsmith">
1. Create Virtual Key to log to a specific Langsmith Project
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"metadata": {
"logging": [{
"callback_name": "langsmith", # "otel", "gcs_bucket"
"callback_type": "success", # "success", "failure", "success_and_failure"
"callback_vars": {
"langsmith_api_key": "os.environ/LANGSMITH_API_KEY", # API Key for Langsmith logging
"langsmith_project": "pr-brief-resemblance-72", # project name on langsmith
"langsmith_base_url": "https://api.smith.langchain.com"
}
}]
}
}'
```
2. Test it - `/chat/completions` request
Use the virtual key from step 3 to make a `/chat/completions` request
You should see your logs on your Langsmith project on a successful request
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-Fxq5XSyWKeXDKfPdqXZhPg" \
-d '{
"model": "fake-openai-endpoint",
"messages": [
{"role": "user", "content": "Hello, Claude"}
],
"user": "hello",
}'
```
</TabItem>
</Tabs>

View file

@ -64,7 +64,7 @@ Allow others to create/delete their own keys.
Features here are behind a commercial license in our `/enterprise` folder. [**See Code**](https://github.com/BerriAI/litellm/tree/main/enterprise)
### SSO for UI
### Setup SSO/Auth for UI
#### Step 1: Set upperbounds for keys
Control the upperbound that users can use for `max_budget`, `budget_duration` or any `key/generate` param per key.
@ -88,6 +88,12 @@ litellm_settings:
#### Step 2: Setup Oauth Client
:::tip
Looking for how to use Oauth 2.0 for /chat, /completions API requests to the proxy? [Follow this doc](oauth2)
:::
<Tabs>
<TabItem value="okta" label="Okta SSO">
@ -190,13 +196,6 @@ GENERIC_SCOPE = "openid profile email" # default scope openid is sometimes not e
</Tabs>
### Default Login, Logout URLs
Some SSO providers require a specific redirect url for login and logout. You can input the following values.
- Login: `<your-proxy-base-url>/sso/key/generate`
- Logout: `<your-proxy-base-url>`
#### Step 3. Set `PROXY_BASE_URL` in your .env
Set this in your .env (so the proxy can set the correct redirect url)
@ -217,9 +216,9 @@ export ALLOWED_EMAIL_DOMAINS="berri.ai"
This will check if the user email we receive from SSO contains this domain, before allowing access.
### Set Proxy Admin
### Set Admin view w/ SSO
Set a Proxy Admin when SSO is enabled. Once SSO is enabled, the `user_id` for users is retrieved from the SSO provider. In order to set a Proxy Admin, you need to copy the `user_id` from the UI and set it in your `.env` as `PROXY_ADMIN_ID`.
You just need to set Proxy Admin ID
#### Step 1: Copy your ID from the UI
@ -257,7 +256,7 @@ general_settings:
default_team_disabled: true # OR you can set env var PROXY_DEFAULT_TEAM_DISABLED="true"
```
### Use Username, Password when SSO is on
### Sign in with Username, Password when SSO is on
If you need to access the UI via username/password when SSO is on navigate to `/fallback/login`. This route will allow you to sign in with your username/password credentials.

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Virtual Keys
# 🔑 Virtual Keys
Track Spend, and control model access via virtual keys for the proxy
:::info
@ -811,78 +811,6 @@ litellm_settings:
team_id: "core-infra"
```
### Restricting Key Generation
Use this to control who can generate keys. Useful when letting others create keys on the UI.
```yaml
litellm_settings:
key_generation_settings:
team_key_generation:
allowed_team_member_roles: ["admin"]
required_params: ["tags"] # require team admins to set tags for cost-tracking when generating a team key
personal_key_generation: # maps to 'Default Team' on UI
allowed_user_roles: ["proxy_admin"]
```
#### Spec
```python
class TeamUIKeyGenerationConfig(TypedDict):
allowed_team_member_roles: List[str]
required_params: List[str] # require params on `/key/generate` to be set if a team key (team_id in request) is being generated
class PersonalUIKeyGenerationConfig(TypedDict):
allowed_user_roles: List[LitellmUserRoles]
required_params: List[str] # require params on `/key/generate` to be set if a personal key (no team_id in request) is being generated
class StandardKeyGenerationConfig(TypedDict, total=False):
team_key_generation: TeamUIKeyGenerationConfig
personal_key_generation: PersonalUIKeyGenerationConfig
class LitellmUserRoles(str, enum.Enum):
"""
Admin Roles:
PROXY_ADMIN: admin over the platform
PROXY_ADMIN_VIEW_ONLY: can login, view all own keys, view all spend
ORG_ADMIN: admin over a specific organization, can create teams, users only within their organization
Internal User Roles:
INTERNAL_USER: can login, view/create/delete their own keys, view their spend
INTERNAL_USER_VIEW_ONLY: can login, view their own keys, view their own spend
Team Roles:
TEAM: used for JWT auth
Customer Roles:
CUSTOMER: External users -> these are customers
"""
# Admin Roles
PROXY_ADMIN = "proxy_admin"
PROXY_ADMIN_VIEW_ONLY = "proxy_admin_viewer"
# Organization admins
ORG_ADMIN = "org_admin"
# Internal User Roles
INTERNAL_USER = "internal_user"
INTERNAL_USER_VIEW_ONLY = "internal_user_viewer"
# Team Roles
TEAM = "team"
# Customer Roles - External users of proxy
CUSTOMER = "customer"
```
## **Next Steps - Set Budgets, Rate Limits per Virtual Key**
[Follow this doc to set budgets, rate limiters per virtual key with LiteLLM](users)

View file

@ -114,4 +114,3 @@ curl http://0.0.0.0:4000/rerank \
| Cohere | [Usage](#quick-start) |
| Together AI| [Usage](../docs/providers/togetherai) |
| Azure AI| [Usage](../docs/providers/azure_ai) |
| Jina AI| [Usage](../docs/providers/jina_ai) |

View file

@ -1,24 +0,0 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Router Architecture (Fallbacks / Retries)
## High Level architecture
<Image img={require('../img/router_architecture.png')} style={{ width: '100%', maxWidth: '4000px' }} />
### Request Flow
1. **User Sends Request**: The process begins when a user sends a request to the LiteLLM Router endpoint. All unified endpoints (`.completion`, `.embeddings`, etc) are supported by LiteLLM Router.
2. **function_with_fallbacks**: The initial request is sent to the `function_with_fallbacks` function. This function wraps the initial request in a try-except block, to handle any exceptions - doing fallbacks if needed. This request is then sent to the `function_with_retries` function.
3. **function_with_retries**: The `function_with_retries` function wraps the request in a try-except block and passes the initial request to a base litellm unified function (`litellm.completion`, `litellm.embeddings`, etc) to handle LLM API calling. `function_with_retries` handles any exceptions - doing retries on the model group if needed (i.e. if the request fails, it will retry on an available model within the model group).
4. **litellm.completion**: The `litellm.completion` function is a base function that handles the LLM API calling. It is used by `function_with_retries` to make the actual request to the LLM API.
## Legend
**model_group**: A group of LLM API deployments that share the same `model_name`, are part of the same `model_group`, and can be load balanced across.

View file

@ -281,7 +281,7 @@ Picks the deployment with the lowest response time.
It caches, and updates the response times for deployments based on when a request was sent and received from a deployment.
[**How to test**](https://github.com/BerriAI/litellm/blob/main/tests/local_testing/test_lowest_latency_routing.py)
[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py)
```python
from litellm import Router
@ -567,7 +567,7 @@ print(response)
Picks a deployment with the least number of ongoing calls, it's handling.
[**How to test**](https://github.com/BerriAI/litellm/blob/main/tests/local_testing/test_least_busy_routing.py)
[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_least_busy_routing.py)
```python
from litellm import Router
@ -1035,7 +1035,7 @@ print(f"response: {response}")
### [Advanced]: Custom Retries, Cooldowns based on Error Type
- Use `RetryPolicy` if you want to set a `num_retries` based on the Exception received
- Use `RetryPolicy` if you want to set a `num_retries` based on the Exception receieved
- Use `AllowedFailsPolicy` to set a custom number of `allowed_fails`/minute before cooling down a deployment
[**See All Exception Types**](https://github.com/BerriAI/litellm/blob/ccda616f2f881375d4e8586c76fe4662909a7d22/litellm/types/router.py#L436)
@ -1891,22 +1891,3 @@ router = Router(
debug_level="DEBUG" # defaults to INFO
)
```
## Router General Settings
### Usage
```python
router = Router(model_list=..., router_general_settings=RouterGeneralSettings(async_only_mode=True))
```
### Spec
```python
class RouterGeneralSettings(BaseModel):
async_only_mode: bool = Field(
default=False
) # this will only initialize async clients. Good for memory utils
pass_through_all_models: bool = Field(
default=False
) # if passed a model not llm_router model list, pass through the request to litellm.acompletion/embedding
```

View file

@ -1,6 +1,3 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Secret Manager
LiteLLM supports reading secrets from Azure Key Vault, Google Secret Manager
@ -62,36 +59,14 @@ os.environ["AWS_REGION_NAME"] = "" # us-east-1, us-east-2, us-west-1, us-west-2
```
2. Enable AWS Secret Manager in config.
<Tabs>
<TabItem value="read_only" label="Read Keys from AWS Secret Manager">
```yaml
general_settings:
master_key: os.environ/litellm_master_key
key_management_system: "aws_secret_manager" # 👈 KEY CHANGE
key_management_settings:
hosted_keys: ["litellm_master_key"] # 👈 Specify which env keys you stored on AWS
```
</TabItem>
<TabItem value="write_only" label="Write Virtual Keys to AWS Secret Manager">
This will only store virtual keys in AWS Secret Manager. No keys will be read from AWS Secret Manager.
```yaml
general_settings:
key_management_system: "aws_secret_manager" # 👈 KEY CHANGE
key_management_settings:
store_virtual_keys: true # OPTIONAL. Defaults to False, when True will store virtual keys in secret manager
prefix_for_stored_virtual_keys: "litellm/" # OPTIONAL. If set, this prefix will be used for stored virtual keys in the secret manager
access_mode: "write_only" # Literal["read_only", "write_only", "read_and_write"]
```
</TabItem>
</Tabs>
3. Run proxy
```bash
@ -206,14 +181,16 @@ litellm --config /path/to/config.yaml
Use encrypted keys from Google KMS on the proxy
Step 1. Add keys to env
### Usage with LiteLLM Proxy Server
## Step 1. Add keys to env
```
export GOOGLE_APPLICATION_CREDENTIALS="/path/to/credentials.json"
export GOOGLE_KMS_RESOURCE_NAME="projects/*/locations/*/keyRings/*/cryptoKeys/*"
export PROXY_DATABASE_URL_ENCRYPTED=b'\n$\x00D\xac\xb4/\x8e\xc...'
```
Step 2: Update Config
## Step 2: Update Config
```yaml
general_settings:
@ -222,7 +199,7 @@ general_settings:
master_key: sk-1234
```
Step 3: Start + test proxy
## Step 3: Start + test proxy
```
$ litellm --config /path/to/config.yaml
@ -238,24 +215,3 @@ $ litellm --test
<!--
## .env Files
If no secret manager client is specified, Litellm automatically uses the `.env` file to manage sensitive data. -->
## All Secret Manager Settings
All settings related to secret management
```yaml
general_settings:
key_management_system: "aws_secret_manager" # REQUIRED
key_management_settings:
# Storing Virtual Keys Settings
store_virtual_keys: true # OPTIONAL. Defaults to False, when True will store virtual keys in secret manager
prefix_for_stored_virtual_keys: "litellm/" # OPTIONAL.I f set, this prefix will be used for stored virtual keys in the secret manager
# Access Mode Settings
access_mode: "write_only" # OPTIONAL. Literal["read_only", "write_only", "read_and_write"]. Defaults to "read_only"
# Hosted Keys Settings
hosted_keys: ["litellm_master_key"] # OPTIONAL. Specify which env keys you stored on AWS
```

View file

@ -1,174 +0,0 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Text Completion
### Usage
<Tabs>
<TabItem value="python" label="LiteLLM Python SDK">
```python
from litellm import text_completion
response = text_completion(
model="gpt-3.5-turbo-instruct",
prompt="Say this is a test",
max_tokens=7
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy Server">
1. Define models on config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo-instruct
litellm_params:
model: text-completion-openai/gpt-3.5-turbo-instruct # The `text-completion-openai/` prefix will call openai.completions.create
api_key: os.environ/OPENAI_API_KEY
- model_name: text-davinci-003
litellm_params:
model: text-completion-openai/text-davinci-003
api_key: os.environ/OPENAI_API_KEY
```
2. Start litellm proxy server
```
litellm --config config.yaml
```
<Tabs>
<TabItem value="python" label="OpenAI Python SDK">
```python
from openai import OpenAI
# set base_url to your proxy server
# set api_key to send to proxy server
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
response = client.completions.create(
model="gpt-3.5-turbo-instruct",
prompt="Say this is a test",
max_tokens=7
)
print(response)
```
</TabItem>
<TabItem value="curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
"model": "gpt-3.5-turbo-instruct",
"prompt": "Say this is a test",
"max_tokens": 7
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Input Params
LiteLLM accepts and translates the [OpenAI Text Completion params](https://platform.openai.com/docs/api-reference/completions) across all supported providers.
### Required Fields
- `model`: *string* - ID of the model to use
- `prompt`: *string or array* - The prompt(s) to generate completions for
### Optional Fields
- `best_of`: *integer* - Generates best_of completions server-side and returns the "best" one
- `echo`: *boolean* - Echo back the prompt in addition to the completion.
- `frequency_penalty`: *number* - Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency.
- `logit_bias`: *map* - Modify the likelihood of specified tokens appearing in the completion
- `logprobs`: *integer* - Include the log probabilities on the logprobs most likely tokens. Max value of 5
- `max_tokens`: *integer* - The maximum number of tokens to generate.
- `n`: *integer* - How many completions to generate for each prompt.
- `presence_penalty`: *number* - Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
- `seed`: *integer* - If specified, system will attempt to make deterministic samples
- `stop`: *string or array* - Up to 4 sequences where the API will stop generating tokens
- `stream`: *boolean* - Whether to stream back partial progress. Defaults to false
- `suffix`: *string* - The suffix that comes after a completion of inserted text
- `temperature`: *number* - What sampling temperature to use, between 0 and 2.
- `top_p`: *number* - An alternative to sampling with temperature, called nucleus sampling.
- `user`: *string* - A unique identifier representing your end-user
## Output Format
Here's the exact JSON output format you can expect from completion calls:
[**Follows OpenAI's output format**](https://platform.openai.com/docs/api-reference/completions/object)
<Tabs>
<TabItem value="non-streaming" label="Non-Streaming Response">
```python
{
"id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7",
"object": "text_completion",
"created": 1589478378,
"model": "gpt-3.5-turbo-instruct",
"system_fingerprint": "fp_44709d6fcb",
"choices": [
{
"text": "\n\nThis is indeed a test",
"index": 0,
"logprobs": null,
"finish_reason": "length"
}
],
"usage": {
"prompt_tokens": 5,
"completion_tokens": 7,
"total_tokens": 12
}
}
```
</TabItem>
<TabItem value="streaming" label="Streaming Response">
```python
{
"id": "cmpl-7iA7iJjj8V2zOkCGvWF2hAkDWBQZe",
"object": "text_completion",
"created": 1690759702,
"choices": [
{
"text": "This",
"index": 0,
"logprobs": null,
"finish_reason": null
}
],
"model": "gpt-3.5-turbo-instruct"
"system_fingerprint": "fp_44709d6fcb",
}
```
</TabItem>
</Tabs>
## **Supported Providers**
| Provider | Link to Usage |
|-------------|--------------------|
| OpenAI | [Usage](../docs/providers/text_completion_openai) |
| Azure OpenAI| [Usage](../docs/providers/azure) |

View file

@ -1,140 +0,0 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Provider specific Wildcard routing
**Proxy all models from a provider**
Use this if you want to **proxy all models from a specific provider without defining them on the config.yaml**
## Step 1. Define provider specific routing
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import Router
router = Router(
model_list=[
{
"model_name": "anthropic/*",
"litellm_params": {
"model": "anthropic/*",
"api_key": os.environ["ANTHROPIC_API_KEY"]
}
},
{
"model_name": "groq/*",
"litellm_params": {
"model": "groq/*",
"api_key": os.environ["GROQ_API_KEY"]
}
},
{
"model_name": "fo::*:static::*", # all requests matching this pattern will be routed to this deployment, example: model="fo::hi::static::hi" will be routed to deployment: "openai/fo::*:static::*"
"litellm_params": {
"model": "openai/fo::*:static::*",
"api_key": os.environ["OPENAI_API_KEY"]
}
}
]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Step 1** - define provider specific routing on config.yaml
```yaml
model_list:
# provider specific wildcard routing
- model_name: "anthropic/*"
litellm_params:
model: "anthropic/*"
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: "groq/*"
litellm_params:
model: "groq/*"
api_key: os.environ/GROQ_API_KEY
- model_name: "fo::*:static::*" # all requests matching this pattern will be routed to this deployment, example: model="fo::hi::static::hi" will be routed to deployment: "openai/fo::*:static::*"
litellm_params:
model: "openai/fo::*:static::*"
api_key: os.environ/OPENAI_API_KEY
```
</TabItem>
</Tabs>
## [PROXY-Only] Step 2 - Run litellm proxy
```shell
$ litellm --config /path/to/config.yaml
```
## Step 3 - Test it
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import Router
router = Router(model_list=...)
# Test with `anthropic/` - all models with `anthropic/` prefix will get routed to `anthropic/*`
resp = completion(model="anthropic/claude-3-sonnet-20240229", messages=[{"role": "user", "content": "Hello, Claude!"}])
print(resp)
# Test with `groq/` - all models with `groq/` prefix will get routed to `groq/*`
resp = completion(model="groq/llama3-8b-8192", messages=[{"role": "user", "content": "Hello, Groq!"}])
print(resp)
# Test with `fo::*::static::*` - all requests matching this pattern will be routed to `openai/fo::*:static::*`
resp = completion(model="fo::hi::static::hi", messages=[{"role": "user", "content": "Hello, Claude!"}])
print(resp)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
Test with `anthropic/` - all models with `anthropic/` prefix will get routed to `anthropic/*`
```bash
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "anthropic/claude-3-sonnet-20240229",
"messages": [
{"role": "user", "content": "Hello, Claude!"}
]
}'
```
Test with `groq/` - all models with `groq/` prefix will get routed to `groq/*`
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "groq/llama3-8b-8192",
"messages": [
{"role": "user", "content": "Hello, Claude!"}
]
}'
```
Test with `fo::*::static::*` - all requests matching this pattern will be routed to `openai/fo::*:static::*`
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "fo::hi::static::hi",
"messages": [
{"role": "user", "content": "Hello, Claude!"}
]
}'
```
</TabItem>
</Tabs>

View file

@ -113,7 +113,7 @@ const config = {
{
sidebarId: 'tutorialSidebar',
position: 'left',
label: 'Hosted',
label: '🚀 Hosted',
to: "docs/hosted"
},
{

Binary file not shown.

Before

Width:  |  Height:  |  Size: 361 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 437 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 59 KiB

View file

@ -20,70 +20,59 @@ const sidebars = {
{ type: "doc", id: "index" }, // NEW
{
type: "category",
label: "LiteLLM Proxy Server",
label: "💥 LiteLLM Proxy Server",
link: {
type: "generated-index",
title: "LiteLLM Proxy Server (LLM Gateway)",
title: "💥 LiteLLM Proxy Server (LLM Gateway)",
description: `OpenAI Proxy Server (LLM Gateway) to call 100+ LLMs in a unified interface & track spend, set budgets per virtual key/user`,
slug: "/simple_proxy",
},
items: [
"proxy/quick_start",
"proxy/docker_quick_start",
{
"type": "category",
"label": "Config.yaml",
"items": ["proxy/configs", "proxy/config_management", "proxy/config_settings"]
},
{
type: "category",
label: "Setup & Deployment",
items: [
"proxy/deploy",
"proxy/prod",
"proxy/cli",
"proxy/model_management",
"proxy/health",
"proxy/debugging",
"proxy/pass_through",
],
},
"proxy/demo",
"proxy/prod",
{
type: "category",
label: "Architecture",
items: ["proxy/architecture", "proxy/db_info", "router_architecture"],
items: ["proxy/architecture"],
},
{
type: "link",
label: "All Endpoints (Swagger)",
label: "📖 All Endpoints (Swagger)",
href: "https://litellm-api.up.railway.app/",
},
"proxy/enterprise",
"proxy/user_keys",
"proxy/configs",
"proxy/response_headers",
"proxy/reliability",
{
type: "category",
label: "Making LLM Requests",
items: [
"proxy/user_keys",
"proxy/response_headers",
"pass_through/vertex_ai",
"pass_through/google_ai_studio",
"pass_through/cohere",
"pass_through/anthropic_completion",
"pass_through/bedrock",
"pass_through/langfuse"
],
label: "🔑 Authentication",
items: ["proxy/virtual_keys", "proxy/token_auth", "proxy/service_accounts", "proxy/access_control","proxy/ip_address"],
},
{
type: "category",
label: "Authentication",
label: "💸 Spend Tracking + Budgets",
items: ["proxy/cost_tracking", "proxy/users", "proxy/custom_pricing", "proxy/team_budgets", "proxy/billing", "proxy/customers"],
},
{
type: "category",
label: "Routing",
items: ["proxy/load_balancing", "proxy/tag_routing", "proxy/team_based_routing", "proxy/customer_routing",],
},
{
type: "category",
label: "Use with Provider SDKs",
items: [
"proxy/virtual_keys",
"proxy/token_auth",
"proxy/service_accounts",
"proxy/access_control",
"proxy/ip_address",
"proxy/email",
"proxy/multiple_admins",
"pass_through/vertex_ai",
"pass_through/google_ai_studio",
"pass_through/cohere",
"anthropic_completion",
"pass_through/bedrock",
"pass_through/langfuse"
],
},
{
@ -97,22 +86,12 @@ const sidebars = {
},
{
type: "category",
label: "Spend Tracking + Budgets",
items: ["proxy/cost_tracking", "proxy/users", "proxy/custom_pricing", "proxy/team_budgets", "proxy/billing", "proxy/customers"],
},
{
type: "link",
label: "Load Balancing, Routing, Fallbacks",
href: "https://docs.litellm.ai/docs/routing-load-balancing",
label: "🪢 Logging, Alerting, Metrics",
items: ["proxy/logging", "proxy/bucket", "proxy/team_logging","proxy/streaming_logging", "proxy/alerting", "proxy/prometheus",],
},
{
type: "category",
label: "Logging, Alerting, Metrics",
items: ["proxy/logging", "proxy/team_logging","proxy/alerting", "proxy/prometheus",],
},
{
type: "category",
label: "[Beta] Guardrails",
label: "🛡️ [Beta] Guardrails",
items: [
"proxy/guardrails/quick_start",
"proxy/guardrails/aporia_api",
@ -127,20 +106,27 @@ const sidebars = {
},
{
type: "category",
label: "Secret Managers",
label: "Secret Manager - storing LLM API Keys",
items: [
"secret",
"oidc"
]
},
"proxy/caching",
"proxy/pass_through",
"proxy/email",
"proxy/multiple_admins",
"proxy/model_management",
"proxy/health",
"proxy/debugging",
"proxy/call_hooks",
"proxy/rules",
"proxy/cli",
]
},
{
type: "category",
label: "Supported Models & Providers",
label: "💯 Supported Models & Providers",
link: {
type: "generated-index",
title: "Providers",
@ -197,6 +183,7 @@ const sidebars = {
"providers/openrouter",
"providers/palm",
"providers/sambanova",
// "providers/custom_openai_proxy",
"providers/custom_llm_server",
"providers/petals",
@ -204,36 +191,7 @@ const sidebars = {
},
{
type: "category",
label: "Guides",
items: [
"exception_mapping",
"completion/provider_specific_params",
"guides/finetuned_models",
"completion/audio",
"completion/vision",
"completion/json_mode",
"completion/prompt_caching",
"completion/predict_outputs",
"completion/prefix",
"completion/drop_params",
"completion/prompt_formatting",
"completion/stream",
"completion/message_trimming",
"completion/function_call",
"completion/model_alias",
"completion/batching",
"completion/mock_requests",
"completion/reliable_completions",
]
},
{
type: "category",
label: "Supported Endpoints",
items: [
{
type: "category",
label: "Chat",
label: "Chat Completions (litellm.completion + PROXY)",
link: {
type: "generated-index",
title: "Chat Completions",
@ -242,27 +200,40 @@ const sidebars = {
},
items: [
"completion/input",
"completion/provider_specific_params",
"completion/json_mode",
"completion/prompt_caching",
"completion/audio",
"completion/vision",
"completion/predict_outputs",
"completion/prefix",
"completion/drop_params",
"completion/prompt_formatting",
"completion/output",
"completion/usage",
"exception_mapping",
"completion/stream",
"completion/message_trimming",
"completion/function_call",
"completion/model_alias",
"completion/batching",
"completion/mock_requests",
"completion/reliable_completions",
],
},
"text_completion",
"embedding/supported_embedding",
"image_generation",
{
type: "category",
label: "Audio",
"items": [
label: "Supported Endpoints - /images, /audio/speech, /assistants etc",
items: [
"embedding/supported_embedding",
"image_generation",
"audio_transcription",
"text_to_speech",
]
},
"rerank",
"assistants",
"batches",
"realtime",
"fine_tuning",
"moderation",
{
type: "link",
label: "Use LiteLLM Proxy with Vertex, Bedrock SDK",
@ -270,20 +241,11 @@ const sidebars = {
},
],
},
"routing",
"scheduler",
{
type: "category",
label: "Routing, Loadbalancing & Fallbacks",
link: {
type: "generated-index",
title: "Routing, Loadbalancing & Fallbacks",
description: "Learn how to load balance, route, and set fallbacks for your LLM requests",
slug: "/routing-load-balancing",
},
items: ["routing", "scheduler", "proxy/load_balancing", "proxy/reliability", "proxy/tag_routing", "proxy/provider_budget_routing", "proxy/team_based_routing", "proxy/customer_routing", "wildcard_routing"],
},
{
type: "category",
label: "LiteLLM Python SDK",
label: "🚅 LiteLLM Python SDK",
items: [
"set_keys",
"completion/token_usage",
@ -304,7 +266,6 @@ const sidebars = {
type: "category",
label: "Load Testing",
items: [
"benchmarks",
"load_test",
"load_test_advanced",
"load_test_sdk",

View file

@ -2,9 +2,7 @@
from typing import Optional, List
from litellm._logging import verbose_logger
from litellm.proxy.proxy_server import PrismaClient, HTTPException
from litellm.llms.custom_httpx.http_handler import HTTPHandler
import collections
import httpx
from datetime import datetime
@ -116,6 +114,7 @@ async def ui_get_spend_by_tags(
def _forecast_daily_cost(data: list):
import requests # type: ignore
from datetime import datetime, timedelta
if len(data) == 0:
@ -137,17 +136,17 @@ def _forecast_daily_cost(data: list):
print("last entry date", last_entry_date)
# Assuming today_date is a datetime object
today_date = datetime.now()
# Calculate the last day of the month
last_day_of_todays_month = datetime(
today_date.year, today_date.month % 12 + 1, 1
) - timedelta(days=1)
print("last day of todays month", last_day_of_todays_month)
# Calculate the remaining days in the month
remaining_days = (last_day_of_todays_month - last_entry_date).days
print("remaining days", remaining_days)
current_spend_this_month = 0
series = {}
for entry in data:
@ -177,19 +176,13 @@ def _forecast_daily_cost(data: list):
"Content-Type": "application/json",
}
client = HTTPHandler()
try:
response = client.post(
response = requests.post(
url="https://trend-api-production.up.railway.app/forecast",
json=payload,
headers=headers,
)
except httpx.HTTPStatusError as e:
raise HTTPException(
status_code=500,
detail={"error": f"Error getting forecast: {e.response.text}"},
)
# check the status code
response.raise_for_status()
json_response = response.json()
forecast_data = json_response["forecast"]
@ -213,3 +206,13 @@ def _forecast_daily_cost(data: list):
f"Predicted Spend for { today_month } 2024, ${total_predicted_spend}"
)
return {"response": response_data, "predicted_spend": predicted_spend}
# print(f"Date: {entry['date']}, Spend: {entry['spend']}, Response: {response.text}")
# _forecast_daily_cost(
# [
# {"date": "2022-01-01", "spend": 100},
# ]
# )

View file

@ -24,7 +24,6 @@ from litellm.proxy._types import (
KeyManagementSettings,
LiteLLM_UpperboundKeyGenerateParams,
)
from litellm.types.utils import StandardKeyGenerationConfig
import httpx
import dotenv
from enum import Enum
@ -58,7 +57,6 @@ _custom_logger_compatible_callbacks_literal = Literal[
"gcs_bucket",
"opik",
"argilla",
"mlflow",
]
logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None
_known_custom_logger_compatible_callbacks: List = list(
@ -68,7 +66,6 @@ callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] =
langfuse_default_tags: Optional[List[str]] = None
langsmith_batch_size: Optional[int] = None
argilla_batch_size: Optional[int] = None
datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
argilla_transformation_object: Optional[Dict[str, Any]] = None
_async_input_callback: List[Callable] = (
[]
@ -135,7 +132,7 @@ use_client: bool = False
ssl_verify: Union[str, bool] = True
ssl_certificate: Optional[str] = None
disable_streaming_logging: bool = False
in_memory_llm_clients_cache: InMemoryCache = InMemoryCache()
in_memory_llm_clients_cache: dict = {}
safe_memory_mode: bool = False
enable_azure_ad_token_refresh: Optional[bool] = False
### DEFAULT AZURE API VERSION ###
@ -275,7 +272,6 @@ s3_callback_params: Optional[Dict] = None
generic_logger_headers: Optional[Dict] = None
default_key_generate_params: Optional[Dict] = None
upperbound_key_generate_params: Optional[LiteLLM_UpperboundKeyGenerateParams] = None
key_generation_settings: Optional[StandardKeyGenerationConfig] = None
default_internal_user_params: Optional[Dict] = None
default_team_settings: Optional[List] = None
max_user_budget: Optional[float] = None
@ -283,23 +279,15 @@ default_max_internal_user_budget: Optional[float] = None
max_internal_user_budget: Optional[float] = None
internal_user_budget_duration: Optional[str] = None
max_end_user_budget: Optional[float] = None
disable_end_user_cost_tracking: Optional[bool] = None
#### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
#### Networking settings ####
request_timeout: float = 6000 # time in seconds
force_ipv4: bool = (
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
)
module_level_aclient = AsyncHTTPHandler(
timeout=request_timeout, client_alias="module level aclient"
)
module_level_client = HTTPHandler(timeout=request_timeout)
#### RETRIES ####
num_retries: Optional[int] = None # per model endpoint
max_fallbacks: Optional[int] = None
default_fallbacks: Optional[List] = None
@ -316,7 +304,7 @@ secret_manager_client: Optional[Any] = (
)
_google_kms_resource_name: Optional[str] = None
_key_management_system: Optional[KeyManagementSystem] = None
_key_management_settings: KeyManagementSettings = KeyManagementSettings()
_key_management_settings: Optional[KeyManagementSettings] = None
#### PII MASKING ####
output_parse_pii: bool = False
#############################################
@ -387,7 +375,6 @@ open_ai_text_completion_models: List = []
cohere_models: List = []
cohere_chat_models: List = []
mistral_chat_models: List = []
text_completion_codestral_models: List = []
anthropic_models: List = []
empower_models: List = []
openrouter_models: List = []
@ -414,19 +401,6 @@ deepinfra_models: List = []
perplexity_models: List = []
watsonx_models: List = []
gemini_models: List = []
xai_models: List = []
deepseek_models: List = []
azure_ai_models: List = []
voyage_models: List = []
databricks_models: List = []
cloudflare_models: List = []
codestral_models: List = []
friendliai_models: List = []
palm_models: List = []
groq_models: List = []
azure_models: List = []
anyscale_models: List = []
cerebras_models: List = []
def add_known_models():
@ -503,34 +477,6 @@ def add_known_models():
# ignore the 'up-to', '-to-' model names -> not real models. just for cost tracking based on model params.
if "-to-" not in key:
fireworks_ai_embedding_models.append(key)
elif value.get("litellm_provider") == "text-completion-codestral":
text_completion_codestral_models.append(key)
elif value.get("litellm_provider") == "xai":
xai_models.append(key)
elif value.get("litellm_provider") == "deepseek":
deepseek_models.append(key)
elif value.get("litellm_provider") == "azure_ai":
azure_ai_models.append(key)
elif value.get("litellm_provider") == "voyage":
voyage_models.append(key)
elif value.get("litellm_provider") == "databricks":
databricks_models.append(key)
elif value.get("litellm_provider") == "cloudflare":
cloudflare_models.append(key)
elif value.get("litellm_provider") == "codestral":
codestral_models.append(key)
elif value.get("litellm_provider") == "friendliai":
friendliai_models.append(key)
elif value.get("litellm_provider") == "palm":
palm_models.append(key)
elif value.get("litellm_provider") == "groq":
groq_models.append(key)
elif value.get("litellm_provider") == "azure":
azure_models.append(key)
elif value.get("litellm_provider") == "anyscale":
anyscale_models.append(key)
elif value.get("litellm_provider") == "cerebras":
cerebras_models.append(key)
add_known_models()
@ -776,20 +722,6 @@ model_list = (
+ vertex_language_models
+ watsonx_models
+ gemini_models
+ text_completion_codestral_models
+ xai_models
+ deepseek_models
+ azure_ai_models
+ voyage_models
+ databricks_models
+ cloudflare_models
+ codestral_models
+ friendliai_models
+ palm_models
+ groq_models
+ azure_models
+ anyscale_models
+ cerebras_models
)
@ -846,7 +778,6 @@ class LlmProviders(str, Enum):
FIREWORKS_AI = "fireworks_ai"
FRIENDLIAI = "friendliai"
WATSONX = "watsonx"
WATSONX_TEXT = "watsonx_text"
TRITON = "triton"
PREDIBASE = "predibase"
DATABRICKS = "databricks"
@ -863,7 +794,6 @@ provider_list: List[Union[LlmProviders, str]] = list(LlmProviders)
models_by_provider: dict = {
"openai": open_ai_chat_completion_models + open_ai_text_completion_models,
"text-completion-openai": open_ai_text_completion_models,
"cohere": cohere_models + cohere_chat_models,
"cohere_chat": cohere_chat_models,
"anthropic": anthropic_models,
@ -887,23 +817,6 @@ models_by_provider: dict = {
"watsonx": watsonx_models,
"gemini": gemini_models,
"fireworks_ai": fireworks_ai_models + fireworks_ai_embedding_models,
"aleph_alpha": aleph_alpha_models,
"text-completion-codestral": text_completion_codestral_models,
"xai": xai_models,
"deepseek": deepseek_models,
"mistral": mistral_chat_models,
"azure_ai": azure_ai_models,
"voyage": voyage_models,
"databricks": databricks_models,
"cloudflare": cloudflare_models,
"codestral": codestral_models,
"nlp_cloud": nlp_cloud_models,
"friendliai": friendliai_models,
"palm": palm_models,
"groq": groq_models,
"azure": azure_models,
"anyscale": anyscale_models,
"cerebras": cerebras_models,
}
# mapping for those models which have larger equivalents
@ -973,11 +886,10 @@ from .utils import (
supports_response_schema,
supports_parallel_function_calling,
supports_vision,
supports_audio_input,
supports_audio_output,
supports_system_messages,
get_litellm_params,
acreate,
get_model_list,
get_max_tokens,
get_model_info,
register_prompt_template,
@ -1072,11 +984,10 @@ from .llms.bedrock.common_utils import (
AmazonAnthropicClaude3Config,
AmazonCohereConfig,
AmazonLlamaConfig,
AmazonStabilityConfig,
AmazonMistralConfig,
AmazonBedrockGlobalConfig,
)
from .llms.bedrock.image.amazon_stability1_transformation import AmazonStabilityConfig
from .llms.bedrock.image.amazon_stability3_transformation import AmazonStability3Config
from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config
from .llms.bedrock.embed.amazon_titan_multimodal_transformation import (
AmazonTitanMultimodalEmbeddingG1Config,
@ -1134,9 +1045,7 @@ from .llms.AzureOpenAI.azure import (
from .llms.AzureOpenAI.chat.gpt_transformation import AzureOpenAIConfig
from .llms.hosted_vllm.chat.transformation import HostedVLLMChatConfig
from .llms.deepseek.chat.transformation import DeepSeekChatConfig
from .llms.lm_studio.chat.transformation import LMStudioChatConfig
from .llms.lm_studio.embed.transformation import LmStudioEmbeddingConfig
from .llms.perplexity.chat.transformation import PerplexityChatConfig
from .llms.AzureOpenAI.chat.o1_transformation import AzureOpenAIO1Config
from .llms.watsonx.completion.handler import IBMWatsonXAIConfig

View file

@ -12,13 +12,13 @@ import json
# s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
import os
from typing import Dict, List, Optional, Union
from typing import List, Optional, Union
import redis # type: ignore
import redis.asyncio as async_redis # type: ignore
import litellm
from litellm import get_secret, get_secret_str
from litellm import get_secret
from ._logging import verbose_logger
@ -141,13 +141,6 @@ def _get_redis_client_logic(**env_overrides):
if _sentinel_nodes is not None and isinstance(_sentinel_nodes, str):
redis_kwargs["sentinel_nodes"] = json.loads(_sentinel_nodes)
_sentinel_password: Optional[str] = redis_kwargs.get(
"sentinel_password", None
) or get_secret_str("REDIS_SENTINEL_PASSWORD")
if _sentinel_password is not None:
redis_kwargs["sentinel_password"] = _sentinel_password
_service_name: Optional[str] = redis_kwargs.get("service_name", None) or get_secret( # type: ignore
"REDIS_SERVICE_NAME"
)
@ -224,7 +217,6 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis:
sentinel_nodes = redis_kwargs.get("sentinel_nodes")
sentinel_password = redis_kwargs.get("sentinel_password")
service_name = redis_kwargs.get("service_name")
if not sentinel_nodes or not service_name:
@ -235,11 +227,7 @@ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis:
verbose_logger.debug("init_redis_sentinel: sentinel nodes are being initialized.")
# Set up the Sentinel client
sentinel = async_redis.Sentinel(
sentinel_nodes,
socket_timeout=0.1,
password=sentinel_password,
)
sentinel = async_redis.Sentinel(sentinel_nodes, socket_timeout=0.1)
# Return the master instance for the given service
@ -313,13 +301,12 @@ def get_redis_async_client(**env_overrides) -> async_redis.Redis:
def get_redis_connection_pool(**env_overrides):
redis_kwargs = _get_redis_client_logic(**env_overrides)
verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs)
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
return async_redis.BlockingConnectionPool.from_url(
timeout=5, url=redis_kwargs["url"]
)
connection_class = async_redis.Connection
if "ssl" in redis_kwargs:
if "ssl" in redis_kwargs and redis_kwargs["ssl"] is not None:
connection_class = async_redis.SSLConnection
redis_kwargs.pop("ssl", None)
redis_kwargs["connection_class"] = connection_class

View file

@ -8,7 +8,6 @@ Has 4 methods:
- async_get_cache
"""
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Optional
if TYPE_CHECKING:
@ -19,7 +18,7 @@ else:
Span = Any
class BaseCache(ABC):
class BaseCache:
def __init__(self, default_ttl: int = 60):
self.default_ttl = default_ttl
@ -38,10 +37,6 @@ class BaseCache(ABC):
async def async_set_cache(self, key, value, **kwargs):
raise NotImplementedError
@abstractmethod
async def async_set_cache_pipeline(self, cache_list, **kwargs):
pass
def get_cache(self, key, **kwargs):
raise NotImplementedError

View file

@ -233,18 +233,19 @@ class Cache:
if self.namespace is not None and isinstance(self.cache, RedisCache):
self.cache.namespace = self.namespace
def get_cache_key(self, **kwargs) -> str:
def get_cache_key(self, *args, **kwargs) -> str:
"""
Get the cache key for the given arguments.
Args:
*args: args to litellm.completion() or embedding()
**kwargs: kwargs to litellm.completion() or embedding()
Returns:
str: The cache key generated from the arguments, or None if no cache key could be generated.
"""
cache_key = ""
# verbose_logger.debug("\nGetting Cache key. Kwargs: %s", kwargs)
verbose_logger.debug("\nGetting Cache key. Kwargs: %s", kwargs)
preset_cache_key = self._get_preset_cache_key_from_kwargs(**kwargs)
if preset_cache_key is not None:
@ -520,7 +521,7 @@ class Cache:
return cached_response
return cached_result
def get_cache(self, **kwargs):
def get_cache(self, *args, **kwargs):
"""
Retrieves the cached result for the given arguments.
@ -532,13 +533,13 @@ class Cache:
The cached result if it exists, otherwise None.
"""
try: # never block execution
if self.should_use_cache(**kwargs) is not True:
if self.should_use_cache(*args, **kwargs) is not True:
return
messages = kwargs.get("messages", [])
if "cache_key" in kwargs:
cache_key = kwargs["cache_key"]
else:
cache_key = self.get_cache_key(**kwargs)
cache_key = self.get_cache_key(*args, **kwargs)
if cache_key is not None:
cache_control_args = kwargs.get("cache", {})
max_age = cache_control_args.get(
@ -552,28 +553,29 @@ class Cache:
print_verbose(f"An exception occurred: {traceback.format_exc()}")
return None
async def async_get_cache(self, **kwargs):
async def async_get_cache(self, *args, **kwargs):
"""
Async get cache implementation.
Used for embedding calls in async wrapper
"""
try: # never block execution
if self.should_use_cache(**kwargs) is not True:
if self.should_use_cache(*args, **kwargs) is not True:
return
kwargs.get("messages", [])
if "cache_key" in kwargs:
cache_key = kwargs["cache_key"]
else:
cache_key = self.get_cache_key(**kwargs)
cache_key = self.get_cache_key(*args, **kwargs)
if cache_key is not None:
cache_control_args = kwargs.get("cache", {})
max_age = cache_control_args.get(
"s-max-age", cache_control_args.get("s-maxage", float("inf"))
)
cached_result = await self.cache.async_get_cache(cache_key, **kwargs)
cached_result = await self.cache.async_get_cache(
cache_key, *args, **kwargs
)
return self._get_cache_logic(
cached_result=cached_result, max_age=max_age
)
@ -581,7 +583,7 @@ class Cache:
print_verbose(f"An exception occurred: {traceback.format_exc()}")
return None
def _add_cache_logic(self, result, **kwargs):
def _add_cache_logic(self, result, *args, **kwargs):
"""
Common implementation across sync + async add_cache functions
"""
@ -589,7 +591,7 @@ class Cache:
if "cache_key" in kwargs:
cache_key = kwargs["cache_key"]
else:
cache_key = self.get_cache_key(**kwargs)
cache_key = self.get_cache_key(*args, **kwargs)
if cache_key is not None:
if isinstance(result, BaseModel):
result = result.model_dump_json()
@ -611,7 +613,7 @@ class Cache:
except Exception as e:
raise e
def add_cache(self, result, **kwargs):
def add_cache(self, result, *args, **kwargs):
"""
Adds a result to the cache.
@ -623,42 +625,41 @@ class Cache:
None
"""
try:
if self.should_use_cache(**kwargs) is not True:
if self.should_use_cache(*args, **kwargs) is not True:
return
cache_key, cached_data, kwargs = self._add_cache_logic(
result=result, **kwargs
result=result, *args, **kwargs
)
self.cache.set_cache(cache_key, cached_data, **kwargs)
except Exception as e:
verbose_logger.exception(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
async def async_add_cache(self, result, **kwargs):
async def async_add_cache(self, result, *args, **kwargs):
"""
Async implementation of add_cache
"""
try:
if self.should_use_cache(**kwargs) is not True:
if self.should_use_cache(*args, **kwargs) is not True:
return
if self.type == "redis" and self.redis_flush_size is not None:
# high traffic - fill in results in memory and then flush
await self.batch_cache_write(result, **kwargs)
await self.batch_cache_write(result, *args, **kwargs)
else:
cache_key, cached_data, kwargs = self._add_cache_logic(
result=result, **kwargs
result=result, *args, **kwargs
)
await self.cache.async_set_cache(cache_key, cached_data, **kwargs)
except Exception as e:
verbose_logger.exception(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
async def async_add_cache_pipeline(self, result, **kwargs):
async def async_add_cache_pipeline(self, result, *args, **kwargs):
"""
Async implementation of add_cache for Embedding calls
Does a bulk write, to prevent using too many clients
"""
try:
if self.should_use_cache(**kwargs) is not True:
if self.should_use_cache(*args, **kwargs) is not True:
return
# set default ttl if not set
@ -667,27 +668,29 @@ class Cache:
cache_list = []
for idx, i in enumerate(kwargs["input"]):
preset_cache_key = self.get_cache_key(**{**kwargs, "input": i})
preset_cache_key = self.get_cache_key(*args, **{**kwargs, "input": i})
kwargs["cache_key"] = preset_cache_key
embedding_response = result.data[idx]
cache_key, cached_data, kwargs = self._add_cache_logic(
result=embedding_response,
*args,
**kwargs,
)
cache_list.append((cache_key, cached_data))
await self.cache.async_set_cache_pipeline(cache_list=cache_list, **kwargs)
# if async_set_cache_pipeline:
# await async_set_cache_pipeline(cache_list=cache_list, **kwargs)
# else:
# tasks = []
# for val in cache_list:
# tasks.append(self.cache.async_set_cache(val[0], val[1], **kwargs))
# await asyncio.gather(*tasks)
async_set_cache_pipeline = getattr(
self.cache, "async_set_cache_pipeline", None
)
if async_set_cache_pipeline:
await async_set_cache_pipeline(cache_list=cache_list, **kwargs)
else:
tasks = []
for val in cache_list:
tasks.append(self.cache.async_set_cache(val[0], val[1], **kwargs))
await asyncio.gather(*tasks)
except Exception as e:
verbose_logger.exception(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
def should_use_cache(self, **kwargs):
def should_use_cache(self, *args, **kwargs):
"""
Returns true if we should use the cache for LLM API calls
@ -705,8 +708,10 @@ class Cache:
return True
return False
async def batch_cache_write(self, result, **kwargs):
cache_key, cached_data, kwargs = self._add_cache_logic(result=result, **kwargs)
async def batch_cache_write(self, result, *args, **kwargs):
cache_key, cached_data, kwargs = self._add_cache_logic(
result=result, *args, **kwargs
)
await self.cache.batch_cache_write(cache_key, cached_data, **kwargs)
async def ping(self):

View file

@ -137,7 +137,7 @@ class LLMCachingHandler:
if litellm.cache is not None and self._is_call_type_supported_by_cache(
original_function=original_function
):
verbose_logger.debug("Checking Cache")
print_verbose("Checking Cache")
cached_result = await self._retrieve_from_cache(
call_type=call_type,
kwargs=kwargs,
@ -145,7 +145,7 @@ class LLMCachingHandler:
)
if cached_result is not None and not isinstance(cached_result, list):
verbose_logger.debug("Cache Hit!")
print_verbose("Cache Hit!")
cache_hit = True
end_time = datetime.datetime.now()
model, _, _, _ = litellm.get_llm_provider(
@ -215,7 +215,6 @@ class LLMCachingHandler:
final_embedding_cached_response=final_embedding_cached_response,
embedding_all_elements_cache_hit=embedding_all_elements_cache_hit,
)
verbose_logger.debug(f"CACHE RESULT: {cached_result}")
return CachingHandlerResponse(
cached_result=cached_result,
final_embedding_cached_response=final_embedding_cached_response,
@ -234,19 +233,12 @@ class LLMCachingHandler:
from litellm.utils import CustomStreamWrapper
args = args or ()
new_kwargs = kwargs.copy()
new_kwargs.update(
convert_args_to_kwargs(
self.original_function,
args,
)
)
cached_result: Optional[Any] = None
if litellm.cache is not None and self._is_call_type_supported_by_cache(
original_function=original_function
):
print_verbose("Checking Cache")
cached_result = litellm.cache.get_cache(**new_kwargs)
cached_result = litellm.cache.get_cache(*args, **kwargs)
if cached_result is not None:
if "detail" in cached_result:
# implies an error occurred
@ -483,21 +475,14 @@ class LLMCachingHandler:
if litellm.cache is None:
return None
new_kwargs = kwargs.copy()
new_kwargs.update(
convert_args_to_kwargs(
self.original_function,
args,
)
)
cached_result: Optional[Any] = None
if call_type == CallTypes.aembedding.value and isinstance(
new_kwargs["input"], list
kwargs["input"], list
):
tasks = []
for idx, i in enumerate(new_kwargs["input"]):
for idx, i in enumerate(kwargs["input"]):
preset_cache_key = litellm.cache.get_cache_key(
**{**new_kwargs, "input": i}
*args, **{**kwargs, "input": i}
)
tasks.append(litellm.cache.async_get_cache(cache_key=preset_cache_key))
cached_result = await asyncio.gather(*tasks)
@ -508,9 +493,9 @@ class LLMCachingHandler:
cached_result = None
else:
if litellm.cache._supports_async() is True:
cached_result = await litellm.cache.async_get_cache(**new_kwargs)
cached_result = await litellm.cache.async_get_cache(*args, **kwargs)
else: # for s3 caching. [NOT RECOMMENDED IN PROD - this will slow down responses since boto3 is sync]
cached_result = litellm.cache.get_cache(**new_kwargs)
cached_result = litellm.cache.get_cache(*args, **kwargs)
return cached_result
def _convert_cached_result_to_model_response(
@ -595,7 +580,6 @@ class LLMCachingHandler:
model_response_object=EmbeddingResponse(),
response_type="embedding",
)
elif (
call_type == CallTypes.arerank.value or call_type == CallTypes.rerank.value
) and isinstance(cached_result, dict):
@ -619,13 +603,6 @@ class LLMCachingHandler:
response_type="audio_transcription",
hidden_params=hidden_params,
)
if (
hasattr(cached_result, "_hidden_params")
and cached_result._hidden_params is not None
and isinstance(cached_result._hidden_params, dict)
):
cached_result._hidden_params["cache_hit"] = True
return cached_result
def _convert_cached_stream_response(
@ -681,19 +658,12 @@ class LLMCachingHandler:
Raises:
None
"""
new_kwargs = kwargs.copy()
new_kwargs.update(
convert_args_to_kwargs(
original_function,
args,
)
)
kwargs.update(convert_args_to_kwargs(result, original_function, kwargs, args))
if litellm.cache is None:
return
# [OPTIONAL] ADD TO CACHE
if self._should_store_result_in_cache(
original_function=original_function, kwargs=new_kwargs
original_function=original_function, kwargs=kwargs
):
if (
isinstance(result, litellm.ModelResponse)
@ -703,29 +673,29 @@ class LLMCachingHandler:
):
if (
isinstance(result, EmbeddingResponse)
and isinstance(new_kwargs["input"], list)
and isinstance(kwargs["input"], list)
and litellm.cache is not None
and not isinstance(
litellm.cache.cache, S3Cache
) # s3 doesn't support bulk writing. Exclude.
):
asyncio.create_task(
litellm.cache.async_add_cache_pipeline(result, **new_kwargs)
litellm.cache.async_add_cache_pipeline(result, **kwargs)
)
elif isinstance(litellm.cache.cache, S3Cache):
threading.Thread(
target=litellm.cache.add_cache,
args=(result,),
kwargs=new_kwargs,
kwargs=kwargs,
).start()
else:
asyncio.create_task(
litellm.cache.async_add_cache(
result.model_dump_json(), **new_kwargs
result.model_dump_json(), **kwargs
)
)
else:
asyncio.create_task(litellm.cache.async_add_cache(result, **new_kwargs))
asyncio.create_task(litellm.cache.async_add_cache(result, **kwargs))
def sync_set_cache(
self,
@ -736,20 +706,16 @@ class LLMCachingHandler:
"""
Sync internal method to add the result to the cache
"""
new_kwargs = kwargs.copy()
new_kwargs.update(
convert_args_to_kwargs(
self.original_function,
args,
)
kwargs.update(
convert_args_to_kwargs(result, self.original_function, kwargs, args)
)
if litellm.cache is None:
return
if self._should_store_result_in_cache(
original_function=self.original_function, kwargs=new_kwargs
original_function=self.original_function, kwargs=kwargs
):
litellm.cache.add_cache(result, **new_kwargs)
litellm.cache.add_cache(result, **kwargs)
return
@ -899,7 +865,9 @@ class LLMCachingHandler:
def convert_args_to_kwargs(
result: Any,
original_function: Callable,
kwargs: Dict[str, Any],
args: Optional[Tuple[Any, ...]] = None,
) -> Dict[str, Any]:
# Get the signature of the original function

View file

@ -24,6 +24,7 @@ class DiskCache(BaseCache):
self.disk_cache = dc.Cache(disk_cache_dir)
def set_cache(self, key, value, **kwargs):
print_verbose("DiskCache: set_cache")
if "ttl" in kwargs:
self.disk_cache.set(key, value, expire=kwargs["ttl"])
else:
@ -32,10 +33,10 @@ class DiskCache(BaseCache):
async def async_set_cache(self, key, value, **kwargs):
self.set_cache(key=key, value=value, **kwargs)
async def async_set_cache_pipeline(self, cache_list, **kwargs):
async def async_set_cache_pipeline(self, cache_list, ttl=None):
for cache_key, cache_value in cache_list:
if "ttl" in kwargs:
self.set_cache(key=cache_key, value=cache_value, ttl=kwargs["ttl"])
if ttl is not None:
self.set_cache(key=cache_key, value=cache_value, ttl=ttl)
else:
self.set_cache(key=cache_key, value=cache_value)

View file

@ -314,8 +314,7 @@ class DualCache(BaseCache):
f"LiteLLM Cache: Excepton async add_cache: {str(e)}"
)
# async_batch_set_cache
async def async_set_cache_pipeline(
async def async_batch_set_cache(
self, cache_list: list, local_only: bool = False, **kwargs
):
"""

View file

@ -9,7 +9,6 @@ Has 4 methods:
"""
import ast
import asyncio
import json
from typing import Any
@ -423,9 +422,3 @@ class QdrantSemanticCache(BaseCache):
async def _collection_info(self):
return self.collection_info
async def async_set_cache_pipeline(self, cache_list, **kwargs):
tasks = []
for val in cache_list:
tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
await asyncio.gather(*tasks)

View file

@ -20,7 +20,6 @@ from typing import TYPE_CHECKING, Any, List, Optional, Tuple
import litellm
from litellm._logging import print_verbose, verbose_logger
from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
from litellm.types.caching import RedisPipelineIncrementOperation
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
from litellm.types.utils import all_litellm_params
@ -405,7 +404,7 @@ class RedisCache(BaseCache):
parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
)
)
return None
return results
except Exception as e:
## LOGGING ##
end_time = time.time()
@ -891,92 +890,3 @@ class RedisCache(BaseCache):
def delete_cache(self, key):
self.redis_client.delete(key)
async def _pipeline_increment_helper(
self,
pipe: pipeline,
increment_list: List[RedisPipelineIncrementOperation],
) -> Optional[List[float]]:
"""Helper function for pipeline increment operations"""
# Iterate through each increment operation and add commands to pipeline
for increment_op in increment_list:
cache_key = self.check_and_fix_namespace(key=increment_op["key"])
print_verbose(
f"Increment ASYNC Redis Cache PIPELINE: key: {cache_key}\nValue {increment_op['increment_value']}\nttl={increment_op['ttl']}"
)
pipe.incrbyfloat(cache_key, increment_op["increment_value"])
if increment_op["ttl"] is not None:
_td = timedelta(seconds=increment_op["ttl"])
pipe.expire(cache_key, _td)
# Execute the pipeline and return results
results = await pipe.execute()
print_verbose(f"Increment ASYNC Redis Cache PIPELINE: results: {results}")
return results
async def async_increment_pipeline(
self, increment_list: List[RedisPipelineIncrementOperation], **kwargs
) -> Optional[List[float]]:
"""
Use Redis Pipelines for bulk increment operations
Args:
increment_list: List of RedisPipelineIncrementOperation dicts containing:
- key: str
- increment_value: float
- ttl_seconds: int
"""
# don't waste a network request if there's nothing to increment
if len(increment_list) == 0:
return None
from redis.asyncio import Redis
_redis_client: Redis = self.init_async_client() # type: ignore
start_time = time.time()
print_verbose(
f"Increment Async Redis Cache Pipeline: increment list: {increment_list}"
)
try:
async with _redis_client as redis_client:
async with redis_client.pipeline(transaction=True) as pipe:
results = await self._pipeline_increment_helper(
pipe, increment_list
)
print_verbose(f"pipeline increment results: {results}")
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="async_increment_pipeline",
start_time=start_time,
end_time=end_time,
parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
)
)
return results
except Exception as e:
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="async_increment_pipeline",
start_time=start_time,
end_time=end_time,
parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
)
)
verbose_logger.error(
"LiteLLM Redis Caching: async increment_pipeline() - Got exception from REDIS %s",
str(e),
)
raise e

View file

@ -9,7 +9,6 @@ Has 4 methods:
"""
import ast
import asyncio
import json
from typing import Any
@ -332,9 +331,3 @@ class RedisSemanticCache(BaseCache):
async def _index_info(self):
return await self.index.ainfo()
async def async_set_cache_pipeline(self, cache_list, **kwargs):
tasks = []
for val in cache_list:
tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
await asyncio.gather(*tasks)

View file

@ -10,7 +10,6 @@ Has 4 methods:
"""
import ast
import asyncio
import json
from typing import Any, Optional
@ -154,9 +153,3 @@ class S3Cache(BaseCache):
async def disconnect(self):
pass
async def async_set_cache_pipeline(self, cache_list, **kwargs):
tasks = []
for val in cache_list:
tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
await asyncio.gather(*tasks)

View file

@ -28,9 +28,6 @@ from litellm.llms.azure_ai.cost_calculator import (
from litellm.llms.AzureOpenAI.cost_calculation import (
cost_per_token as azure_openai_cost_per_token,
)
from litellm.llms.bedrock.image.cost_calculator import (
cost_calculator as bedrock_image_cost_calculator,
)
from litellm.llms.cohere.cost_calculator import (
cost_per_query as cohere_rerank_cost_per_query,
)
@ -46,9 +43,6 @@ from litellm.llms.OpenAI.cost_calculation import (
from litellm.llms.OpenAI.cost_calculation import cost_per_token as openai_cost_per_token
from litellm.llms.OpenAI.cost_calculation import cost_router as openai_cost_router
from litellm.llms.together_ai.cost_calculator import get_model_params_and_category
from litellm.llms.vertex_ai_and_google_ai_studio.image_generation.cost_calculator import (
cost_calculator as vertex_ai_image_cost_calculator,
)
from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.rerank import RerankResponse
from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
@ -174,6 +168,7 @@ def cost_per_token( # noqa: PLR0915
model_with_provider = model_with_provider_and_region
else:
_, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
model_without_prefix = model
model_parts = model.split("/", 1)
if len(model_parts) > 1:
@ -456,6 +451,7 @@ def _select_model_name_for_cost_calc(
if base_model is not None:
return base_model
return_model = model
if isinstance(completion_response, str):
return return_model
@ -525,13 +521,12 @@ def completion_cost( # noqa: PLR0915
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ###
size: Optional[str] = None,
size=None,
quality=None,
n=None, # number of images
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
optional_params: Optional[dict] = None,
) -> float:
"""
Calculate the cost of a given completion call fot GPT-3.5-turbo, llama2, any litellm supported llm.
@ -621,8 +616,7 @@ def completion_cost( # noqa: PLR0915
f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} "
)
model = _select_model_name_for_cost_calc(
model=model,
completion_response=completion_response,
model=model, completion_response=completion_response
)
hidden_params = getattr(completion_response, "_hidden_params", None)
if hidden_params is not None:
@ -670,22 +664,10 @@ def completion_cost( # noqa: PLR0915
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
if isinstance(completion_response, ImageResponse):
return vertex_ai_image_cost_calculator(
model=model,
image_response=completion_response,
)
elif custom_llm_provider == "bedrock":
if isinstance(completion_response, ImageResponse):
return bedrock_image_cost_calculator(
model=model,
size=size,
image_response=completion_response,
optional_params=optional_params,
)
raise TypeError(
"completion_response must be of type ImageResponse for bedrock image cost calculation"
)
# https://cloud.google.com/vertex-ai/generative-ai/pricing
# Vertex Charges Flat $0.20 per image
return 0.020
if size is None:
size = "1024-x-1024" # openai default
# fix size to match naming convention
@ -695,9 +677,9 @@ def completion_cost( # noqa: PLR0915
image_gen_model_name_with_quality = image_gen_model_name
if quality is not None:
image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}"
size_parts = size.split("-x-")
height = int(size_parts[0]) # if it's 1024-x-1024 vs. 1024x1024
width = int(size_parts[1])
size = size.split("-x-")
height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024
width = int(size[1])
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
verbose_logger.debug(
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
@ -857,14 +839,11 @@ def response_cost_calculator(
if isinstance(response_object, BaseModel):
response_object._hidden_params["optional_params"] = optional_params
if isinstance(response_object, ImageResponse):
if base_model is not None:
model = base_model
response_cost = completion_cost(
completion_response=response_object,
model=model,
call_type=call_type,
custom_llm_provider=custom_llm_provider,
optional_params=optional_params,
)
else:
if custom_pricing is True: # override defaults if custom pricing is set

View file

@ -423,7 +423,7 @@ class SlackAlerting(CustomBatchLogger):
latency_cache_keys = [(key, 0) for key in latency_keys]
failed_request_cache_keys = [(key, 0) for key in failed_request_keys]
combined_metrics_cache_keys = latency_cache_keys + failed_request_cache_keys
await self.internal_usage_cache.async_set_cache_pipeline(
await self.internal_usage_cache.async_batch_set_cache(
cache_list=combined_metrics_cache_keys
)

View file

@ -21,7 +21,6 @@ class CustomBatchLogger(CustomLogger):
self,
flush_lock: Optional[asyncio.Lock] = None,
batch_size: Optional[int] = DEFAULT_BATCH_SIZE,
flush_interval: Optional[int] = DEFAULT_FLUSH_INTERVAL_SECONDS,
**kwargs,
) -> None:
"""
@ -29,7 +28,7 @@ class CustomBatchLogger(CustomLogger):
flush_lock (Optional[asyncio.Lock], optional): Lock to use when flushing the queue. Defaults to None. Only used for custom loggers that do batching
"""
self.log_queue: List = []
self.flush_interval = flush_interval or DEFAULT_FLUSH_INTERVAL_SECONDS
self.flush_interval = DEFAULT_FLUSH_INTERVAL_SECONDS # 10 seconds
self.batch_size: int = batch_size or DEFAULT_BATCH_SIZE
self.last_flush_time = time.time()
self.flush_lock = flush_lock

View file

@ -32,11 +32,9 @@ from litellm.llms.custom_httpx.http_handler import (
get_async_httpx_client,
httpxSpecialProvider,
)
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.integrations.datadog import *
from litellm.types.services import ServiceLoggerPayload
from litellm.types.utils import StandardLoggingPayload
from .types import DD_ERRORS, DatadogPayload, DataDogStatus
from .utils import make_json_serializable
DD_MAX_BATCH_SIZE = 1000 # max number of logs DD API can accept
@ -108,20 +106,20 @@ class DataDogLogger(CustomBatchLogger):
verbose_logger.debug(
"Datadog: Logging - Enters logging function for model %s", kwargs
)
await self._log_async_event(kwargs, response_obj, start_time, end_time)
except Exception as e:
verbose_logger.exception(
f"Datadog Layer Error - {str(e)}\n{traceback.format_exc()}"
dd_payload = self.create_datadog_logging_payload(
kwargs=kwargs,
response_obj=response_obj,
start_time=start_time,
end_time=end_time,
)
pass
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
try:
self.log_queue.append(dd_payload)
verbose_logger.debug(
"Datadog: Logging - Enters logging function for model %s", kwargs
f"Datadog, event added to queue. Will flush in {self.flush_interval} seconds..."
)
await self._log_async_event(kwargs, response_obj, start_time, end_time)
if len(self.log_queue) >= self.batch_size:
await self.async_send_batch()
except Exception as e:
verbose_logger.exception(
@ -183,14 +181,6 @@ class DataDogLogger(CustomBatchLogger):
verbose_logger.debug(
"Datadog: Logging - Enters logging function for model %s", kwargs
)
if litellm.datadog_use_v1 is True:
dd_payload = self._create_v0_logging_payload(
kwargs=kwargs,
response_obj=response_obj,
start_time=start_time,
end_time=end_time,
)
else:
dd_payload = self.create_datadog_logging_payload(
kwargs=kwargs,
response_obj=response_obj,
@ -225,22 +215,6 @@ class DataDogLogger(CustomBatchLogger):
pass
pass
async def _log_async_event(self, kwargs, response_obj, start_time, end_time):
dd_payload = self.create_datadog_logging_payload(
kwargs=kwargs,
response_obj=response_obj,
start_time=start_time,
end_time=end_time,
)
self.log_queue.append(dd_payload)
verbose_logger.debug(
f"Datadog, event added to queue. Will flush in {self.flush_interval} seconds..."
)
if len(self.log_queue) >= self.batch_size:
await self.async_send_batch()
def create_datadog_logging_payload(
self,
kwargs: Union[dict, Any],
@ -262,29 +236,73 @@ class DataDogLogger(CustomBatchLogger):
"""
import json
standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object", None
)
if standard_logging_object is None:
raise ValueError("standard_logging_object not found in kwargs")
litellm_params = kwargs.get("litellm_params", {})
metadata = (
litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None
messages = kwargs.get("messages")
optional_params = kwargs.get("optional_params", {})
call_type = kwargs.get("call_type", "litellm.completion")
cache_hit = kwargs.get("cache_hit", False)
usage = response_obj["usage"]
id = response_obj.get("id", str(uuid.uuid4()))
usage = dict(usage)
try:
response_time = (end_time - start_time).total_seconds() * 1000
except Exception:
response_time = None
status = DataDogStatus.INFO
if standard_logging_object.get("status") == "failure":
status = DataDogStatus.ERROR
try:
response_obj = dict(response_obj)
except Exception:
response_obj = response_obj
# Clean Metadata before logging - never log raw metadata
# the raw metadata can contain circular references which leads to infinite recursion
# we clean out all extra litellm metadata params before logging
clean_metadata = {}
if isinstance(metadata, dict):
for key, value in metadata.items():
# clean litellm metadata before logging
if key in [
"endpoint",
"caching_groups",
"previous_models",
]:
continue
else:
clean_metadata[key] = value
# Build the initial payload
make_json_serializable(standard_logging_object)
json_payload = json.dumps(standard_logging_object)
payload = {
"id": id,
"call_type": call_type,
"cache_hit": cache_hit,
"start_time": start_time,
"end_time": end_time,
"response_time": response_time,
"model": kwargs.get("model", ""),
"user": kwargs.get("user", ""),
"model_parameters": optional_params,
"spend": kwargs.get("response_cost", 0),
"messages": messages,
"response": response_obj,
"usage": usage,
"metadata": clean_metadata,
}
make_json_serializable(payload)
json_payload = json.dumps(payload)
verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload)
dd_payload = DatadogPayload(
ddsource=self._get_datadog_source(),
ddtags=self._get_datadog_tags(),
hostname=self._get_datadog_hostname(),
ddsource=os.getenv("DD_SOURCE", "litellm"),
ddtags="",
hostname="",
message=json_payload,
service=self._get_datadog_service(),
status=status,
service="litellm-server",
status=DataDogStatus.INFO,
)
return dd_payload
@ -364,140 +382,3 @@ class DataDogLogger(CustomBatchLogger):
No user has asked for this so far, this might be spammy on datatdog. If need arises we can implement this
"""
return
async def async_post_call_failure_hook(
self,
request_data: dict,
original_exception: Exception,
user_api_key_dict: UserAPIKeyAuth,
):
"""
Handles Proxy Errors (not-related to LLM API), ex: Authentication Errors
"""
import json
_exception_payload = DatadogProxyFailureHookJsonMessage(
exception=str(original_exception),
error_class=str(original_exception.__class__.__name__),
status_code=getattr(original_exception, "status_code", None),
traceback=traceback.format_exc(),
user_api_key_dict=user_api_key_dict.model_dump(),
)
json_payload = json.dumps(_exception_payload)
verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload)
dd_payload = DatadogPayload(
ddsource=self._get_datadog_source(),
ddtags=self._get_datadog_tags(),
hostname=self._get_datadog_hostname(),
message=json_payload,
service=self._get_datadog_service(),
status=DataDogStatus.ERROR,
)
self.log_queue.append(dd_payload)
def _create_v0_logging_payload(
self,
kwargs: Union[dict, Any],
response_obj: Any,
start_time: datetime.datetime,
end_time: datetime.datetime,
) -> DatadogPayload:
"""
Note: This is our V1 Version of DataDog Logging Payload
(Not Recommended) If you want this to get logged set `litellm.datadog_use_v1 = True`
"""
import json
litellm_params = kwargs.get("litellm_params", {})
metadata = (
litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None
messages = kwargs.get("messages")
optional_params = kwargs.get("optional_params", {})
call_type = kwargs.get("call_type", "litellm.completion")
cache_hit = kwargs.get("cache_hit", False)
usage = response_obj["usage"]
id = response_obj.get("id", str(uuid.uuid4()))
usage = dict(usage)
try:
response_time = (end_time - start_time).total_seconds() * 1000
except Exception:
response_time = None
try:
response_obj = dict(response_obj)
except Exception:
response_obj = response_obj
# Clean Metadata before logging - never log raw metadata
# the raw metadata can contain circular references which leads to infinite recursion
# we clean out all extra litellm metadata params before logging
clean_metadata = {}
if isinstance(metadata, dict):
for key, value in metadata.items():
# clean litellm metadata before logging
if key in [
"endpoint",
"caching_groups",
"previous_models",
]:
continue
else:
clean_metadata[key] = value
# Build the initial payload
payload = {
"id": id,
"call_type": call_type,
"cache_hit": cache_hit,
"start_time": start_time,
"end_time": end_time,
"response_time": response_time,
"model": kwargs.get("model", ""),
"user": kwargs.get("user", ""),
"model_parameters": optional_params,
"spend": kwargs.get("response_cost", 0),
"messages": messages,
"response": response_obj,
"usage": usage,
"metadata": clean_metadata,
}
make_json_serializable(payload)
json_payload = json.dumps(payload)
verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload)
dd_payload = DatadogPayload(
ddsource=self._get_datadog_source(),
ddtags=self._get_datadog_tags(),
hostname=self._get_datadog_hostname(),
message=json_payload,
service=self._get_datadog_service(),
status=DataDogStatus.INFO,
)
return dd_payload
@staticmethod
def _get_datadog_tags():
return f"env:{os.getenv('DD_ENV', 'unknown')},service:{os.getenv('DD_SERVICE', 'litellm')},version:{os.getenv('DD_VERSION', 'unknown')}"
@staticmethod
def _get_datadog_source():
return os.getenv("DD_SOURCE", "litellm")
@staticmethod
def _get_datadog_service():
return os.getenv("DD_SERVICE", "litellm-server")
@staticmethod
def _get_datadog_hostname():
return ""
@staticmethod
def _get_datadog_env():
return os.getenv("DD_ENV", "unknown")

View file

@ -1,5 +1,5 @@
from enum import Enum
from typing import Optional, TypedDict
from typing import TypedDict
class DataDogStatus(str, Enum):
@ -19,11 +19,3 @@ class DatadogPayload(TypedDict, total=False):
class DD_ERRORS(Enum):
DATADOG_413_ERROR = "Datadog API Error - Payload too large (batch is above 5MB uncompressed). If you want this logged either disable request/response logging or set `DD_BATCH_SIZE=50`"
class DatadogProxyFailureHookJsonMessage(TypedDict, total=False):
exception: str
error_class: str
status_code: Optional[int]
traceback: str
user_api_key_dict: dict

View file

@ -1,4 +1,3 @@
import asyncio
import json
import os
import uuid
@ -11,12 +10,10 @@ from pydantic import BaseModel, Field
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.proxy._types import CommonProxyErrors, SpendLogsMetadata, SpendLogsPayload
from litellm.types.integrations.gcs_bucket import *
from litellm.types.utils import (
StandardCallbackDynamicParams,
StandardLoggingMetadata,
@ -30,8 +27,12 @@ else:
IAM_AUTH_KEY = "IAM_AUTH"
GCS_DEFAULT_BATCH_SIZE = 2048
GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
class GCSLoggingConfig(TypedDict):
bucket_name: str
vertex_instance: VertexBase
path_service_account: Optional[str]
class GCSBucketLogger(GCSBucketBase):
@ -40,21 +41,6 @@ class GCSBucketLogger(GCSBucketBase):
super().__init__(bucket_name=bucket_name)
self.vertex_instances: Dict[str, VertexBase] = {}
# Init Batch logging settings
self.log_queue: List[GCSLogQueueItem] = []
self.batch_size = int(os.getenv("GCS_BATCH_SIZE", GCS_DEFAULT_BATCH_SIZE))
self.flush_interval = int(
os.getenv("GCS_FLUSH_INTERVAL", GCS_DEFAULT_FLUSH_INTERVAL_SECONDS)
)
asyncio.create_task(self.periodic_flush())
self.flush_lock = asyncio.Lock()
super().__init__(
flush_lock=self.flush_lock,
batch_size=self.batch_size,
flush_interval=self.flush_interval,
)
if premium_user is not True:
raise ValueError(
f"GCS Bucket logging is a premium feature. Please upgrade to use it. {CommonProxyErrors.not_premium_user.value}"
@ -74,23 +60,54 @@ class GCSBucketLogger(GCSBucketBase):
kwargs,
response_obj,
)
gcs_logging_config: GCSLoggingConfig = await self.get_gcs_logging_config(
kwargs
)
headers = await self.construct_request_headers(
vertex_instance=gcs_logging_config["vertex_instance"],
service_account_json=gcs_logging_config["path_service_account"],
)
bucket_name = gcs_logging_config["bucket_name"]
logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object", None
)
if logging_payload is None:
raise ValueError("standard_logging_object not found in kwargs")
# Add to logging queue - this will be flushed periodically
self.log_queue.append(
GCSLogQueueItem(
payload=logging_payload, kwargs=kwargs, response_obj=response_obj
)
)
json_logged_payload = json.dumps(logging_payload, default=str)
# Get the current date
current_date = datetime.now().strftime("%Y-%m-%d")
# Modify the object_name to include the date-based folder
object_name = f"{current_date}/{response_obj['id']}"
try:
response = await self.async_httpx_client.post(
headers=headers,
url=f"https://storage.googleapis.com/upload/storage/v1/b/{bucket_name}/o?uploadType=media&name={object_name}",
data=json_logged_payload,
)
except httpx.HTTPStatusError as e:
raise Exception(f"GCS Bucket logging error: {e.response.text}")
if response.status_code != 200:
verbose_logger.error("GCS Bucket logging error: %s", str(response.text))
verbose_logger.debug("GCS Bucket response %s", response)
verbose_logger.debug("GCS Bucket status code %s", response.status_code)
verbose_logger.debug("GCS Bucket response.text %s", response.text)
except Exception as e:
verbose_logger.exception(f"GCS Bucket logging error: {str(e)}")
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
from litellm.proxy.proxy_server import premium_user
if premium_user is not True:
raise ValueError(
f"GCS Bucket logging is a premium feature. Please upgrade to use it. {CommonProxyErrors.not_premium_user.value}"
)
try:
verbose_logger.debug(
"GCS Logger: async_log_failure_event logging kwargs: %s, response_obj: %s",
@ -98,124 +115,35 @@ class GCSBucketLogger(GCSBucketBase):
response_obj,
)
logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object", None
)
if logging_payload is None:
raise ValueError("standard_logging_object not found in kwargs")
# Add to logging queue - this will be flushed periodically
self.log_queue.append(
GCSLogQueueItem(
payload=logging_payload, kwargs=kwargs, response_obj=response_obj
)
)
except Exception as e:
verbose_logger.exception(f"GCS Bucket logging error: {str(e)}")
async def async_send_batch(self):
"""
Process queued logs in batch - sends logs to GCS Bucket
GCS Bucket does not have a Batch endpoint to batch upload logs
Instead, we
- collect the logs to flush every `GCS_FLUSH_INTERVAL` seconds
- during async_send_batch, we make 1 POST request per log to GCS Bucket
"""
if not self.log_queue:
return
try:
for log_item in self.log_queue:
logging_payload = log_item["payload"]
kwargs = log_item["kwargs"]
response_obj = log_item.get("response_obj", None) or {}
gcs_logging_config: GCSLoggingConfig = (
await self.get_gcs_logging_config(kwargs)
gcs_logging_config: GCSLoggingConfig = await self.get_gcs_logging_config(
kwargs
)
headers = await self.construct_request_headers(
vertex_instance=gcs_logging_config["vertex_instance"],
service_account_json=gcs_logging_config["path_service_account"],
)
bucket_name = gcs_logging_config["bucket_name"]
object_name = self._get_object_name(
kwargs, logging_payload, response_obj
)
await self._log_json_data_on_gcs(
headers=headers,
bucket_name=bucket_name,
object_name=object_name,
logging_payload=logging_payload,
logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object", None
)
# Clear the queue after processing
self.log_queue.clear()
if logging_payload is None:
raise ValueError("standard_logging_object not found in kwargs")
except Exception as e:
verbose_logger.exception(f"GCS Bucket batch logging error: {str(e)}")
_litellm_params = kwargs.get("litellm_params") or {}
metadata = _litellm_params.get("metadata") or {}
def _get_object_name(
self, kwargs: Dict, logging_payload: StandardLoggingPayload, response_obj: Any
) -> str:
"""
Get the object name to use for the current payload
"""
current_date = datetime.now().strftime("%Y-%m-%d")
if logging_payload.get("error_str", None) is not None:
object_name = f"{current_date}/failure-{uuid.uuid4().hex}"
else:
object_name = f"{current_date}/{response_obj.get('id', '')}"
# used for testing
_litellm_params = kwargs.get("litellm_params", None) or {}
_metadata = _litellm_params.get("metadata", None) or {}
if "gcs_log_id" in _metadata:
object_name = _metadata["gcs_log_id"]
return object_name
def _handle_folders_in_bucket_name(
self,
bucket_name: str,
object_name: str,
) -> Tuple[str, str]:
"""
Handles when the user passes a bucket name with a folder postfix
Example:
- Bucket name: "my-bucket/my-folder/dev"
- Object name: "my-object"
- Returns: bucket_name="my-bucket", object_name="my-folder/dev/my-object"
"""
if "/" in bucket_name:
bucket_name, prefix = bucket_name.split("/", 1)
object_name = f"{prefix}/{object_name}"
return bucket_name, object_name
return bucket_name, object_name
async def _log_json_data_on_gcs(
self,
headers: Dict[str, str],
bucket_name: str,
object_name: str,
logging_payload: StandardLoggingPayload,
):
"""
Helper function to make POST request to GCS Bucket in the specified bucket.
"""
json_logged_payload = json.dumps(logging_payload, default=str)
bucket_name, object_name = self._handle_folders_in_bucket_name(
bucket_name=bucket_name,
object_name=object_name,
)
# Get the current date
current_date = datetime.now().strftime("%Y-%m-%d")
# Modify the object_name to include the date-based folder
object_name = f"{current_date}/failure-{uuid.uuid4().hex}"
if "gcs_log_id" in metadata:
object_name = metadata["gcs_log_id"]
response = await self.async_httpx_client.post(
headers=headers,
@ -229,6 +157,8 @@ class GCSBucketLogger(GCSBucketBase):
verbose_logger.debug("GCS Bucket response %s", response)
verbose_logger.debug("GCS Bucket status code %s", response.status_code)
verbose_logger.debug("GCS Bucket response.text %s", response.text)
except Exception as e:
verbose_logger.exception(f"GCS Bucket logging error: {str(e)}")
async def get_gcs_logging_config(
self, kwargs: Optional[Dict[str, Any]] = {}
@ -337,11 +267,6 @@ class GCSBucketLogger(GCSBucketBase):
service_account_json=gcs_logging_config["path_service_account"],
)
bucket_name = gcs_logging_config["bucket_name"]
bucket_name, object_name = self._handle_folders_in_bucket_name(
bucket_name=bucket_name,
object_name=object_name,
)
url = f"https://storage.googleapis.com/storage/v1/b/{bucket_name}/o/{object_name}?alt=media"
# Send the GET request to download the object
@ -377,11 +302,6 @@ class GCSBucketLogger(GCSBucketBase):
service_account_json=gcs_logging_config["path_service_account"],
)
bucket_name = gcs_logging_config["bucket_name"]
bucket_name, object_name = self._handle_folders_in_bucket_name(
bucket_name=bucket_name,
object_name=object_name,
)
url = f"https://storage.googleapis.com/storage/v1/b/{bucket_name}/o/{object_name}"
# Send the DELETE request to delete the object

View file

@ -9,7 +9,7 @@ from pydantic import BaseModel, Field
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import (
get_async_httpx_client,
httpxSpecialProvider,
@ -21,8 +21,8 @@ else:
VertexBase = Any
class GCSBucketBase(CustomBatchLogger):
def __init__(self, bucket_name: Optional[str] = None, **kwargs) -> None:
class GCSBucketBase(CustomLogger):
def __init__(self, bucket_name: Optional[str] = None) -> None:
self.async_httpx_client = get_async_httpx_client(
llm_provider=httpxSpecialProvider.LoggingCallback
)
@ -30,7 +30,6 @@ class GCSBucketBase(CustomBatchLogger):
_bucket_name = bucket_name or os.getenv("GCS_BUCKET_NAME")
self.path_service_account_json: Optional[str] = _path_service_account
self.BUCKET_NAME: Optional[str] = _bucket_name
super().__init__(**kwargs)
async def construct_request_headers(
self,

View file

@ -3,9 +3,8 @@
import copy
import os
import traceback
import types
from collections.abc import MutableMapping, MutableSequence, MutableSet
from typing import TYPE_CHECKING, Any, Dict, Optional, cast
from typing import TYPE_CHECKING, Any, Dict, Optional
from packaging.version import Version
from pydantic import BaseModel
@ -356,28 +355,17 @@ class LangFuseLogger:
)
)
def is_base_type(self, value: Any) -> bool:
# Check if the value is of a base type
base_types = (int, float, str, bool, list, dict, tuple)
return isinstance(value, base_types)
def _prepare_metadata(self, metadata: Optional[dict]) -> Any:
def _prepare_metadata(self, metadata) -> Any:
try:
if metadata is None:
return None
# Filter out function types from the metadata
sanitized_metadata = {k: v for k, v in metadata.items() if not callable(v)}
return copy.deepcopy(sanitized_metadata)
except Exception as e:
verbose_logger.debug(f"Langfuse Layer Error - {e}, metadata: {metadata}")
return copy.deepcopy(metadata) # Avoid modifying the original metadata
except (TypeError, copy.Error) as e:
verbose_logger.warning(f"Langfuse Layer Error - {e}")
new_metadata: Dict[str, Any] = {}
# if metadata is not a MutableMapping, return an empty dict since we can't call items() on it
if not isinstance(metadata, MutableMapping):
verbose_logger.debug(
verbose_logger.warning(
"Langfuse Layer Logging - metadata is not a MutableMapping, returning empty dict"
)
return new_metadata
@ -385,40 +373,25 @@ class LangFuseLogger:
for key, value in metadata.items():
try:
if isinstance(value, MutableMapping):
new_metadata[key] = self._prepare_metadata(cast(dict, value))
elif isinstance(value, MutableSequence):
# For lists or other mutable sequences
new_metadata[key] = list(
new_metadata[key] = self._prepare_metadata(value)
elif isinstance(value, (MutableSequence, MutableSet)):
new_metadata[key] = type(value)(
*(
(
self._prepare_metadata(cast(dict, v))
self._prepare_metadata(v)
if isinstance(v, MutableMapping)
else copy.deepcopy(v)
)
for v in value
)
elif isinstance(value, MutableSet):
# For sets specifically, create a new set by passing an iterable
new_metadata[key] = set(
(
self._prepare_metadata(cast(dict, v))
if isinstance(v, MutableMapping)
else copy.deepcopy(v)
)
for v in value
)
elif isinstance(value, BaseModel):
new_metadata[key] = value.model_dump()
elif self.is_base_type(value):
new_metadata[key] = value
else:
verbose_logger.debug(
f"Langfuse Layer Error - Unsupported metadata type: {type(value)} for key: {key}"
)
continue
new_metadata[key] = copy.deepcopy(value)
except (TypeError, copy.Error):
verbose_logger.debug(
f"Langfuse Layer Error - Couldn't copy metadata key: {key}, type of key: {type(key)}, type of value: {type(value)} - {traceback.format_exc()}"
verbose_logger.warning(
f"Langfuse Layer Error - Couldn't copy metadata key: {key} - {traceback.format_exc()}"
)
return new_metadata

View file

@ -23,8 +23,34 @@ from litellm.llms.custom_httpx.http_handler import (
get_async_httpx_client,
httpxSpecialProvider,
)
from litellm.types.integrations.langsmith import *
from litellm.types.utils import StandardCallbackDynamicParams, StandardLoggingPayload
from litellm.types.utils import StandardLoggingPayload
class LangsmithInputs(BaseModel):
model: Optional[str] = None
messages: Optional[List[Any]] = None
stream: Optional[bool] = None
call_type: Optional[str] = None
litellm_call_id: Optional[str] = None
completion_start_time: Optional[datetime] = None
temperature: Optional[float] = None
max_tokens: Optional[int] = None
custom_llm_provider: Optional[str] = None
input: Optional[List[Any]] = None
log_event_type: Optional[str] = None
original_response: Optional[Any] = None
response_cost: Optional[float] = None
# LiteLLM Virtual Key specific fields
user_api_key: Optional[str] = None
user_api_key_user_id: Optional[str] = None
user_api_key_team_alias: Optional[str] = None
class LangsmithCredentialsObject(TypedDict):
LANGSMITH_API_KEY: str
LANGSMITH_PROJECT: str
LANGSMITH_BASE_URL: str
def is_serializable(value):
@ -67,16 +93,15 @@ class LangsmithLogger(CustomBatchLogger):
)
if _batch_size:
self.batch_size = int(_batch_size)
self.log_queue: List[LangsmithQueueObject] = []
asyncio.create_task(self.periodic_flush())
self.flush_lock = asyncio.Lock()
super().__init__(**kwargs, flush_lock=self.flush_lock)
def get_credentials_from_env(
self,
langsmith_api_key: Optional[str] = None,
langsmith_project: Optional[str] = None,
langsmith_base_url: Optional[str] = None,
langsmith_api_key: Optional[str],
langsmith_project: Optional[str],
langsmith_base_url: Optional[str],
) -> LangsmithCredentialsObject:
_credentials_api_key = langsmith_api_key or os.getenv("LANGSMITH_API_KEY")
@ -107,19 +132,42 @@ class LangsmithLogger(CustomBatchLogger):
LANGSMITH_PROJECT=_credentials_project,
)
def _prepare_log_data(
self,
kwargs,
response_obj,
start_time,
end_time,
credentials: LangsmithCredentialsObject,
def _prepare_log_data( # noqa: PLR0915
self, kwargs, response_obj, start_time, end_time
):
import json
from datetime import datetime as dt
try:
_litellm_params = kwargs.get("litellm_params", {}) or {}
metadata = _litellm_params.get("metadata", {}) or {}
new_metadata = {}
for key, value in metadata.items():
if (
isinstance(value, list)
or isinstance(value, str)
or isinstance(value, int)
or isinstance(value, float)
):
new_metadata[key] = value
elif isinstance(value, BaseModel):
new_metadata[key] = value.model_dump_json()
elif isinstance(value, dict):
for k, v in value.items():
if isinstance(v, dt):
value[k] = v.isoformat()
new_metadata[key] = value
metadata = new_metadata
kwargs["user_api_key"] = metadata.get("user_api_key", None)
kwargs["user_api_key_user_id"] = metadata.get("user_api_key_user_id", None)
kwargs["user_api_key_team_alias"] = metadata.get(
"user_api_key_team_alias", None
)
project_name = metadata.get(
"project_name", credentials["LANGSMITH_PROJECT"]
"project_name", self.default_credentials["LANGSMITH_PROJECT"]
)
run_name = metadata.get("run_name", self.langsmith_default_run_name)
run_id = metadata.get("id", None)
@ -127,10 +175,16 @@ class LangsmithLogger(CustomBatchLogger):
trace_id = metadata.get("trace_id", None)
session_id = metadata.get("session_id", None)
dotted_order = metadata.get("dotted_order", None)
tags = metadata.get("tags", []) or []
verbose_logger.debug(
f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
)
# filter out kwargs to not include any dicts, langsmith throws an erros when trying to log kwargs
# logged_kwargs = LangsmithInputs(**kwargs)
# kwargs = logged_kwargs.model_dump()
# new_kwargs = {}
# Ensure everything in the payload is converted to str
payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object", None
@ -139,6 +193,7 @@ class LangsmithLogger(CustomBatchLogger):
if payload is None:
raise Exception("Error logging request payload. Payload=none.")
new_kwargs = payload
metadata = payload[
"metadata"
] # ensure logged metadata is json serializable
@ -146,12 +201,12 @@ class LangsmithLogger(CustomBatchLogger):
data = {
"name": run_name,
"run_type": "llm", # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain"
"inputs": payload,
"outputs": payload["response"],
"inputs": new_kwargs,
"outputs": new_kwargs["response"],
"session_name": project_name,
"start_time": payload["startTime"],
"end_time": payload["endTime"],
"tags": payload["request_tags"],
"start_time": new_kwargs["startTime"],
"end_time": new_kwargs["endTime"],
"tags": tags,
"extra": metadata,
}
@ -188,6 +243,37 @@ class LangsmithLogger(CustomBatchLogger):
except Exception:
raise
def _send_batch(self):
if not self.log_queue:
return
langsmith_api_key = self.default_credentials["LANGSMITH_API_KEY"]
langsmith_api_base = self.default_credentials["LANGSMITH_BASE_URL"]
url = f"{langsmith_api_base}/runs/batch"
headers = {"x-api-key": langsmith_api_key}
try:
response = requests.post(
url=url,
json=self.log_queue,
headers=headers,
)
if response.status_code >= 300:
verbose_logger.error(
f"Langsmith Error: {response.status_code} - {response.text}"
)
else:
verbose_logger.debug(
f"Batch of {len(self.log_queue)} runs successfully created"
)
self.log_queue.clear()
except Exception:
verbose_logger.exception("Langsmith Layer Error - Error sending batch.")
def log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
sampling_rate = (
@ -209,20 +295,8 @@ class LangsmithLogger(CustomBatchLogger):
kwargs,
response_obj,
)
credentials = self._get_credentials_to_use_for_request(kwargs=kwargs)
data = self._prepare_log_data(
kwargs=kwargs,
response_obj=response_obj,
start_time=start_time,
end_time=end_time,
credentials=credentials,
)
self.log_queue.append(
LangsmithQueueObject(
data=data,
credentials=credentials,
)
)
data = self._prepare_log_data(kwargs, response_obj, start_time, end_time)
self.log_queue.append(data)
verbose_logger.debug(
f"Langsmith, event added to queue. Will flush in {self.flush_interval} seconds..."
)
@ -249,20 +323,8 @@ class LangsmithLogger(CustomBatchLogger):
kwargs,
response_obj,
)
credentials = self._get_credentials_to_use_for_request(kwargs=kwargs)
data = self._prepare_log_data(
kwargs=kwargs,
response_obj=response_obj,
start_time=start_time,
end_time=end_time,
credentials=credentials,
)
self.log_queue.append(
LangsmithQueueObject(
data=data,
credentials=credentials,
)
)
data = self._prepare_log_data(kwargs, response_obj, start_time, end_time)
self.log_queue.append(data)
verbose_logger.debug(
"Langsmith logging: queue length %s, batch size %s",
len(self.log_queue),
@ -287,20 +349,8 @@ class LangsmithLogger(CustomBatchLogger):
return # Skip logging
verbose_logger.info("Langsmith Failure Event Logging!")
try:
credentials = self._get_credentials_to_use_for_request(kwargs=kwargs)
data = self._prepare_log_data(
kwargs=kwargs,
response_obj=response_obj,
start_time=start_time,
end_time=end_time,
credentials=credentials,
)
self.log_queue.append(
LangsmithQueueObject(
data=data,
credentials=credentials,
)
)
data = self._prepare_log_data(kwargs, response_obj, start_time, end_time)
self.log_queue.append(data)
verbose_logger.debug(
"Langsmith logging: queue length %s, batch size %s",
len(self.log_queue),
@ -315,58 +365,31 @@ class LangsmithLogger(CustomBatchLogger):
async def async_send_batch(self):
"""
Handles sending batches of runs to Langsmith
sends runs to /batch endpoint
self.log_queue contains LangsmithQueueObjects
Each LangsmithQueueObject has the following:
- "credentials" - credentials to use for the request (langsmith_api_key, langsmith_project, langsmith_base_url)
- "data" - data to log on to langsmith for the request
This function
- groups the queue objects by credentials
- loops through each unique credentials and sends batches to Langsmith
This was added to support key/team based logging on langsmith
"""
if not self.log_queue:
return
batch_groups = self._group_batches_by_credentials()
for batch_group in batch_groups.values():
await self._log_batch_on_langsmith(
credentials=batch_group.credentials,
queue_objects=batch_group.queue_objects,
)
async def _log_batch_on_langsmith(
self,
credentials: LangsmithCredentialsObject,
queue_objects: List[LangsmithQueueObject],
):
"""
Logs a batch of runs to Langsmith
sends runs to /batch endpoint for the given credentials
Args:
credentials: LangsmithCredentialsObject
queue_objects: List[LangsmithQueueObject]
Sends runs from self.log_queue
Returns: None
Raises: Does not raise an exception, will only verbose_logger.exception()
"""
langsmith_api_base = credentials["LANGSMITH_BASE_URL"]
langsmith_api_key = credentials["LANGSMITH_API_KEY"]
if not self.log_queue:
return
langsmith_api_base = self.default_credentials["LANGSMITH_BASE_URL"]
url = f"{langsmith_api_base}/runs/batch"
langsmith_api_key = self.default_credentials["LANGSMITH_API_KEY"]
headers = {"x-api-key": langsmith_api_key}
elements_to_log = [queue_object["data"] for queue_object in queue_objects]
try:
response = await self.async_httpx_client.post(
url=url,
json={"post": elements_to_log},
json={
"post": self.log_queue,
},
headers=headers,
)
response.raise_for_status()
@ -388,74 +411,6 @@ class LangsmithLogger(CustomBatchLogger):
f"Langsmith Layer Error - {traceback.format_exc()}"
)
def _group_batches_by_credentials(self) -> Dict[CredentialsKey, BatchGroup]:
"""Groups queue objects by credentials using a proper key structure"""
log_queue_by_credentials: Dict[CredentialsKey, BatchGroup] = {}
for queue_object in self.log_queue:
credentials = queue_object["credentials"]
key = CredentialsKey(
api_key=credentials["LANGSMITH_API_KEY"],
project=credentials["LANGSMITH_PROJECT"],
base_url=credentials["LANGSMITH_BASE_URL"],
)
if key not in log_queue_by_credentials:
log_queue_by_credentials[key] = BatchGroup(
credentials=credentials, queue_objects=[]
)
log_queue_by_credentials[key].queue_objects.append(queue_object)
return log_queue_by_credentials
def _get_credentials_to_use_for_request(
self, kwargs: Dict[str, Any]
) -> LangsmithCredentialsObject:
"""
Handles key/team based logging
If standard_callback_dynamic_params are provided, use those credentials.
Otherwise, use the default credentials.
"""
standard_callback_dynamic_params: Optional[StandardCallbackDynamicParams] = (
kwargs.get("standard_callback_dynamic_params", None)
)
if standard_callback_dynamic_params is not None:
credentials = self.get_credentials_from_env(
langsmith_api_key=standard_callback_dynamic_params.get(
"langsmith_api_key", None
),
langsmith_project=standard_callback_dynamic_params.get(
"langsmith_project", None
),
langsmith_base_url=standard_callback_dynamic_params.get(
"langsmith_base_url", None
),
)
else:
credentials = self.default_credentials
return credentials
def _send_batch(self):
"""Calls async_send_batch in an event loop"""
if not self.log_queue:
return
try:
# Try to get the existing event loop
loop = asyncio.get_event_loop()
if loop.is_running():
# If we're already in an event loop, create a task
asyncio.create_task(self.async_send_batch())
else:
# If no event loop is running, run the coroutine directly
loop.run_until_complete(self.async_send_batch())
except RuntimeError:
# If we can't get an event loop, create a new one
asyncio.run(self.async_send_batch())
def get_run_by_id(self, run_id):
langsmith_api_key = self.default_credentials["LANGSMITH_API_KEY"]

View file

@ -1,247 +0,0 @@
import json
import threading
from typing import Optional
from litellm._logging import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
class MlflowLogger(CustomLogger):
def __init__(self):
from mlflow.tracking import MlflowClient
self._client = MlflowClient()
self._stream_id_to_span = {}
self._lock = threading.Lock() # lock for _stream_id_to_span
def log_success_event(self, kwargs, response_obj, start_time, end_time):
self._handle_success(kwargs, response_obj, start_time, end_time)
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
self._handle_success(kwargs, response_obj, start_time, end_time)
def _handle_success(self, kwargs, response_obj, start_time, end_time):
"""
Log the success event as an MLflow span.
Note that this method is called asynchronously in the background thread.
"""
from mlflow.entities import SpanStatusCode
try:
verbose_logger.debug("MLflow logging start for success event")
if kwargs.get("stream"):
self._handle_stream_event(kwargs, response_obj, start_time, end_time)
else:
span = self._start_span_or_trace(kwargs, start_time)
end_time_ns = int(end_time.timestamp() * 1e9)
self._end_span_or_trace(
span=span,
outputs=response_obj,
status=SpanStatusCode.OK,
end_time_ns=end_time_ns,
)
except Exception:
verbose_logger.debug("MLflow Logging Error", stack_info=True)
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
self._handle_failure(kwargs, response_obj, start_time, end_time)
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
self._handle_failure(kwargs, response_obj, start_time, end_time)
def _handle_failure(self, kwargs, response_obj, start_time, end_time):
"""
Log the failure event as an MLflow span.
Note that this method is called *synchronously* unlike the success handler.
"""
from mlflow.entities import SpanEvent, SpanStatusCode
try:
span = self._start_span_or_trace(kwargs, start_time)
end_time_ns = int(end_time.timestamp() * 1e9)
# Record exception info as event
if exception := kwargs.get("exception"):
span.add_event(SpanEvent.from_exception(exception))
self._end_span_or_trace(
span=span,
outputs=response_obj,
status=SpanStatusCode.ERROR,
end_time_ns=end_time_ns,
)
except Exception as e:
verbose_logger.debug(f"MLflow Logging Error - {e}", stack_info=True)
def _handle_stream_event(self, kwargs, response_obj, start_time, end_time):
"""
Handle the success event for a streaming response. For streaming calls,
log_success_event handle is triggered for every chunk of the stream.
We create a single span for the entire stream request as follows:
1. For the first chunk, start a new span and store it in the map.
2. For subsequent chunks, add the chunk as an event to the span.
3. For the final chunk, end the span and remove the span from the map.
"""
from mlflow.entities import SpanStatusCode
litellm_call_id = kwargs.get("litellm_call_id")
if litellm_call_id not in self._stream_id_to_span:
with self._lock:
# Check again after acquiring lock
if litellm_call_id not in self._stream_id_to_span:
# Start a new span for the first chunk of the stream
span = self._start_span_or_trace(kwargs, start_time)
self._stream_id_to_span[litellm_call_id] = span
# Add chunk as event to the span
span = self._stream_id_to_span[litellm_call_id]
self._add_chunk_events(span, response_obj)
# If this is the final chunk, end the span. The final chunk
# has complete_streaming_response that gathers the full response.
if final_response := kwargs.get("complete_streaming_response"):
end_time_ns = int(end_time.timestamp() * 1e9)
self._end_span_or_trace(
span=span,
outputs=final_response,
status=SpanStatusCode.OK,
end_time_ns=end_time_ns,
)
# Remove the stream_id from the map
with self._lock:
self._stream_id_to_span.pop(litellm_call_id)
def _add_chunk_events(self, span, response_obj):
from mlflow.entities import SpanEvent
try:
for choice in response_obj.choices:
span.add_event(
SpanEvent(
name="streaming_chunk",
attributes={"delta": json.dumps(choice.delta.model_dump())},
)
)
except Exception:
verbose_logger.debug("Error adding chunk events to span", stack_info=True)
def _construct_input(self, kwargs):
"""Construct span inputs with optional parameters"""
inputs = {"messages": kwargs.get("messages")}
for key in ["functions", "tools", "stream", "tool_choice", "user"]:
if value := kwargs.get("optional_params", {}).pop(key, None):
inputs[key] = value
return inputs
def _extract_attributes(self, kwargs):
"""
Extract span attributes from kwargs.
With the latest version of litellm, the standard_logging_object contains
canonical information for logging. If it is not present, we extract
subset of attributes from other kwargs.
"""
attributes = {
"litellm_call_id": kwargs.get("litellm_call_id"),
"call_type": kwargs.get("call_type"),
"model": kwargs.get("model"),
}
standard_obj = kwargs.get("standard_logging_object")
if standard_obj:
attributes.update(
{
"api_base": standard_obj.get("api_base"),
"cache_hit": standard_obj.get("cache_hit"),
"usage": {
"completion_tokens": standard_obj.get("completion_tokens"),
"prompt_tokens": standard_obj.get("prompt_tokens"),
"total_tokens": standard_obj.get("total_tokens"),
},
"raw_llm_response": standard_obj.get("response"),
"response_cost": standard_obj.get("response_cost"),
"saved_cache_cost": standard_obj.get("saved_cache_cost"),
}
)
else:
litellm_params = kwargs.get("litellm_params", {})
attributes.update(
{
"model": kwargs.get("model"),
"cache_hit": kwargs.get("cache_hit"),
"custom_llm_provider": kwargs.get("custom_llm_provider"),
"api_base": litellm_params.get("api_base"),
"response_cost": kwargs.get("response_cost"),
}
)
return attributes
def _get_span_type(self, call_type: Optional[str]) -> str:
from mlflow.entities import SpanType
if call_type in ["completion", "acompletion"]:
return SpanType.LLM
elif call_type == "embeddings":
return SpanType.EMBEDDING
else:
return SpanType.LLM
def _start_span_or_trace(self, kwargs, start_time):
"""
Start an MLflow span or a trace.
If there is an active span, we start a new span as a child of
that span. Otherwise, we start a new trace.
"""
import mlflow
call_type = kwargs.get("call_type", "completion")
span_name = f"litellm-{call_type}"
span_type = self._get_span_type(call_type)
start_time_ns = int(start_time.timestamp() * 1e9)
inputs = self._construct_input(kwargs)
attributes = self._extract_attributes(kwargs)
if active_span := mlflow.get_current_active_span(): # type: ignore
return self._client.start_span(
name=span_name,
request_id=active_span.request_id,
parent_id=active_span.span_id,
span_type=span_type,
inputs=inputs,
attributes=attributes,
start_time_ns=start_time_ns,
)
else:
return self._client.start_trace(
name=span_name,
span_type=span_type,
inputs=inputs,
attributes=attributes,
start_time_ns=start_time_ns,
)
def _end_span_or_trace(self, span, outputs, end_time_ns, status):
"""End an MLflow span or a trace."""
if span.parent_id is None:
self._client.end_trace(
request_id=span.request_id,
outputs=outputs,
status=status,
end_time_ns=end_time_ns,
)
else:
self._client.end_span(
request_id=span.request_id,
span_id=span.span_id,
outputs=outputs,
status=status,
end_time_ns=end_time_ns,
)

View file

@ -2,23 +2,20 @@ import os
from dataclasses import dataclass
from datetime import datetime
from functools import wraps
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.services import ServiceLoggerPayload
from litellm.types.utils import (
ChatCompletionMessageToolCall,
EmbeddingResponse,
Function,
ImageResponse,
ModelResponse,
StandardLoggingPayload,
)
if TYPE_CHECKING:
from opentelemetry.sdk.trace.export import SpanExporter as _SpanExporter
from opentelemetry.trace import Span as _Span
from litellm.proxy._types import (
@ -27,12 +24,10 @@ if TYPE_CHECKING:
from litellm.proxy.proxy_server import UserAPIKeyAuth as _UserAPIKeyAuth
Span = _Span
SpanExporter = _SpanExporter
UserAPIKeyAuth = _UserAPIKeyAuth
ManagementEndpointLoggingPayload = _ManagementEndpointLoggingPayload
else:
Span = Any
SpanExporter = Any
UserAPIKeyAuth = Any
ManagementEndpointLoggingPayload = Any
@ -49,6 +44,7 @@ LITELLM_REQUEST_SPAN_NAME = "litellm_request"
@dataclass
class OpenTelemetryConfig:
from opentelemetry.sdk.trace.export import SpanExporter
exporter: Union[str, SpanExporter] = "console"
endpoint: Optional[str] = None
@ -81,7 +77,7 @@ class OpenTelemetryConfig:
class OpenTelemetry(CustomLogger):
def __init__(
self,
config: Optional[OpenTelemetryConfig] = None,
config: OpenTelemetryConfig = OpenTelemetryConfig.from_env(),
callback_name: Optional[str] = None,
**kwargs,
):
@ -89,9 +85,6 @@ class OpenTelemetry(CustomLogger):
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
if config is None:
config = OpenTelemetryConfig.from_env()
self.config = config
self.OTEL_EXPORTER = self.config.exporter
self.OTEL_ENDPOINT = self.config.endpoint
@ -326,8 +319,8 @@ class OpenTelemetry(CustomLogger):
span.end(end_time=self._to_ns(end_time))
if parent_otel_span is not None:
parent_otel_span.end(end_time=self._to_ns(datetime.now()))
# if parent_otel_span is not None:
# parent_otel_span.end(end_time=self._to_ns(datetime.now()))
def _handle_failure(self, kwargs, response_obj, start_time, end_time):
from opentelemetry.trace import Status, StatusCode
@ -405,28 +398,6 @@ class OpenTelemetry(CustomLogger):
except Exception:
return ""
@staticmethod
def _tool_calls_kv_pair(
tool_calls: List[ChatCompletionMessageToolCall],
) -> Dict[str, Any]:
from litellm.proxy._types import SpanAttributes
kv_pairs: Dict[str, Any] = {}
for idx, tool_call in enumerate(tool_calls):
_function = tool_call.get("function")
if not _function:
continue
keys = Function.__annotations__.keys()
for key in keys:
_value = _function.get(key)
if _value:
kv_pairs[
f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.{key}"
] = _value
return kv_pairs
def set_attributes( # noqa: PLR0915
self, span: Span, kwargs, response_obj: Optional[Any]
):
@ -621,12 +592,17 @@ class OpenTelemetry(CustomLogger):
message = choice.get("message")
tool_calls = message.get("tool_calls")
if tool_calls:
kv_pairs = OpenTelemetry._tool_calls_kv_pair(tool_calls) # type: ignore
for key, value in kv_pairs.items():
self.safe_set_attribute(
span=span,
key=key,
value=value,
key=f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.name",
value=tool_calls[0].get("function").get("name"),
)
self.safe_set_attribute(
span=span,
key=f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.arguments",
value=tool_calls[0]
.get("function")
.get("arguments"),
)
except Exception as e:
@ -724,10 +700,10 @@ class OpenTelemetry(CustomLogger):
TraceContextTextMapPropagator,
)
verbose_logger.debug("OpenTelemetry: GOT A TRACEPARENT {}".format(_traceparent))
propagator = TraceContextTextMapPropagator()
carrier = {"traceparent": _traceparent}
_parent_context = propagator.extract(carrier=carrier)
_parent_context = propagator.extract(carrier={"traceparent": _traceparent})
verbose_logger.debug("OpenTelemetry: PARENT CONTEXT {}".format(_parent_context))
return _parent_context
def _get_span_context(self, kwargs):

View file

@ -18,7 +18,6 @@ from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.integrations.prometheus import *
from litellm.types.utils import StandardLoggingPayload
from litellm.utils import get_end_user_id_for_cost_tracking
class PrometheusLogger(CustomLogger):
@ -229,13 +228,6 @@ class PrometheusLogger(CustomLogger):
"api_key_alias",
],
)
# llm api provider budget metrics
self.litellm_provider_remaining_budget_metric = Gauge(
"litellm_provider_remaining_budget_metric",
"Remaining budget for provider - used when you set provider budget limits",
labelnames=["api_provider"],
)
# Get all keys
_logged_llm_labels = [
"litellm_model_name",
@ -365,7 +357,8 @@ class PrometheusLogger(CustomLogger):
model = kwargs.get("model", "")
litellm_params = kwargs.get("litellm_params", {}) or {}
_metadata = litellm_params.get("metadata", {})
end_user_id = get_end_user_id_for_cost_tracking(litellm_params)
proxy_server_request = litellm_params.get("proxy_server_request") or {}
end_user_id = proxy_server_request.get("body", {}).get("user", None)
user_id = standard_logging_payload["metadata"]["user_api_key_user_id"]
user_api_key = standard_logging_payload["metadata"]["user_api_key_hash"]
user_api_key_alias = standard_logging_payload["metadata"]["user_api_key_alias"]
@ -664,11 +657,13 @@ class PrometheusLogger(CustomLogger):
# unpack kwargs
model = kwargs.get("model", "")
litellm_params = kwargs.get("litellm_params", {}) or {}
standard_logging_payload: StandardLoggingPayload = kwargs.get(
"standard_logging_object", {}
)
litellm_params = kwargs.get("litellm_params", {}) or {}
end_user_id = get_end_user_id_for_cost_tracking(litellm_params)
proxy_server_request = litellm_params.get("proxy_server_request") or {}
end_user_id = proxy_server_request.get("body", {}).get("user", None)
user_id = standard_logging_payload["metadata"]["user_api_key_user_id"]
user_api_key = standard_logging_payload["metadata"]["user_api_key_hash"]
user_api_key_alias = standard_logging_payload["metadata"]["user_api_key_alias"]
@ -1135,19 +1130,6 @@ class PrometheusLogger(CustomLogger):
litellm_model_name, model_id, api_base, api_provider, exception_status
).inc()
def track_provider_remaining_budget(
self, provider: str, spend: float, budget_limit: float
):
"""
Track provider remaining budget in Prometheus
"""
self.litellm_provider_remaining_budget_metric.labels(provider).set(
self._safe_get_remaining_budget(
max_budget=budget_limit,
spend=spend,
)
)
def _safe_get_remaining_budget(
self, max_budget: Optional[float], spend: Optional[float]
) -> float:

View file

@ -1,12 +0,0 @@
## Folder Contents
This folder contains general-purpose utilities that are used in multiple places in the codebase.
Core files:
- `streaming_handler.py`: The core streaming logic + streaming related helper utils
- `core_helpers.py`: code used in `types/` - e.g. `map_finish_reason`.
- `exception_mapping_utils.py`: utils for mapping exceptions to openai-compatible error types.
- `default_encoding.py`: code for loading the default encoding (tiktoken)
- `get_llm_provider_logic.py`: code for inferring the LLM provider from a given model name.
- `duration_parser.py`: code for parsing durations - e.g. "1d", "1mo", "10s"

View file

@ -3,8 +3,6 @@
import os
from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple, Union
import httpx
from litellm._logging import verbose_logger
if TYPE_CHECKING:
@ -101,28 +99,3 @@ def _get_parent_otel_span_from_kwargs(
"Error in _get_parent_otel_span_from_kwargs: " + str(e)
)
return None
def process_response_headers(response_headers: Union[httpx.Headers, dict]) -> dict:
from litellm.types.utils import OPENAI_RESPONSE_HEADERS
openai_headers = {}
processed_headers = {}
additional_headers = {}
for k, v in response_headers.items():
if k in OPENAI_RESPONSE_HEADERS: # return openai-compatible headers
openai_headers[k] = v
if k.startswith(
"llm_provider-"
): # return raw provider headers (incl. openai-compatible ones)
processed_headers[k] = v
else:
additional_headers["{}-{}".format("llm_provider", k)] = v
additional_headers = {
**openai_headers,
**processed_headers,
**additional_headers,
}
return additional_headers

View file

@ -1,21 +0,0 @@
import os
import litellm
try:
# New and recommended way to access resources
from importlib import resources
filename = str(resources.files(litellm).joinpath("llms/tokenizers"))
except (ImportError, AttributeError):
# Old way to access resources, which setuptools deprecated some time ago
import pkg_resources # type: ignore
filename = pkg_resources.resource_filename(__name__, "llms/tokenizers")
os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv(
"CUSTOM_TIKTOKEN_CACHE_DIR", filename
) # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")

View file

@ -1,92 +0,0 @@
"""
Helper utilities for parsing durations - 1s, 1d, 10d, 30d, 1mo, 2mo
duration_in_seconds is used in diff parts of the code base, example
- Router - Provider budget routing
- Proxy - Key, Team Generation
"""
import re
import time
from datetime import datetime, timedelta
from typing import Tuple
def _extract_from_regex(duration: str) -> Tuple[int, str]:
match = re.match(r"(\d+)(mo|[smhd]?)", duration)
if not match:
raise ValueError("Invalid duration format")
value, unit = match.groups()
value = int(value)
return value, unit
def get_last_day_of_month(year, month):
# Handle December case
if month == 12:
return 31
# Next month is January, so subtract a day from March 1st
next_month = datetime(year=year, month=month + 1, day=1)
last_day_of_month = (next_month - timedelta(days=1)).day
return last_day_of_month
def duration_in_seconds(duration: str) -> int:
"""
Parameters:
- duration:
- "<number>s" - seconds
- "<number>m" - minutes
- "<number>h" - hours
- "<number>d" - days
- "<number>mo" - months
Returns time in seconds till when budget needs to be reset
"""
value, unit = _extract_from_regex(duration=duration)
if unit == "s":
return value
elif unit == "m":
return value * 60
elif unit == "h":
return value * 3600
elif unit == "d":
return value * 86400
elif unit == "mo":
now = time.time()
current_time = datetime.fromtimestamp(now)
if current_time.month == 12:
target_year = current_time.year + 1
target_month = 1
else:
target_year = current_time.year
target_month = current_time.month + value
# Determine the day to set for next month
target_day = current_time.day
last_day_of_target_month = get_last_day_of_month(target_year, target_month)
if target_day > last_day_of_target_month:
target_day = last_day_of_target_month
next_month = datetime(
year=target_year,
month=target_month,
day=target_day,
hour=current_time.hour,
minute=current_time.minute,
second=current_time.second,
microsecond=current_time.microsecond,
)
# Calculate the duration until the first day of the next month
duration_until_next_month = next_month - current_time
return int(duration_until_next_month.total_seconds())
else:
raise ValueError(f"Unsupported duration unit, passed duration: {duration}")

View file

@ -239,7 +239,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ContextWindowExceededError: {exception_provider} - {message}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif (
@ -251,7 +251,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"{exception_provider} - {message}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif "A timeout occurred" in error_str:
@ -271,7 +271,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ContentPolicyViolationError: {exception_provider} - {message}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif (
@ -283,7 +283,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"{exception_provider} - {message}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif "Web server is returning an unknown error" in error_str:
@ -299,7 +299,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"RateLimitError: {exception_provider} - {message}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif (
@ -311,7 +311,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AuthenticationError: {exception_provider} - {message}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif "Mistral API raised a streaming error" in error_str:
@ -335,7 +335,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"{exception_provider} - {message}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 401:
@ -344,7 +344,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AuthenticationError: {exception_provider} - {message}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 404:
@ -353,7 +353,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"NotFoundError: {exception_provider} - {message}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 408:
@ -516,7 +516,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ReplicateException - {error_str}",
llm_provider="replicate",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "input is too long" in error_str:
exception_mapping_worked = True
@ -524,7 +524,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ReplicateException - {error_str}",
model=model,
llm_provider="replicate",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif exception_type == "ModelError":
exception_mapping_worked = True
@ -532,7 +532,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ReplicateException - {error_str}",
model=model,
llm_provider="replicate",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "Request was throttled" in error_str:
exception_mapping_worked = True
@ -540,7 +540,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ReplicateException - {error_str}",
llm_provider="replicate",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif hasattr(original_exception, "status_code"):
if original_exception.status_code == 401:
@ -549,7 +549,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ReplicateException - {original_exception.message}",
llm_provider="replicate",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
original_exception.status_code == 400
@ -560,7 +560,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ReplicateException - {original_exception.message}",
model=model,
llm_provider="replicate",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 422:
exception_mapping_worked = True
@ -568,7 +568,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ReplicateException - {original_exception.message}",
model=model,
llm_provider="replicate",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
@ -583,7 +583,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ReplicateException - {original_exception.message}",
llm_provider="replicate",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 429:
exception_mapping_worked = True
@ -591,7 +591,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ReplicateException - {original_exception.message}",
llm_provider="replicate",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 500:
exception_mapping_worked = True
@ -599,7 +599,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ReplicateException - {original_exception.message}",
llm_provider="replicate",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
exception_mapping_worked = True
raise APIError(
@ -631,7 +631,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"{custom_llm_provider}Exception: Authentication Error - {error_str}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif "token_quota_reached" in error_str:
@ -640,7 +640,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"{custom_llm_provider}Exception: Rate Limit Errror - {error_str}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
"The server received an invalid response from an upstream server."
@ -750,7 +750,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException - {error_str}\n. Enable 'litellm.modify_params=True' (for PROXY do: `litellm_settings::modify_params: True`) to insert a dummy assistant message and fix this error.",
model=model,
llm_provider="bedrock",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "Malformed input request" in error_str:
exception_mapping_worked = True
@ -758,7 +758,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException - {error_str}",
model=model,
llm_provider="bedrock",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "A conversation must start with a user message." in error_str:
exception_mapping_worked = True
@ -766,7 +766,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException - {error_str}\n. Pass in default user message via `completion(..,user_continue_message=)` or enable `litellm.modify_params=True`.\nFor Proxy: do via `litellm_settings::modify_params: True` or user_continue_message under `litellm_params`",
model=model,
llm_provider="bedrock",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
"Unable to locate credentials" in error_str
@ -778,7 +778,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException Invalid Authentication - {error_str}",
model=model,
llm_provider="bedrock",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "AccessDeniedException" in error_str:
exception_mapping_worked = True
@ -786,7 +786,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException PermissionDeniedError - {error_str}",
model=model,
llm_provider="bedrock",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
"throttlingException" in error_str
@ -797,7 +797,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException: Rate Limit Error - {error_str}",
model=model,
llm_provider="bedrock",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
"Connect timeout on endpoint URL" in error_str
@ -836,7 +836,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException - {original_exception.message}",
llm_provider="bedrock",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 400:
exception_mapping_worked = True
@ -844,7 +844,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException - {original_exception.message}",
llm_provider="bedrock",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 404:
exception_mapping_worked = True
@ -852,7 +852,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException - {original_exception.message}",
llm_provider="bedrock",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
@ -868,7 +868,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 429:
@ -877,7 +877,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 503:
@ -886,7 +886,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BedrockException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 504: # gateway timeout error
@ -907,7 +907,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"litellm.BadRequestError: SagemakerException - {error_str}",
model=model,
llm_provider="sagemaker",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
"Input validation error: `best_of` must be > 0 and <= 2"
@ -918,7 +918,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message="SagemakerException - the value of 'n' must be > 0 and <= 2 for sagemaker endpoints",
model=model,
llm_provider="sagemaker",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
"`inputs` tokens + `max_new_tokens` must be <=" in error_str
@ -929,7 +929,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"SagemakerException - {error_str}",
model=model,
llm_provider="sagemaker",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif hasattr(original_exception, "status_code"):
if original_exception.status_code == 500:
@ -951,7 +951,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"SagemakerException - {original_exception.message}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 400:
exception_mapping_worked = True
@ -959,7 +959,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"SagemakerException - {original_exception.message}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 404:
exception_mapping_worked = True
@ -967,7 +967,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"SagemakerException - {original_exception.message}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
@ -986,7 +986,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"SagemakerException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 429:
@ -995,7 +995,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"SagemakerException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 503:
@ -1004,7 +1004,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"SagemakerException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 504: # gateway timeout error
@ -1124,13 +1124,10 @@ def exception_type( # type: ignore # noqa: PLR0915
),
),
)
elif (
"500 Internal Server Error" in error_str
or "The model is overloaded." in error_str
):
elif "500 Internal Server Error" in error_str:
exception_mapping_worked = True
raise litellm.InternalServerError(
message=f"litellm.InternalServerError: VertexAIException - {error_str}",
raise ServiceUnavailableError(
message=f"litellm.ServiceUnavailableError: VertexAIException - {error_str}",
model=model,
llm_provider="vertex_ai",
litellm_debug_info=extra_information,
@ -1217,7 +1214,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message="GeminiException - Invalid api key",
model=model,
llm_provider="palm",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
if (
"504 Deadline expired before operation could complete." in error_str
@ -1235,7 +1232,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"GeminiException - {error_str}",
model=model,
llm_provider="palm",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
if (
"500 An internal error has occurred." in error_str
@ -1262,7 +1259,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"GeminiException - {error_str}",
model=model,
llm_provider="palm",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
# Dailed: Error occurred: 400 Request payload size exceeds the limit: 20000 bytes
elif custom_llm_provider == "cloudflare":
@ -1272,7 +1269,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"Cloudflare Exception - {original_exception.message}",
llm_provider="cloudflare",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
if "must have required property" in error_str:
exception_mapping_worked = True
@ -1280,7 +1277,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"Cloudflare Exception - {original_exception.message}",
llm_provider="cloudflare",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
custom_llm_provider == "cohere" or custom_llm_provider == "cohere_chat"
@ -1294,7 +1291,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"CohereException - {original_exception.message}",
llm_provider="cohere",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "too many tokens" in error_str:
exception_mapping_worked = True
@ -1302,7 +1299,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"CohereException - {original_exception.message}",
model=model,
llm_provider="cohere",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif hasattr(original_exception, "status_code"):
if (
@ -1314,7 +1311,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"CohereException - {original_exception.message}",
llm_provider="cohere",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
@ -1329,7 +1326,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"CohereException - {original_exception.message}",
llm_provider="cohere",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
"CohereConnectionError" in exception_type
@ -1339,7 +1336,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"CohereException - {original_exception.message}",
llm_provider="cohere",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "invalid type:" in error_str:
exception_mapping_worked = True
@ -1347,7 +1344,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"CohereException - {original_exception.message}",
llm_provider="cohere",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "Unexpected server error" in error_str:
exception_mapping_worked = True
@ -1355,7 +1352,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"CohereException - {original_exception.message}",
llm_provider="cohere",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
else:
if hasattr(original_exception, "status_code"):
@ -1375,7 +1372,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=error_str,
model=model,
llm_provider="huggingface",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "A valid user token is required" in error_str:
exception_mapping_worked = True
@ -1383,7 +1380,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=error_str,
llm_provider="huggingface",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "Rate limit reached" in error_str:
exception_mapping_worked = True
@ -1391,7 +1388,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=error_str,
llm_provider="huggingface",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
if hasattr(original_exception, "status_code"):
if original_exception.status_code == 401:
@ -1400,7 +1397,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"HuggingfaceException - {original_exception.message}",
llm_provider="huggingface",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 400:
exception_mapping_worked = True
@ -1408,7 +1405,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"HuggingfaceException - {original_exception.message}",
model=model,
llm_provider="huggingface",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
@ -1423,7 +1420,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"HuggingfaceException - {original_exception.message}",
llm_provider="huggingface",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 503:
exception_mapping_worked = True
@ -1431,7 +1428,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"HuggingfaceException - {original_exception.message}",
llm_provider="huggingface",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
else:
exception_mapping_worked = True
@ -1450,7 +1447,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AI21Exception - {original_exception.message}",
model=model,
llm_provider="ai21",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
if "Bad or missing API token." in original_exception.message:
exception_mapping_worked = True
@ -1458,7 +1455,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AI21Exception - {original_exception.message}",
model=model,
llm_provider="ai21",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
if hasattr(original_exception, "status_code"):
if original_exception.status_code == 401:
@ -1467,7 +1464,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AI21Exception - {original_exception.message}",
llm_provider="ai21",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
@ -1482,7 +1479,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AI21Exception - {original_exception.message}",
model=model,
llm_provider="ai21",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 429:
exception_mapping_worked = True
@ -1490,7 +1487,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AI21Exception - {original_exception.message}",
llm_provider="ai21",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
else:
exception_mapping_worked = True
@ -1509,7 +1506,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"NLPCloudException - {error_str}",
model=model,
llm_provider="nlp_cloud",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "value is not a valid" in error_str:
exception_mapping_worked = True
@ -1517,7 +1514,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"NLPCloudException - {error_str}",
model=model,
llm_provider="nlp_cloud",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
else:
exception_mapping_worked = True
@ -1542,7 +1539,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"NLPCloudException - {original_exception.message}",
llm_provider="nlp_cloud",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
original_exception.status_code == 401
@ -1553,7 +1550,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"NLPCloudException - {original_exception.message}",
llm_provider="nlp_cloud",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
original_exception.status_code == 522
@ -1574,7 +1571,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"NLPCloudException - {original_exception.message}",
llm_provider="nlp_cloud",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
original_exception.status_code == 500
@ -1597,7 +1594,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"NLPCloudException - {original_exception.message}",
model=model,
llm_provider="nlp_cloud",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
else:
exception_mapping_worked = True
@ -1623,7 +1620,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"TogetherAIException - {error_response['error']}",
model=model,
llm_provider="together_ai",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
"error" in error_response
@ -1634,7 +1631,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"TogetherAIException - {error_response['error']}",
llm_provider="together_ai",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
"error" in error_response
@ -1645,7 +1642,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"TogetherAIException - {error_response['error']}",
model=model,
llm_provider="together_ai",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "A timeout occurred" in error_str:
exception_mapping_worked = True
@ -1664,7 +1661,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"TogetherAIException - {error_response['error']}",
model=model,
llm_provider="together_ai",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif (
"error_type" in error_response
@ -1675,7 +1672,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"TogetherAIException - {error_response['error']}",
model=model,
llm_provider="together_ai",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
if hasattr(original_exception, "status_code"):
if original_exception.status_code == 408:
@ -1691,7 +1688,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"TogetherAIException - {error_response['error']}",
model=model,
llm_provider="together_ai",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 429:
exception_mapping_worked = True
@ -1699,7 +1696,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"TogetherAIException - {original_exception.message}",
llm_provider="together_ai",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 524:
exception_mapping_worked = True
@ -1727,7 +1724,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AlephAlphaException - {original_exception.message}",
llm_provider="aleph_alpha",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "InvalidToken" in error_str or "No token provided" in error_str:
exception_mapping_worked = True
@ -1735,7 +1732,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AlephAlphaException - {original_exception.message}",
llm_provider="aleph_alpha",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif hasattr(original_exception, "status_code"):
verbose_logger.debug(
@ -1754,7 +1751,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AlephAlphaException - {original_exception.message}",
llm_provider="aleph_alpha",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 429:
exception_mapping_worked = True
@ -1762,7 +1759,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AlephAlphaException - {original_exception.message}",
llm_provider="aleph_alpha",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif original_exception.status_code == 500:
exception_mapping_worked = True
@ -1770,7 +1767,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AlephAlphaException - {original_exception.message}",
llm_provider="aleph_alpha",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
raise original_exception
raise original_exception
@ -1787,7 +1784,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"OllamaException: Invalid Model/Model not loaded - {original_exception}",
model=model,
llm_provider="ollama",
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "Failed to establish a new connection" in error_str:
exception_mapping_worked = True
@ -1795,7 +1792,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"OllamaException: {original_exception}",
llm_provider="ollama",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "Invalid response object from API" in error_str:
exception_mapping_worked = True
@ -1803,7 +1800,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"OllamaException: {original_exception}",
llm_provider="ollama",
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
)
elif "Read timed out" in error_str:
exception_mapping_worked = True
@ -1837,7 +1834,6 @@ def exception_type( # type: ignore # noqa: PLR0915
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
response=getattr(original_exception, "response", None),
)
elif "This model's maximum context length is" in error_str:
exception_mapping_worked = True
@ -1846,7 +1842,6 @@ def exception_type( # type: ignore # noqa: PLR0915
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
response=getattr(original_exception, "response", None),
)
elif "DeploymentNotFound" in error_str:
exception_mapping_worked = True
@ -1855,7 +1850,6 @@ def exception_type( # type: ignore # noqa: PLR0915
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
response=getattr(original_exception, "response", None),
)
elif (
(
@ -1876,7 +1870,6 @@ def exception_type( # type: ignore # noqa: PLR0915
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
response=getattr(original_exception, "response", None),
)
elif "invalid_request_error" in error_str:
exception_mapping_worked = True
@ -1885,7 +1878,6 @@ def exception_type( # type: ignore # noqa: PLR0915
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
response=getattr(original_exception, "response", None),
)
elif (
"The api_key client option must be set either by passing api_key to the client or by setting"
@ -1897,7 +1889,6 @@ def exception_type( # type: ignore # noqa: PLR0915
llm_provider=custom_llm_provider,
model=model,
litellm_debug_info=extra_information,
response=getattr(original_exception, "response", None),
)
elif "Connection error" in error_str:
exception_mapping_worked = True
@ -1916,7 +1907,6 @@ def exception_type( # type: ignore # noqa: PLR0915
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
response=getattr(original_exception, "response", None),
)
elif original_exception.status_code == 401:
exception_mapping_worked = True
@ -1925,7 +1915,6 @@ def exception_type( # type: ignore # noqa: PLR0915
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
response=getattr(original_exception, "response", None),
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
@ -1942,7 +1931,6 @@ def exception_type( # type: ignore # noqa: PLR0915
model=model,
llm_provider="azure",
litellm_debug_info=extra_information,
response=getattr(original_exception, "response", None),
)
elif original_exception.status_code == 429:
exception_mapping_worked = True
@ -1951,7 +1939,6 @@ def exception_type( # type: ignore # noqa: PLR0915
model=model,
llm_provider="azure",
litellm_debug_info=extra_information,
response=getattr(original_exception, "response", None),
)
elif original_exception.status_code == 503:
exception_mapping_worked = True
@ -1960,7 +1947,6 @@ def exception_type( # type: ignore # noqa: PLR0915
model=model,
llm_provider="azure",
litellm_debug_info=extra_information,
response=getattr(original_exception, "response", None),
)
elif original_exception.status_code == 504: # gateway timeout error
exception_mapping_worked = True
@ -2000,7 +1986,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"{exception_provider} - {error_str}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 401:
@ -2009,7 +1995,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"AuthenticationError: {exception_provider} - {error_str}",
llm_provider=custom_llm_provider,
model=model,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 404:
@ -2018,7 +2004,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"NotFoundError: {exception_provider} - {error_str}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 408:
@ -2035,7 +2021,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"BadRequestError: {exception_provider} - {error_str}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 429:
@ -2044,7 +2030,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"RateLimitError: {exception_provider} - {error_str}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 503:
@ -2053,7 +2039,7 @@ def exception_type( # type: ignore # noqa: PLR0915
message=f"ServiceUnavailableError: {exception_provider} - {error_str}",
model=model,
llm_provider=custom_llm_provider,
response=getattr(original_exception, "response", None),
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 504: # gateway timeout error

View file

@ -161,7 +161,17 @@ def get_supported_openai_params( # noqa: PLR0915
elif custom_llm_provider == "huggingface":
return litellm.HuggingfaceConfig().get_supported_openai_params()
elif custom_llm_provider == "together_ai":
return litellm.TogetherAIConfig().get_supported_openai_params(model=model)
return [
"stream",
"temperature",
"max_tokens",
"top_p",
"stop",
"frequency_penalty",
"tools",
"tool_choice",
"response_format",
]
elif custom_llm_provider == "ai21":
return [
"stream",

View file

@ -28,7 +28,6 @@ from litellm.caching.caching_handler import LLMCachingHandler
from litellm.cost_calculator import _select_model_name_for_cost_calc
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.mlflow import MlflowLogger
from litellm.litellm_core_utils.redact_messages import (
redact_message_input_output_from_custom_logger,
redact_message_input_output_from_logging,
@ -202,7 +201,6 @@ class Logging:
start_time,
litellm_call_id: str,
function_id: str,
litellm_trace_id: Optional[str] = None,
dynamic_input_callbacks: Optional[
List[Union[str, Callable, CustomLogger]]
] = None,
@ -240,7 +238,6 @@ class Logging:
self.start_time = start_time # log the call start time
self.call_type = call_type
self.litellm_call_id = litellm_call_id
self.litellm_trace_id = litellm_trace_id
self.function_id = function_id
self.streaming_chunks: List[Any] = [] # for generating complete stream response
self.sync_streaming_chunks: List[Any] = (
@ -277,11 +274,6 @@ class Logging:
self.completion_start_time: Optional[datetime.datetime] = None
self._llm_caching_handler: Optional[LLMCachingHandler] = None
self.model_call_details = {
"litellm_trace_id": litellm_trace_id,
"litellm_call_id": litellm_call_id,
}
def process_dynamic_callbacks(self):
"""
Initializes CustomLogger compatible callbacks in self.dynamic_* callbacks
@ -389,8 +381,7 @@ class Logging:
self.logger_fn = litellm_params.get("logger_fn", None)
verbose_logger.debug(f"self.optional_params: {self.optional_params}")
self.model_call_details.update(
{
self.model_call_details = {
"model": self.model,
"messages": self.messages,
"optional_params": self.optional_params,
@ -405,7 +396,6 @@ class Logging:
**self.optional_params,
**additional_params,
}
)
## check if stream options is set ## - used by CustomStreamWrapper for easy instrumentation
if "stream_options" in additional_params:
@ -564,7 +554,6 @@ class Logging:
message=f"Model Call Details pre-call: {details_to_log}",
level="info",
)
elif isinstance(callback, CustomLogger): # custom logger class
callback.log_pre_api_call(
model=self.model,
@ -934,10 +923,19 @@ class Logging:
status="success",
)
)
callbacks = get_combined_callback_list(
dynamic_success_callbacks=self.dynamic_success_callbacks,
global_callbacks=litellm.success_callback,
)
if self.dynamic_success_callbacks is not None and isinstance(
self.dynamic_success_callbacks, list
):
callbacks = self.dynamic_success_callbacks
## keep the internal functions ##
for callback in litellm.success_callback:
if (
isinstance(callback, CustomLogger)
and "_PROXY_" in callback.__class__.__name__
):
callbacks.append(callback)
else:
callbacks = litellm.success_callback
## REDACT MESSAGES ##
result = redact_message_input_output_from_logging(
@ -1251,7 +1249,6 @@ class Logging:
end_time=end_time,
print_verbose=print_verbose,
)
if (
callback == "openmeter"
and self.model_call_details.get("litellm_params", {}).get(
@ -1359,11 +1356,8 @@ class Logging:
and customLogger is not None
): # custom logger functions
print_verbose(
"success callbacks: Running Custom Callback Function - {}".format(
callback
"success callbacks: Running Custom Callback Function"
)
)
customLogger.log_event(
kwargs=self.model_call_details,
response_obj=result,
@ -1460,10 +1454,21 @@ class Logging:
status="success",
)
)
callbacks = get_combined_callback_list(
dynamic_success_callbacks=self.dynamic_async_success_callbacks,
global_callbacks=litellm._async_success_callback,
)
if self.dynamic_async_success_callbacks is not None and isinstance(
self.dynamic_async_success_callbacks, list
):
callbacks = self.dynamic_async_success_callbacks
## keep the internal functions ##
for callback in litellm._async_success_callback:
callback_name = ""
if isinstance(callback, CustomLogger):
callback_name = callback.__class__.__name__
if callable(callback):
callback_name = callback.__name__
if "_PROXY_" in callback_name:
callbacks.append(callback)
else:
callbacks = litellm._async_success_callback
result = redact_message_input_output_from_logging(
model_call_details=(
@ -1730,10 +1735,21 @@ class Logging:
start_time=start_time,
end_time=end_time,
)
callbacks = get_combined_callback_list(
dynamic_success_callbacks=self.dynamic_failure_callbacks,
global_callbacks=litellm.failure_callback,
)
callbacks = [] # init this to empty incase it's not created
if self.dynamic_failure_callbacks is not None and isinstance(
self.dynamic_failure_callbacks, list
):
callbacks = self.dynamic_failure_callbacks
## keep the internal functions ##
for callback in litellm.failure_callback:
if (
isinstance(callback, CustomLogger)
and "_PROXY_" in callback.__class__.__name__
):
callbacks.append(callback)
else:
callbacks = litellm.failure_callback
result = None # result sent to all loggers, init this to None incase it's not created
@ -1916,10 +1932,21 @@ class Logging:
end_time=end_time,
)
callbacks = get_combined_callback_list(
dynamic_success_callbacks=self.dynamic_async_failure_callbacks,
global_callbacks=litellm._async_failure_callback,
)
callbacks = [] # init this to empty incase it's not created
if self.dynamic_async_failure_callbacks is not None and isinstance(
self.dynamic_async_failure_callbacks, list
):
callbacks = self.dynamic_async_failure_callbacks
## keep the internal functions ##
for callback in litellm._async_failure_callback:
if (
isinstance(callback, CustomLogger)
and "_PROXY_" in callback.__class__.__name__
):
callbacks.append(callback)
else:
callbacks = litellm._async_failure_callback
result = None # result sent to all loggers, init this to None incase it's not created
for callback in callbacks:
@ -2311,15 +2338,6 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
_in_memory_loggers.append(_otel_logger)
return _otel_logger # type: ignore
elif logging_integration == "mlflow":
for callback in _in_memory_loggers:
if isinstance(callback, MlflowLogger):
return callback # type: ignore
_mlflow_logger = MlflowLogger()
_in_memory_loggers.append(_mlflow_logger)
return _mlflow_logger # type: ignore
def get_custom_logger_compatible_class(
logging_integration: litellm._custom_logger_compatible_callbacks_literal,
@ -2421,12 +2439,6 @@ def get_custom_logger_compatible_class(
and callback.callback_name == "langtrace"
):
return callback
elif logging_integration == "mlflow":
for callback in _in_memory_loggers:
if isinstance(callback, MlflowLogger):
return callback
return None
@ -2762,6 +2774,11 @@ def get_standard_logging_object_payload(
metadata=metadata
)
if litellm.cache is not None:
cache_key = litellm.cache.get_cache_key(**kwargs)
else:
cache_key = None
saved_cache_cost: float = 0.0
if cache_hit is True:
@ -2794,7 +2811,6 @@ def get_standard_logging_object_payload(
payload: StandardLoggingPayload = StandardLoggingPayload(
id=str(id),
trace_id=kwargs.get("litellm_trace_id"), # type: ignore
call_type=call_type or "",
cache_hit=cache_hit,
status=status,
@ -2804,7 +2820,7 @@ def get_standard_logging_object_payload(
completionStartTime=completion_start_time_float,
model=kwargs.get("model", "") or "",
metadata=clean_metadata,
cache_key=clean_hidden_params["cache_key"],
cache_key=cache_key,
response_cost=response_cost,
total_tokens=usage.total_tokens,
prompt_tokens=usage.prompt_tokens,
@ -2911,11 +2927,3 @@ def modify_integration(integration_name, integration_params):
if integration_name == "supabase":
if "table_name" in integration_params:
Supabase.supabase_table_name = integration_params["table_name"]
def get_combined_callback_list(
dynamic_success_callbacks: Optional[List], global_callbacks: List
) -> List:
if dynamic_success_callbacks is None:
return global_callbacks
return list(set(dynamic_success_callbacks + global_callbacks))

View file

@ -1,50 +0,0 @@
from typing import Optional
import litellm
class Rules:
"""
Fail calls based on the input or llm api output
Example usage:
import litellm
def my_custom_rule(input): # receives the model response
if "i don't think i can answer" in input: # trigger fallback if the model refuses to answer
return False
return True
litellm.post_call_rules = [my_custom_rule] # have these be functions that can be called to fail a call
response = litellm.completion(model="gpt-3.5-turbo", messages=[{"role": "user",
"content": "Hey, how's it going?"}], fallbacks=["openrouter/mythomax"])
"""
def __init__(self) -> None:
pass
def pre_call_rules(self, input: str, model: str):
for rule in litellm.pre_call_rules:
if callable(rule):
decision = rule(input)
if decision is False:
raise litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model) # type: ignore
return True
def post_call_rules(self, input: Optional[str], model: str) -> bool:
if input is None:
return True
for rule in litellm.post_call_rules:
if callable(rule):
decision = rule(input)
if isinstance(decision, bool):
if decision is False:
raise litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model) # type: ignore
elif isinstance(decision, dict):
decision_val = decision.get("decision", True)
decision_message = decision.get(
"message", "LLM Response failed post-call-rule check"
)
if decision_val is False:
raise litellm.APIResponseValidationError(message=decision_message, llm_provider="", model=model) # type: ignore
return True

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,14 @@
from litellm.types.utils import GenericStreamingChunk as GChunk
def generic_chunk_has_all_required_fields(chunk: dict) -> bool:
"""
Checks if the provided chunk dictionary contains all required fields for GenericStreamingChunk.
:param chunk: The dictionary to check.
:return: True if all required fields are present, False otherwise.
"""
_all_fields = GChunk.__annotations__
decision = all(key in _all_fields for key in chunk)
return decision

View file

@ -12,11 +12,7 @@ from typing_extensions import overload
import litellm
from litellm.caching.caching import DualCache
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
get_async_httpx_client,
)
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.utils import EmbeddingResponse
from litellm.utils import (
CustomStreamWrapper,
@ -981,10 +977,7 @@ class AzureChatCompletion(BaseLLM):
else:
_params["timeout"] = httpx.Timeout(timeout=600.0, connect=5.0)
async_handler = get_async_httpx_client(
llm_provider=litellm.LlmProviders.AZURE,
params=_params,
)
async_handler = AsyncHTTPHandler(**_params) # type: ignore
else:
async_handler = client # type: ignore
@ -1528,8 +1521,7 @@ class AzureChatCompletion(BaseLLM):
prompt: Optional[str] = None,
) -> dict:
client_session = (
litellm.aclient_session
or get_async_httpx_client(llm_provider=litellm.LlmProviders.AZURE).client
litellm.aclient_session or httpx.AsyncClient()
) # handle dall-e-2 calls
if "gateway.ai.cloudflare.com" in api_base:

View file

@ -3,7 +3,7 @@ Support for gpt model family
"""
import types
from typing import List, Optional, Union
from typing import Optional, Union
import litellm
from litellm.types.llms.openai import AllMessageValues, ChatCompletionUserMessage
@ -163,8 +163,3 @@ class OpenAIGPTConfig:
model=model,
drop_params=drop_params,
)
def _transform_messages(
self, messages: List[AllMessageValues]
) -> List[AllMessageValues]:
return messages

View file

@ -17,6 +17,22 @@ from litellm.utils import CustomStreamWrapper
class OpenAIO1ChatCompletion(OpenAIChatCompletion):
async def mock_async_streaming(
self,
response: Any,
model: Optional[str],
logging_obj: Any,
):
model_response = await response
completion_stream = MockResponseIterator(model_response=model_response)
streaming_response = CustomStreamWrapper(
completion_stream=completion_stream,
model=model,
custom_llm_provider="openai",
logging_obj=logging_obj,
)
return streaming_response
def completion(
self,
model_response: ModelResponse,
@ -38,7 +54,7 @@ class OpenAIO1ChatCompletion(OpenAIChatCompletion):
custom_llm_provider: Optional[str] = None,
drop_params: Optional[bool] = None,
):
# stream: Optional[bool] = optional_params.pop("stream", False)
stream: Optional[bool] = optional_params.pop("stream", False)
response = super().completion(
model_response,
timeout,
@ -60,4 +76,20 @@ class OpenAIO1ChatCompletion(OpenAIChatCompletion):
drop_params,
)
if stream is True:
if asyncio.iscoroutine(response):
return self.mock_async_streaming(
response=response, model=model, logging_obj=logging_obj # type: ignore
)
completion_stream = MockResponseIterator(model_response=response)
streaming_response = CustomStreamWrapper(
completion_stream=completion_stream,
model=model,
custom_llm_provider="openai",
logging_obj=logging_obj,
)
return streaming_response
else:
return response

Some files were not shown because too many files have changed in this diff Show more