Merge branch 'main' into litellm_exp_mcp_server

Ishaan Jaff 2025-03-24 19:03:56 -07:00
commit 08a4ba1b7e
58 changed files with 2991 additions and 627 deletions

View file

@ -1855,7 +1855,7 @@ jobs:
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e DATABASE_URL=$CLEAN_STORE_MODEL_IN_DB_DATABASE_URL \
-e STORE_MODEL_IN_DB="True" \
-e LITELLM_MASTER_KEY="sk-1234" \
-e LITELLM_LICENSE=$LITELLM_LICENSE \

View file

@ -4,7 +4,8 @@ python-dotenv
tiktoken
importlib_metadata
cohere
redis
redis==5.2.1
redisvl==0.4.1
anthropic
orjson==3.9.15
pydantic==2.10.2

.gitignore (vendored): 1 change
View file

@ -1,3 +1,4 @@
.python-version
.venv
.env
.newenv

View file

@ -37,9 +37,6 @@ RUN pip install dist/*.whl
# install dependencies as wheels
RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps
# ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y

View file

@ -1,35 +1,5 @@
version: "3.11"
services:
litellm:
build:
context: .
args:
target: runtime
image: ghcr.io/berriai/litellm:main-stable
#########################################
## Uncomment these lines to start proxy with a config.yaml file ##
# volumes:
# - ./config.yaml:/app/config.yaml <<- this is missing in the docker-compose file currently
# command:
# - "--config=/app/config.yaml"
##############################################
ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary
environment:
DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
env_file:
- .env # Load local .env file
depends_on:
- db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
healthcheck: # Defines the health check configuration for the container
test: [ "CMD", "curl", "-f", "http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
interval: 30s # Perform health check every 30 seconds
timeout: 10s # Health check command times out after 10 seconds
retries: 3 # Retry up to 3 times if health check fails
start_period: 40s # Wait 40 seconds after container start before beginning health checks
db:
image: postgres:16
restart: always
@ -46,25 +16,3 @@ services:
interval: 1s
timeout: 5s
retries: 10
prometheus:
image: prom/prometheus
volumes:
- prometheus_data:/prometheus
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
restart: always
volumes:
prometheus_data:
driver: local
postgres_data:
name: litellm_postgres_data # Named volume for Postgres data persistence
# ...rest of your docker-compose config if any

View file

@ -59,9 +59,6 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps
# ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y

View file

@ -14,7 +14,7 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Install build dependencies
RUN apt-get clean && apt-get update && \
apt-get install -y gcc python3-dev && \
apt-get install -y gcc g++ python3-dev && \
rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir --upgrade pip && \
@ -56,10 +56,8 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
# ensure pyjwt is used, not jwt
RUN pip install redisvl==0.0.7 --no-deps --no-cache-dir && \
pip uninstall jwt -y && \
RUN pip uninstall jwt -y && \
pip uninstall PyJWT -y && \
pip install PyJWT==2.9.0 --no-cache-dir

View file

@ -26,7 +26,7 @@ Install redis
pip install redis
```
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
For the hosted version you can set up your own Redis DB here: https://redis.io/try-free/
```python
import litellm
@ -37,11 +37,11 @@ litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password
# Make completion calls
response1 = completion(
model="gpt-3.5-turbo",
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Tell me a joke."}]
)
response2 = completion(
model="gpt-3.5-turbo",
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Tell me a joke."}]
)
@ -91,12 +91,12 @@ response2 = completion(
<TabItem value="redis-sem" label="redis-semantic cache">
Install redis
Install redisvl client
```shell
pip install redisvl==0.0.7
pip install redisvl==0.4.1
```
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
For the hosted version you can set up your own Redis DB here: https://redis.io/try-free/
```python
import litellm
@ -114,6 +114,7 @@ litellm.cache = Cache(
port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"],
similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity
ttl=120,
redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
)
response1 = completion(
@ -471,11 +472,13 @@ def __init__(
password: Optional[str] = None,
namespace: Optional[str] = None,
default_in_redis_ttl: Optional[float] = None,
similarity_threshold: Optional[float] = None,
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
redis_flush_size=None,
# redis semantic cache params
similarity_threshold: Optional[float] = None,
redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
redis_semantic_cache_index_name: Optional[str] = None,
# s3 Bucket, boto3 configuration
s3_bucket_name: Optional[str] = None,
s3_region_name: Optional[str] = None,

View file

@ -200,3 +200,92 @@ Expected Response
</TabItem>
</Tabs>
## OpenAI 'file' message type
This is currently only supported for OpenAI models.
This will be supported for all providers soon.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import base64
from litellm import completion
with open("draconomicon.pdf", "rb") as f:
data = f.read()
base64_string = base64.b64encode(data).decode("utf-8")
response = completion(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
},
{
"type": "text",
"text": "What is the first dragon in the book?",
}
],
},
],
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: openai-model
litellm_params:
model: gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "openai-model",
"messages": [
{"role": "user", "content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
}
]}
]
}'
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,308 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Using Web Search
Use web search with litellm
| Feature | Details |
|---------|---------|
| Supported Endpoints | - `/chat/completions` <br/> - `/responses` |
| Supported Providers | `openai` |
| LiteLLM Cost Tracking | ✅ Supported |
| LiteLLM Version | `v1.63.15-nightly` or higher |
## `/chat/completions` (litellm.completion)
### Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import completion
response = completion(
model="openai/gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?",
}
],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o-search-preview
litellm_params:
model: openai/gpt-4o-search-preview
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
]
)
```
</TabItem>
</Tabs>
### Search context size
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import completion
# Customize search context size
response = completion(
model="openai/gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?",
}
],
web_search_options={
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
# Customize search context size
response = client.chat.completions.create(
model="gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
web_search_options={
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}
)
```
</TabItem>
</Tabs>
## `/responses` (litellm.responses)
### Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import responses
response = responses(
model="openai/gpt-4o",
input=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
tools=[{
"type": "web_search_preview" # enables web search with default medium context size
}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
response = client.responses.create(
model="gpt-4o",
tools=[{
"type": "web_search_preview"
}],
input="What was a positive news story from today?",
)
print(response.output_text)
```
</TabItem>
</Tabs>
### Search context size
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import responses
# Customize search context size
response = responses(
model="openai/gpt-4o",
input=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
tools=[{
"type": "web_search_preview",
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
# Customize search context size
response = client.responses.create(
model="gpt-4o",
tools=[{
"type": "web_search_preview",
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}],
input="What was a positive news story from today?",
)
print(response.output_text)
```
</TabItem>
</Tabs>
## Checking if a model supports web search
<Tabs>
<TabItem label="SDK" value="sdk">
Use `litellm.supports_web_search(model="openai/gpt-4o-search-preview")` -> returns `True` if model can perform web searches
```python showLineNumbers
assert litellm.supports_web_search(model="openai/gpt-4o-search-preview") == True
```
</TabItem>
<TabItem label="PROXY" value="proxy">
1. Define OpenAI models in config.yaml
```yaml
model_list:
- model_name: gpt-4o-search-preview
litellm_params:
model: openai/gpt-4o-search-preview
api_key: os.environ/OPENAI_API_KEY
model_info:
supports_web_search: True
```
2. Run proxy server
```bash
litellm --config config.yaml
```
3. Call `/model_group/info` to check if a model supports web search
```shell
curl -X 'GET' \
'http://localhost:4000/model_group/info' \
-H 'accept: application/json' \
-H 'x-api-key: sk-1234'
```
Expected Response
```json showLineNumbers
{
"data": [
{
"model_group": "gpt-4o-search-preview",
"providers": ["openai"],
"max_tokens": 128000,
"supports_web_search": true, # 👈 supports_web_search is true
}
]
}
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,66 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# SSL Security Settings
If you're in an environment using an older TLS bundle with older encryption, follow this guide.
LiteLLM uses HTTPX for network requests, unless otherwise specified.
1. Disable SSL verification
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.ssl_verify = False
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
ssl_verify: false
```
</TabItem>
<TabItem value="env_var" label="Environment Variables">
```bash
export SSL_VERIFY="False"
```
</TabItem>
</Tabs>
2. Lower security settings
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.ssl_security_level = 1
litellm.ssl_certificate = "/path/to/certificate.pem"
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
ssl_security_level: 1
ssl_certificate: "/path/to/certificate.pem"
```
</TabItem>
<TabItem value="env_var" label="Environment Variables">
```bash
export SSL_SECURITY_LEVEL="1"
export SSL_CERTIFICATE="/path/to/certificate.pem"
```
</TabItem>
</Tabs>
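For intuition, here is a minimal sketch of what a lowered security level and a custom CA bundle correspond to at the HTTPX/`ssl` layer. This is not LiteLLM's internal implementation; the endpoint URL and certificate path are placeholders.
```python
import ssl
import httpx

# Build a context that trusts a custom CA bundle and accepts
# older ciphers / key sizes (OpenSSL security level 1).
ctx = ssl.create_default_context(cafile="/path/to/certificate.pem")
ctx.set_ciphers("DEFAULT@SECLEVEL=1")

# Hand the context to an httpx client, since LiteLLM's requests go through httpx.
client = httpx.Client(verify=ctx)
response = client.get("https://legacy-endpoint.example.com/health")  # placeholder URL
print(response.status_code)
```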

View file

@ -1,4 +1,7 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Arize AI
@ -11,6 +14,8 @@ https://github.com/BerriAI/litellm
:::
<Image img={require('../../img/arize.png')} />
## Pre-Requisites
@ -24,7 +29,9 @@ You can also use the instrumentor option instead of the callback, which you can
```python
litellm.callbacks = ["arize"]
```
```python
import litellm
import os
@ -48,7 +55,7 @@ response = litellm.completion(
### Using with LiteLLM Proxy
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4
@ -60,13 +67,134 @@ model_list:
litellm_settings:
callbacks: ["arize"]
general_settings:
master_key: "sk-1234" # can also be set as an environment variable
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT or Neither (defaults to https://otlp.arize.com/v1 on grpc)
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{ "model": "gpt-4", "messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}]}'
```
## Pass Arize Space/Key per-request
Supported parameters:
- `arize_api_key`
- `arize_space_key`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set arize as a callback, litellm will send the data to arize
litellm.callbacks = ["arize"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
arize_api_key=os.getenv("ARIZE_SPACE_2_API_KEY"),
arize_space_key=os.getenv("ARIZE_SPACE_2_KEY"),
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
general_settings:
master_key: "sk-1234" # can also be set as an environment variable
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
<Tabs>
<TabItem value="curl" label="CURL">
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}],
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
"arize_space_key": "ARIZE_SPACE_2_KEY"
}'
```
</TabItem>
<TabItem value="openai_python" label="OpenAI Python">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
"arize_space_key": "ARIZE_SPACE_2_KEY"
}
)
print(response)
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)

View file

@ -291,14 +291,15 @@ response = completion(
)
```
## Azure O1 Models
## O-Series Models
| Model Name | Function Call |
|---------------------|----------------------------------------------------|
| o1-mini | `response = completion(model="azure/<your deployment name>", messages=messages)` |
| o1-preview | `response = completion(model="azure/<your deployment name>", messages=messages)` |
Azure OpenAI O-Series models are supported on LiteLLM.
Set `litellm.enable_preview_features = True` to use Azure O1 Models with streaming support.
LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
**Automatic Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
@ -306,60 +307,112 @@ Set `litellm.enable_preview_features = True` to use Azure O1 Models with streami
```python
import litellm
litellm.enable_preview_features = True # 👈 KEY CHANGE
response = litellm.completion(
model="azure/<your deployment name>",
messages=[{"role": "user", "content": "What is the weather like in Boston?"}],
stream=True
)
for chunk in response:
print(chunk)
litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="Proxy">
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: o1-mini
- model_name: o3-mini
litellm_params:
model: azure/o1-mini
api_base: "os.environ/AZURE_API_BASE"
api_key: "os.environ/AZURE_API_KEY"
api_version: "os.environ/AZURE_API_VERSION"
litellm_settings:
enable_preview_features: true # 👈 KEY CHANGE
model: azure/o3-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
2. Start proxy
</TabItem>
</Tabs>
**Explicit Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o_series/my-random-deployment-name
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
## Azure Audio Model
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
response = completion(
model="azure/azure-openai-4o-audio",
messages=[
{
"role": "user",
"content": "I want to try out speech to speech"
}
],
modalities=["text","audio"],
audio={"voice": "alloy", "format": "wav"}
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: azure-openai-4o-audio
litellm_params:
model: azure/azure-openai-4o-audio
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it
3. Test it!
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="o1-mini", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
stream=True)
for chunk in response:
print(chunk)
```
```bash
curl http://localhost:4000/v1/chat/completions \
-H "Authorization: Bearer $LITELLM_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "azure-openai-4o-audio",
"messages": [{"role": "user", "content": "I want to try out speech to speech"}],
"modalities": ["text","audio"],
"audio": {"voice": "alloy", "format": "wav"}
}'
```
</TabItem>
</Tabs>
@ -948,62 +1001,9 @@ Expected Response:
{"data":[{"id":"batch_R3V...}
```
## O-Series Models
Azure OpenAI O-Series models are supported on LiteLLM.
LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
**Automatic Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o3-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
**Explicit Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o_series/my-random-deployment-name
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>

View file

@ -1428,10 +1428,14 @@ response = litellm.embedding(
## Supported AWS Bedrock Models
LiteLLM supports ALL Bedrock models.
Here's an example of using a bedrock model with LiteLLM. For a complete list, refer to the [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
| Model Name | Command | Required Environment Variables |
|----------------------------|------------------------------------------------------------------|--------------------------------|
| Deepseek R1 | `completion(model='bedrock/us.deepseek.r1-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3.5 Sonnet | `completion(model='bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 sonnet | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 Haiku | `completion(model='bedrock/anthropic.claude-3-haiku-20240307-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
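For reference, a minimal sketch of calling one of the models in the table above with the LiteLLM SDK. The credential and region values are placeholders; any entry from the model cost map can be substituted for the model name.
```python
import os
from litellm import completion

os.environ["AWS_ACCESS_KEY_ID"] = ""       # placeholder
os.environ["AWS_SECRET_ACCESS_KEY"] = ""   # placeholder
os.environ["AWS_REGION_NAME"] = "us-east-1"  # placeholder region

response = completion(
    model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",  # from the table above
    messages=[{"role": "user", "content": "Hello from Bedrock!"}],
)
print(response.choices[0].message.content)
```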

View file

@ -202,6 +202,67 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
</TabItem>
</Tabs>
## Using Ollama FIM on `/v1/completions`
LiteLLM supports calling Ollama's `/api/generate` endpoint on `/v1/completions` requests.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm._turn_on_debug() # turn on debug to see the request
from litellm import completion
response = completion(
model="ollama/llama3.1",
prompt="Hello, world!",
api_base="http://localhost:11434"
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: "llama3.1"
litellm_params:
model: "ollama/llama3.1"
api_base: "http://localhost:11434"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml --detailed_debug
# RUNNING ON http://0.0.0.0:4000
```
3. Test it!
```python
from openai import OpenAI
client = OpenAI(
api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set)
base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL
)
response = client.completions.create(
model="ollama/llama3.1",
prompt="Hello, world!",
api_base="http://localhost:11434"
)
print(response)
```
</TabItem>
</Tabs>
## Using ollama `api/chat`
In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat`
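As a quick illustration (a sketch, assuming a local Ollama server with the `llama3.1` model pulled), the only change from the examples above is the `ollama_chat/` prefix:
```python
from litellm import completion

response = completion(
    model="ollama_chat/llama3.1",   # routes to POST /api/chat on the ollama server
    messages=[{"role": "user", "content": "Hello, world!"}],
    api_base="http://localhost:11434",  # your ollama server
)
print(response.choices[0].message.content)
```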

View file

@ -228,6 +228,92 @@ response = completion(
```
## PDF File Parsing
OpenAI has a new `file` message type that allows you to pass in a PDF file and have it parsed into a structured output. [Read more](https://platform.openai.com/docs/guides/pdf-files?api-mode=chat&lang=python)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import base64
from litellm import completion
with open("draconomicon.pdf", "rb") as f:
data = f.read()
base64_string = base64.b64encode(data).decode("utf-8")
response = completion(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
},
{
"type": "text",
"text": "What is the first dragon in the book?",
}
],
},
],
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: openai-model
litellm_params:
model: gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "openai-model",
"messages": [
{"role": "user", "content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
}
]}
]
}'
```
</TabItem>
</Tabs>
## OpenAI Fine Tuned Models
| Model Name | Function Call |
@ -449,26 +535,6 @@ response = litellm.acompletion(
)
```
### Using Helicone Proxy with LiteLLM
```python
import os
import litellm
from litellm import completion
os.environ["OPENAI_API_KEY"] = ""
# os.environ["OPENAI_API_BASE"] = ""
litellm.api_base = "https://oai.hconeai.com/v1"
litellm.headers = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",
"Helicone-Cache-Enabled": "true",
}
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = completion("gpt-3.5-turbo", messages)
```
### Using OpenAI Proxy with LiteLLM
```python

View file

@ -10,9 +10,11 @@ LiteLLM supports all the text / chat / vision models from [OpenRouter](https://o
import os
from litellm import completion
os.environ["OPENROUTER_API_KEY"] = ""
os.environ["OPENROUTER_API_BASE"] = "" # [OPTIONAL] defaults to https://openrouter.ai/api/v1
os.environ["OR_SITE_URL"] = "" # optional
os.environ["OR_APP_NAME"] = "" # optional
os.environ["OR_SITE_URL"] = "" # [OPTIONAL]
os.environ["OR_APP_NAME"] = "" # [OPTIONAL]
response = completion(
model="openrouter/google/palm-2-chat-bison",

View file

@ -70,6 +70,21 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit
response: str,
):
pass
async def async_post_call_streaming_iterator_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_data: dict,
) -> AsyncGenerator[ModelResponseStream, None]:
"""
Passes the entire stream to the guardrail
This is useful for plugins that need to see the entire stream.
"""
async for item in response:
yield item
proxy_handler_instance = MyCustomHandler()
```

View file

@ -147,6 +147,7 @@ general_settings:
|------|------|-------------|
| completion_model | string | The default model to use for completions when `model` is not specified in the request |
| disable_spend_logs | boolean | If true, turns off writing each transaction to the database |
| disable_spend_updates | boolean | If true, turns off all spend updates to the DB, including key/user/team spend updates. |
| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) |
| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached |
| disable_reset_budget | boolean | If true, turns off reset budget scheduled task |
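For illustration, a minimal sketch of how a few of these flags sit under `general_settings` in `config.yaml` (values are examples only):
```yaml
general_settings:
  master_key: sk-1234               # example only
  completion_model: gpt-4o          # default model when `model` is omitted
  disable_spend_logs: true          # don't write per-transaction spend logs
  disable_spend_updates: true       # don't write any key/user/team spend updates
  disable_reset_budget: true        # turn off the budget reset scheduled task
```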

View file

@ -10,10 +10,12 @@ Use this if you want to write code to run a custom guardrail
### 1. Write a `CustomGuardrail` Class
A CustomGuardrail has 3 methods to enforce guardrails
A CustomGuardrail has 4 methods to enforce guardrails
- `async_pre_call_hook` - (Optional) modify input or reject request before making LLM API call
- `async_moderation_hook` - (Optional) reject request, runs while making LLM API call (help to lower latency)
- `async_post_call_success_hook`- (Optional) apply guardrail on input/output, runs after making LLM API call
- `async_post_call_streaming_iterator_hook` - (Optional) pass the entire stream to the guardrail
**[See detailed spec of methods here](#customguardrail-methods)**
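For orientation, here is a bare skeleton with all four hooks. Signatures follow the spec linked above but are simplified (e.g. `call_type` typed as `str`); double-check them against your LiteLLM version before relying on this sketch.
```python
from typing import Any, AsyncGenerator, Optional, Union

from litellm.caching.caching import DualCache
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.utils import ModelResponseStream


class MySkeletonGuardrail(CustomGuardrail):
    async def async_pre_call_hook(
        self, user_api_key_dict: UserAPIKeyAuth, cache: DualCache, data: dict, call_type: str
    ) -> Optional[Union[Exception, str, dict]]:
        # modify `data`, or raise an exception to reject the request before the LLM call
        return data

    async def async_moderation_hook(
        self, data: dict, user_api_key_dict: UserAPIKeyAuth, call_type: str
    ):
        # runs while the LLM call is in flight; raise to reject
        return None

    async def async_post_call_success_hook(
        self, data: dict, user_api_key_dict: UserAPIKeyAuth, response
    ):
        # inspect the finished response; raise to block it
        return response

    async def async_post_call_streaming_iterator_hook(
        self, user_api_key_dict: UserAPIKeyAuth, response: Any, request_data: dict
    ) -> AsyncGenerator[ModelResponseStream, None]:
        # see the whole stream before it reaches the client
        async for item in response:
            yield item
```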
@ -128,6 +130,23 @@ class myCustomGuardrail(CustomGuardrail):
):
raise ValueError("Guardrail failed Coffee Detected")
async def async_post_call_streaming_iterator_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_data: dict,
) -> AsyncGenerator[ModelResponseStream, None]:
"""
Passes the entire stream to the guardrail
This is useful for guardrails that need to see the entire response, such as PII masking.
See Aim guardrail implementation for an example - https://github.com/BerriAI/litellm/blob/d0e022cfacb8e9ebc5409bb652059b6fd97b45c0/litellm/proxy/guardrails/guardrail_hooks/aim.py#L168
Triggered by mode: 'post_call'
"""
async for item in response:
yield item
```

View file

@ -79,6 +79,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
| `response_cost` | `Optional[str]` | Optional response cost |
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
| `litellm_model_name` | `Optional[str]` | Model name sent in request |
## StandardLoggingModelInformation

View file

@ -43,19 +43,19 @@ These headers are useful for clients to understand the current rate limit status
| `x-litellm-max-fallbacks` | int | Maximum number of fallback attempts allowed |
## Cost Tracking Headers
| Header | Type | Description |
|--------|------|-------------|
| `x-litellm-response-cost` | float | Cost of the API call |
| `x-litellm-key-spend` | float | Total spend for the API key |
| Header | Type | Description | Available on Pass-Through Endpoints |
|--------|------|-------------|-------------|
| `x-litellm-response-cost` | float | Cost of the API call | |
| `x-litellm-key-spend` | float | Total spend for the API key | ✅ |
## LiteLLM Specific Headers
| Header | Type | Description |
|--------|------|-------------|
| `x-litellm-call-id` | string | Unique identifier for the API call |
| `x-litellm-model-id` | string | Unique identifier for the model used |
| `x-litellm-model-api-base` | string | Base URL of the API endpoint |
| `x-litellm-version` | string | Version of LiteLLM being used |
| `x-litellm-model-group` | string | Model group identifier |
| Header | Type | Description | Available on Pass-Through Endpoints |
|--------|------|-------------|-------------|
| `x-litellm-call-id` | string | Unique identifier for the API call | ✅ |
| `x-litellm-model-id` | string | Unique identifier for the model used | |
| `x-litellm-model-api-base` | string | Base URL of the API endpoint | ✅ |
| `x-litellm-version` | string | Version of LiteLLM being used | |
| `x-litellm-model-group` | string | Model group identifier | |
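To see these headers in practice, you can dump the response headers of any proxy call. This is a sketch; the proxy URL, key, and model name are placeholders for your own deployment.
```bash
curl -sD - -o /dev/null 'http://0.0.0.0:4000/chat/completions' \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer sk-1234' \
  -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "hi"}]}' \
  | grep -i '^x-litellm'
```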
## Response headers from LLM providers

Binary image file added (707 KiB), not shown.

View file

@ -26,14 +26,6 @@ This release is primarily focused on:
- UI - Credential Management, re-use credentials when adding new models
- UI - Test Connection to LLM Provider before adding a model
:::info
This release will be live on 03/16/2025
:::
<!-- <Image img={require('../../img/release_notes/v16311_release.jpg')} /> -->
## Known Issues
- 🚨 Known issue on Azure OpenAI - We don't recommend upgrading if you use Azure OpenAI. This version failed our Azure OpenAI load test

View file

@ -0,0 +1,130 @@
---
title: v1.63.14-stable
slug: v1.63.14-stable
date: 2025-03-22T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: [credential management, thinking content, responses api, snowflake]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
These are the changes since `v1.63.11-stable`.
This release brings:
- LLM Translation Improvements (MCP Support and Bedrock Application Profiles)
- Perf improvements for Usage-based Routing
- Streaming guardrail support via websockets
## Docker Run LiteLLM Proxy
```
docker run \
-e STORE_MODEL_IN_DB=True \
-p 4000:4000 \
ghcr.io/berriai/litellm:main-v1.63.14-stable
```
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
- Azure gpt-4o - fixed pricing to latest global pricing - [PR](https://github.com/BerriAI/litellm/pull/9361)
- O1-Pro - add pricing + model information - [PR](https://github.com/BerriAI/litellm/pull/9397)
- Azure AI - mistral 3.1 small pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453)
- Azure - gpt-4.5-preview pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453)
## LLM Translation
1. **New LLM Features**
- Bedrock: Support bedrock application inference profiles [Docs](https://docs.litellm.ai/docs/providers/bedrock#bedrock-application-inference-profile)
- Infer aws region from bedrock application profile id - (`arn:aws:bedrock:us-east-1:...`)
- Ollama - support calling via `/v1/completions` [Get Started](../../docs/providers/ollama#using-ollama-fim-on-v1completions)
- Bedrock - support `us.deepseek.r1-v1:0` model name [Docs](../../docs/providers/bedrock#supported-aws-bedrock-models)
- OpenRouter - `OPENROUTER_API_BASE` env var support [Docs](../../docs/providers/openrouter.md)
- Azure - add audio model parameter support - [Docs](../../docs/providers/azure#azure-audio-model)
- OpenAI - PDF File support [Docs](../../docs/completion/document_understanding#openai-file-message-type)
- OpenAI - o1-pro Responses API streaming support [Docs](../../docs/response_api.md#streaming)
- [BETA] MCP - Use MCP Tools with LiteLLM SDK [Docs](../../docs/mcp)
2. **Bug Fixes**
- Voyage: prompt token on embedding tracking fix - [PR](https://github.com/BerriAI/litellm/commit/56d3e75b330c3c3862dc6e1c51c1210e48f1068e)
- Sagemaker - Fix Too little data for declared Content-Length error - [PR](https://github.com/BerriAI/litellm/pull/9326)
- OpenAI-compatible models - fix issue when calling openai-compatible models w/ custom_llm_provider set - [PR](https://github.com/BerriAI/litellm/pull/9355)
- VertexAI - Embedding outputDimensionality support - [PR](https://github.com/BerriAI/litellm/commit/437dbe724620675295f298164a076cbd8019d304)
- Anthropic - return consistent json response format on streaming/non-streaming - [PR](https://github.com/BerriAI/litellm/pull/9437)
## Spend Tracking Improvements
- `litellm_proxy/` - support reading litellm response cost header from proxy, when using client sdk
- Reset Budget Job - fix budget reset error on keys/teams/users [PR](https://github.com/BerriAI/litellm/pull/9329)
- Streaming - Prevents final chunk w/ usage from being ignored (impacted bedrock streaming + cost tracking) [PR](https://github.com/BerriAI/litellm/pull/9314)
## UI
1. Users Page
- Feature: Control default internal user settings [PR](https://github.com/BerriAI/litellm/pull/9328)
2. Icons:
- Feature: Replace external "artificialanalysis.ai" icons by local svg [PR](https://github.com/BerriAI/litellm/pull/9374)
3. Sign In/Sign Out
- Fix: Default login when `default_user_id` user does not exist in DB [PR](https://github.com/BerriAI/litellm/pull/9395)
## Logging Integrations
- Support post-call guardrails for streaming responses [Get Started](../../docs/proxy/guardrails/custom_guardrail#1-write-a-customguardrail-class)
- Arize [Get Started](../../docs/observability/arize_integration)
- fix invalid package import [PR](https://github.com/BerriAI/litellm/pull/9338)
- migrate to using standardloggingpayload for metadata, ensures spans land successfully [PR](https://github.com/BerriAI/litellm/pull/9338)
- fix logging to just log the LLM I/O [PR](https://github.com/BerriAI/litellm/pull/9353)
- Dynamic API Key/Space param support [Get Started](../../docs/observability/arize_integration#pass-arize-spacekey-per-request)
- StandardLoggingPayload - Log litellm_model_name in payload. Allows knowing what the model sent to API provider was [Get Started](../../docs/proxy/logging_spec#standardlogginghiddenparams)
- Prompt Management - Allow building custom prompt management integration [Get Started](../../docs/proxy/custom_prompt_management.md)
## Performance / Reliability improvements
- Redis Caching - add 5s default timeout, prevents hanging redis connection from impacting llm calls [PR](https://github.com/BerriAI/litellm/commit/db92956ae33ed4c4e3233d7e1b0c7229817159bf)
- Allow disabling all spend updates / writes to DB - patch to allow disabling all spend updates to DB with a flag [PR](https://github.com/BerriAI/litellm/pull/9331)
- Azure OpenAI - correctly re-use azure openai client, fixes perf issue from previous Stable release [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413)
- Azure OpenAI - uses litellm.ssl_verify on Azure/OpenAI clients [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413)
- Usage-based routing - Wildcard model support [Get Started](../../docs/proxy/usage_based_routing#wildcard-model-support)
- Usage-based routing - Support batch writing increments to redis - reduces latency to same as simple-shuffle [PR](https://github.com/BerriAI/litellm/pull/9357)
- Router - show reason for model cooldown on no healthy deployments available error [PR](https://github.com/BerriAI/litellm/pull/9438)
- Caching - add max value limit to an item in in-memory cache (1MB) - prevents OOM errors on large image urls being sent through proxy [PR](https://github.com/BerriAI/litellm/pull/9448)
## General Improvements
- Passthrough Endpoints - support returning api-base on pass-through endpoints Response Headers [Docs](../../docs/proxy/response_headers#litellm-specific-headers)
- SSL - support reading ssl security level from env var - Allows user to specify lower security settings [Get Started](../../docs/guides/security_settings)
- Credentials - only poll Credentials table when `STORE_MODEL_IN_DB` is True [PR](https://github.com/BerriAI/litellm/pull/9376)
- Image URL Handling - new architecture doc on image url handling [Docs](../../docs/proxy/image_handling)
- OpenAI - bump to pip install "openai==1.68.2" [PR](https://github.com/BerriAI/litellm/commit/e85e3bc52a9de86ad85c3dbb12d87664ee567a5a)
- Gunicorn - security fix - bump gunicorn==23.0.0 [PR](https://github.com/BerriAI/litellm/commit/7e9fc92f5c7fea1e7294171cd3859d55384166eb)
## Complete Git Diff
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.63.11-stable...v1.63.14.rc)

View file

@ -243,7 +243,9 @@ const sidebars = {
"exception_mapping",
"completion/provider_specific_params",
"guides/finetuned_models",
"guides/security_settings",
"completion/audio",
"completion/web_search",
"completion/document_understanding",
"completion/vision",
"completion/json_mode",

View file

@ -122,6 +122,9 @@ langsmith_batch_size: Optional[int] = None
prometheus_initialize_budget_metrics: Optional[bool] = False
argilla_batch_size: Optional[int] = None
datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
gcs_pub_sub_use_v1: Optional[bool] = (
False # if you want to use v1 gcs pubsub logged payload
)
argilla_transformation_object: Optional[Dict[str, Any]] = None
_async_input_callback: List[Union[str, Callable, CustomLogger]] = (
[]
@ -756,6 +759,7 @@ from .utils import (
create_pretrained_tokenizer,
create_tokenizer,
supports_function_calling,
supports_web_search,
supports_response_schema,
supports_parallel_function_calling,
supports_vision,

View file

@ -88,16 +88,16 @@ class Cache:
s3_aws_session_token: Optional[str] = None,
s3_config: Optional[Any] = None,
s3_path: Optional[str] = None,
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
redis_semantic_cache_index_name: Optional[str] = None,
redis_flush_size: Optional[int] = None,
redis_startup_nodes: Optional[List] = None,
disk_cache_dir=None,
disk_cache_dir: Optional[str] = None,
qdrant_api_base: Optional[str] = None,
qdrant_api_key: Optional[str] = None,
qdrant_collection_name: Optional[str] = None,
qdrant_quantization_config: Optional[str] = None,
qdrant_semantic_cache_embedding_model="text-embedding-ada-002",
qdrant_semantic_cache_embedding_model: str = "text-embedding-ada-002",
**kwargs,
):
"""
@ -170,8 +170,8 @@ class Cache:
port=port,
password=password,
similarity_threshold=similarity_threshold,
use_async=redis_semantic_cache_use_async,
embedding_model=redis_semantic_cache_embedding_model,
index_name=redis_semantic_cache_index_name,
**kwargs,
)
elif type == LiteLLMCacheType.QDRANT_SEMANTIC:

View file

@ -1,337 +1,437 @@
"""
Redis Semantic Cache implementation
Redis Semantic Cache implementation for LiteLLM
Has 4 methods:
- set_cache
- get_cache
- async_set_cache
- async_get_cache
The RedisSemanticCache provides semantic caching functionality using Redis as a backend.
This cache stores responses based on the semantic similarity of prompts rather than
exact matching, allowing for more flexible caching of LLM responses.
This implementation uses RedisVL's SemanticCache to find semantically similar prompts
and their cached responses.
"""
import ast
import asyncio
import json
from typing import Any
import os
from typing import Any, Dict, List, Optional, Tuple
import litellm
from litellm._logging import print_verbose
from litellm.litellm_core_utils.prompt_templates.common_utils import get_str_from_messages
from .base_cache import BaseCache
class RedisSemanticCache(BaseCache):
"""
Redis-backed semantic cache for LLM responses.
This cache uses vector similarity to find semantically similar prompts that have been
previously sent to the LLM, allowing for cache hits even when prompts are not identical
but carry similar meaning.
"""
DEFAULT_REDIS_INDEX_NAME: str = "litellm_semantic_cache_index"
def __init__(
self,
host=None,
port=None,
password=None,
redis_url=None,
similarity_threshold=None,
use_async=False,
embedding_model="text-embedding-ada-002",
host: Optional[str] = None,
port: Optional[str] = None,
password: Optional[str] = None,
redis_url: Optional[str] = None,
similarity_threshold: Optional[float] = None,
embedding_model: str = "text-embedding-ada-002",
index_name: Optional[str] = None,
**kwargs,
):
from redisvl.index import SearchIndex
print_verbose(
"redis semantic-cache initializing INDEX - litellm_semantic_cache_index"
)
if similarity_threshold is None:
raise Exception("similarity_threshold must be provided, passed None")
self.similarity_threshold = similarity_threshold
self.embedding_model = embedding_model
schema = {
"index": {
"name": "litellm_semantic_cache_index",
"prefix": "litellm",
"storage_type": "hash",
},
"fields": {
"text": [{"name": "response"}],
"vector": [
{
"name": "litellm_embedding",
"dims": 1536,
"distance_metric": "cosine",
"algorithm": "flat",
"datatype": "float32",
}
],
},
}
if redis_url is None:
# if no url passed, check if host, port and password are passed, if not raise an Exception
if host is None or port is None or password is None:
# try checking env for host, port and password
import os
host = os.getenv("REDIS_HOST")
port = os.getenv("REDIS_PORT")
password = os.getenv("REDIS_PASSWORD")
if host is None or port is None or password is None:
raise Exception("Redis host, port, and password must be provided")
redis_url = "redis://:" + password + "@" + host + ":" + port
print_verbose(f"redis semantic-cache redis_url: {redis_url}")
if use_async is False:
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url)
try:
self.index.create(overwrite=False) # don't overwrite existing index
except Exception as e:
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
elif use_async is True:
schema["index"]["name"] = "litellm_semantic_cache_index_async"
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url, use_async=True)
#
def _get_cache_logic(self, cached_response: Any):
"""
Common 'get_cache_logic' across sync + async redis client implementations
Initialize the Redis Semantic Cache.
Args:
host: Redis host address
port: Redis port
password: Redis password
redis_url: Full Redis URL (alternative to separate host/port/password)
similarity_threshold: Threshold for semantic similarity (0.0 to 1.0)
where 1.0 requires exact matches and 0.0 accepts any match
embedding_model: Model to use for generating embeddings
index_name: Name for the Redis index
ttl: Default time-to-live for cache entries in seconds
**kwargs: Additional arguments passed to the Redis client
Raises:
Exception: If similarity_threshold is not provided or required Redis
connection information is missing
"""
from redisvl.extensions.llmcache import SemanticCache
from redisvl.utils.vectorize import CustomTextVectorizer
if index_name is None:
index_name = self.DEFAULT_REDIS_INDEX_NAME
print_verbose(f"Redis semantic-cache initializing index - {index_name}")
# Validate similarity threshold
if similarity_threshold is None:
raise ValueError("similarity_threshold must be provided, passed None")
# Store configuration
self.similarity_threshold = similarity_threshold
# Convert similarity threshold [0,1] to distance threshold [0,2]
# For cosine distance: 0 = most similar, 2 = least similar
# While similarity: 1 = most similar, 0 = least similar
self.distance_threshold = 1 - similarity_threshold
self.embedding_model = embedding_model
# Set up Redis connection
if redis_url is None:
try:
# Attempt to use provided parameters or fallback to environment variables
host = host or os.environ['REDIS_HOST']
port = port or os.environ['REDIS_PORT']
password = password or os.environ['REDIS_PASSWORD']
except KeyError as e:
# Raise a more informative exception if any of the required keys are missing
missing_var = e.args[0]
raise ValueError(f"Missing required Redis configuration: {missing_var}. "
f"Provide {missing_var} or redis_url.") from e
redis_url = f"redis://:{password}@{host}:{port}"
print_verbose(f"Redis semantic-cache redis_url: {redis_url}")
# Initialize the Redis vectorizer and cache
cache_vectorizer = CustomTextVectorizer(self._get_embedding)
self.llmcache = SemanticCache(
name=index_name,
redis_url=redis_url,
vectorizer=cache_vectorizer,
distance_threshold=self.distance_threshold,
overwrite=False,
)
def _get_ttl(self, **kwargs) -> Optional[int]:
"""
Get the TTL (time-to-live) value for cache entries.
Args:
**kwargs: Keyword arguments that may contain a custom TTL
Returns:
Optional[int]: The TTL value in seconds, or None if no TTL should be applied
"""
ttl = kwargs.get("ttl")
if ttl is not None:
ttl = int(ttl)
return ttl
def _get_embedding(self, prompt: str) -> List[float]:
"""
Generate an embedding vector for the given prompt using the configured embedding model.
Args:
prompt: The text to generate an embedding for
Returns:
List[float]: The embedding vector
"""
# Create an embedding from prompt
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
embedding = embedding_response["data"][0]["embedding"]
return embedding
def _get_cache_logic(self, cached_response: Any) -> Any:
"""
Process the cached response to prepare it for use.
Args:
cached_response: The raw cached response
Returns:
The processed cache response, or None if input was None
"""
if cached_response is None:
return cached_response
# check if cached_response is bytes
# Convert bytes to string if needed
if isinstance(cached_response, bytes):
cached_response = cached_response.decode("utf-8")
# Convert string representation to Python object
try:
cached_response = json.loads(
cached_response
) # Convert string to dictionary
except Exception:
cached_response = ast.literal_eval(cached_response)
cached_response = json.loads(cached_response)
except json.JSONDecodeError:
try:
cached_response = ast.literal_eval(cached_response)
except (ValueError, SyntaxError) as e:
print_verbose(f"Error parsing cached response: {str(e)}")
return None
return cached_response
def set_cache(self, key, value, **kwargs):
import numpy as np
print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}")
# get the prompt
messages = kwargs["messages"]
prompt = "".join(message["content"] for message in messages)
# create an embedding for prompt
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
# make the embedding a numpy array, convert to bytes
embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
value = str(value)
assert isinstance(value, str)
new_data = [
{"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
]
# Add more data
self.index.load(new_data)
return
def get_cache(self, key, **kwargs):
print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}")
from redisvl.query import VectorQuery
# query
# get the messages
messages = kwargs["messages"]
prompt = "".join(message["content"] for message in messages)
# convert to embedding
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
query = VectorQuery(
vector=embedding,
vector_field_name="litellm_embedding",
return_fields=["response", "prompt", "vector_distance"],
num_results=1,
)
results = self.index.query(query)
if results is None:
return None
if isinstance(results, list):
if len(results) == 0:
return None
vector_distance = results[0]["vector_distance"]
vector_distance = float(vector_distance)
similarity = 1 - vector_distance
cached_prompt = results[0]["prompt"]
# check similarity, if more than self.similarity_threshold, return results
print_verbose(
f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
)
if similarity > self.similarity_threshold:
# cache hit !
cached_value = results[0]["response"]
print_verbose(
f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
)
return self._get_cache_logic(cached_response=cached_value)
else:
# cache miss !
return None
pass
async def async_set_cache(self, key, value, **kwargs):
import numpy as np
from litellm.proxy.proxy_server import llm_model_list, llm_router
def set_cache(self, key: str, value: Any, **kwargs) -> None:
"""
Store a value in the semantic cache.
Args:
key: The cache key (not directly used in semantic caching)
value: The response value to cache
**kwargs: Additional arguments including 'messages' for the prompt
and optional 'ttl' for time-to-live
"""
print_verbose(f"Redis semantic-cache set_cache, kwargs: {kwargs}")
try:
await self.index.acreate(overwrite=False) # don't overwrite existing index
# Extract the prompt from messages
messages = kwargs.get("messages", [])
if not messages:
print_verbose("No messages provided for semantic caching")
return
prompt = get_str_from_messages(messages)
value_str = str(value)
# Get TTL and store in Redis semantic cache
ttl = self._get_ttl(**kwargs)
if ttl is not None:
self.llmcache.store(prompt, value_str, ttl=int(ttl))
else:
self.llmcache.store(prompt, value_str)
except Exception as e:
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}")
print_verbose(f"Error setting {value_str} in the Redis semantic cache: {str(e)}")
# get the prompt
messages = kwargs["messages"]
prompt = "".join(message["content"] for message in messages)
# create an embedding for prompt
router_model_names = (
[m["model_name"] for m in llm_model_list]
if llm_model_list is not None
else []
)
if llm_router is not None and self.embedding_model in router_model_names:
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
embedding_response = await llm_router.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
metadata={
"user_api_key": user_api_key,
"semantic-cache-embedding": True,
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
},
)
else:
# convert to embedding
embedding_response = await litellm.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
def get_cache(self, key: str, **kwargs) -> Any:
"""
Retrieve a semantically similar cached response.
Args:
key: The cache key (not directly used in semantic caching)
**kwargs: Additional arguments including 'messages' for the prompt
Returns:
The cached response if a semantically similar prompt is found, else None
"""
print_verbose(f"Redis semantic-cache get_cache, kwargs: {kwargs}")
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
try:
# Extract the prompt from messages
messages = kwargs.get("messages", [])
if not messages:
print_verbose("No messages provided for semantic cache lookup")
return None
prompt = get_str_from_messages(messages)
# Check the cache for semantically similar prompts
results = self.llmcache.check(prompt=prompt)
# make the embedding a numpy array, convert to bytes
embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
value = str(value)
assert isinstance(value, str)
new_data = [
{"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
]
# Add more data
await self.index.aload(new_data)
return
async def async_get_cache(self, key, **kwargs):
print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}")
from redisvl.query import VectorQuery
from litellm.proxy.proxy_server import llm_model_list, llm_router
# query
# get the messages
messages = kwargs["messages"]
prompt = "".join(message["content"] for message in messages)
router_model_names = (
[m["model_name"] for m in llm_model_list]
if llm_model_list is not None
else []
)
if llm_router is not None and self.embedding_model in router_model_names:
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
embedding_response = await llm_router.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
metadata={
"user_api_key": user_api_key,
"semantic-cache-embedding": True,
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
},
)
else:
# convert to embedding
embedding_response = await litellm.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
query = VectorQuery(
vector=embedding,
vector_field_name="litellm_embedding",
return_fields=["response", "prompt", "vector_distance"],
)
results = await self.index.aquery(query)
if results is None:
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
return None
if isinstance(results, list):
if len(results) == 0:
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
# Return None if no similar prompts found
if not results:
return None
vector_distance = results[0]["vector_distance"]
vector_distance = float(vector_distance)
similarity = 1 - vector_distance
cached_prompt = results[0]["prompt"]
# Process the best matching result
cache_hit = results[0]
vector_distance = float(cache_hit["vector_distance"])
# Convert vector distance back to similarity score
# For cosine distance: 0 = most similar, 2 = least similar
# While similarity: 1 = most similar, 0 = least similar
similarity = 1 - vector_distance
cached_prompt = cache_hit["prompt"]
cached_response = cache_hit["response"]
# check similarity, if more than self.similarity_threshold, return results
print_verbose(
f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
)
# update kwargs["metadata"] with similarity, don't rewrite the original metadata
kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity
if similarity > self.similarity_threshold:
# cache hit !
cached_value = results[0]["response"]
print_verbose(
f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
f"Cache hit: similarity threshold: {self.similarity_threshold}, "
f"actual similarity: {similarity}, "
f"current prompt: {prompt}, "
f"cached prompt: {cached_prompt}"
)
return self._get_cache_logic(cached_response=cached_value)
else:
# cache miss !
return None
pass
return self._get_cache_logic(cached_response=cached_response)
except Exception as e:
print_verbose(f"Error retrieving from Redis semantic cache: {str(e)}")
async def _get_async_embedding(self, prompt: str, **kwargs) -> List[float]:
"""
Asynchronously generate an embedding for the given prompt.
Args:
prompt: The text to generate an embedding for
**kwargs: Additional arguments that may contain metadata
Returns:
List[float]: The embedding vector
"""
from litellm.proxy.proxy_server import llm_model_list, llm_router
async def _index_info(self):
return await self.index.ainfo()
# Route the embedding request through the proxy if appropriate
router_model_names = (
[m["model_name"] for m in llm_model_list]
if llm_model_list is not None
else []
)
try:
if llm_router is not None and self.embedding_model in router_model_names:
# Use the router for embedding generation
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
embedding_response = await llm_router.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
metadata={
"user_api_key": user_api_key,
"semantic-cache-embedding": True,
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
},
)
else:
# Generate embedding directly
embedding_response = await litellm.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
async def async_set_cache_pipeline(self, cache_list, **kwargs):
tasks = []
for val in cache_list:
tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
await asyncio.gather(*tasks)
# Extract and return the embedding vector
return embedding_response["data"][0]["embedding"]
except Exception as e:
print_verbose(f"Error generating async embedding: {str(e)}")
raise ValueError(f"Failed to generate embedding: {str(e)}") from e
async def async_set_cache(self, key: str, value: Any, **kwargs) -> None:
"""
Asynchronously store a value in the semantic cache.
Args:
key: The cache key (not directly used in semantic caching)
value: The response value to cache
**kwargs: Additional arguments including 'messages' for the prompt
and optional 'ttl' for time-to-live
"""
print_verbose(f"Async Redis semantic-cache set_cache, kwargs: {kwargs}")
try:
# Extract the prompt from messages
messages = kwargs.get("messages", [])
if not messages:
print_verbose("No messages provided for semantic caching")
return
prompt = get_str_from_messages(messages)
value_str = str(value)
# Generate embedding for the value (response) to cache
prompt_embedding = await self._get_async_embedding(prompt, **kwargs)
# Get TTL and store in Redis semantic cache
ttl = self._get_ttl(**kwargs)
if ttl is not None:
await self.llmcache.astore(
prompt,
value_str,
vector=prompt_embedding, # Pass through custom embedding
ttl=ttl
)
else:
await self.llmcache.astore(
prompt,
value_str,
vector=prompt_embedding # Pass through custom embedding
)
except Exception as e:
print_verbose(f"Error in async_set_cache: {str(e)}")
async def async_get_cache(self, key: str, **kwargs) -> Any:
"""
Asynchronously retrieve a semantically similar cached response.
Args:
key: The cache key (not directly used in semantic caching)
**kwargs: Additional arguments including 'messages' for the prompt
Returns:
The cached response if a semantically similar prompt is found, else None
"""
print_verbose(f"Async Redis semantic-cache get_cache, kwargs: {kwargs}")
try:
# Extract the prompt from messages
messages = kwargs.get("messages", [])
if not messages:
print_verbose("No messages provided for semantic cache lookup")
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
return None
prompt = get_str_from_messages(messages)
# Generate embedding for the prompt
prompt_embedding = await self._get_async_embedding(prompt, **kwargs)
# Check the cache for semantically similar prompts
results = await self.llmcache.acheck(
prompt=prompt,
vector=prompt_embedding
)
# handle results / cache hit
if not results:
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 # TODO why here but not above??
return None
cache_hit = results[0]
vector_distance = float(cache_hit["vector_distance"])
# Convert vector distance back to similarity
# For cosine distance: 0 = most similar, 2 = least similar
# While similarity: 1 = most similar, 0 = least similar
similarity = 1 - vector_distance
cached_prompt = cache_hit["prompt"]
cached_response = cache_hit["response"]
# update kwargs["metadata"] with similarity, don't rewrite the original metadata
kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity
print_verbose(
f"Cache hit: similarity threshold: {self.similarity_threshold}, "
f"actual similarity: {similarity}, "
f"current prompt: {prompt}, "
f"cached prompt: {cached_prompt}"
)
return self._get_cache_logic(cached_response=cached_response)
except Exception as e:
print_verbose(f"Error in async_get_cache: {str(e)}")
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
async def _index_info(self) -> Dict[str, Any]:
"""
Get information about the Redis index.
Returns:
Dict[str, Any]: Information about the Redis index
"""
aindex = await self.llmcache._get_async_index()
return await aindex.info()
async def async_set_cache_pipeline(self, cache_list: List[Tuple[str, Any]], **kwargs) -> None:
"""
Asynchronously store multiple values in the semantic cache.
Args:
cache_list: List of (key, value) tuples to cache
**kwargs: Additional arguments
"""
try:
tasks = []
for val in cache_list:
tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
await asyncio.gather(*tasks)
except Exception as e:
print_verbose(f"Error in async_set_cache_pipeline: {str(e)}")

View file

@ -9,6 +9,9 @@ from pydantic import BaseModel
import litellm
import litellm._logging
from litellm import verbose_logger
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
from litellm.llms.anthropic.cost_calculation import (
cost_per_token as anthropic_cost_per_token,
@ -57,6 +60,7 @@ from litellm.types.utils import (
LlmProvidersSet,
ModelInfo,
PassthroughCallTypes,
StandardBuiltInToolsParams,
Usage,
)
from litellm.utils import (
@ -524,6 +528,7 @@ def completion_cost( # noqa: PLR0915
optional_params: Optional[dict] = None,
custom_pricing: Optional[bool] = None,
base_model: Optional[str] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float:
"""
Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, or any litellm-supported LLM.
@ -802,6 +807,12 @@ def completion_cost( # noqa: PLR0915
rerank_billed_units=rerank_billed_units,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
_final_cost += StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=completion_response,
standard_built_in_tools_params=standard_built_in_tools_params,
custom_llm_provider=custom_llm_provider,
)
return _final_cost
except Exception as e:
@ -861,6 +872,7 @@ def response_cost_calculator(
base_model: Optional[str] = None,
custom_pricing: Optional[bool] = None,
prompt: str = "",
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float:
"""
Returns
@ -890,6 +902,7 @@ def response_cost_calculator(
custom_pricing=custom_pricing,
base_model=base_model,
prompt=prompt,
standard_built_in_tools_params=standard_built_in_tools_params,
)
return response_cost
except Exception as e:
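A hedged sketch (model and options are illustrative) of how the new `standard_built_in_tools_params` argument threads through `completion_cost`:

import litellm
from litellm.types.llms.openai import WebSearchOptions
from litellm.types.utils import StandardBuiltInToolsParams

# mock_response avoids a real provider call for this sketch
response = litellm.completion(
    model="gpt-4o",
    messages=[{"role": "user", "content": "hi"}],
    mock_response="hi",
)

# Built-in tool usage (here: a medium-context web search) is added on top of
# the token-based prompt/completion cost.
cost = litellm.completion_cost(
    completion_response=response,
    standard_built_in_tools_params=StandardBuiltInToolsParams(
        web_search_options=WebSearchOptions(search_context_size="medium")
    ),
)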

View file

@ -10,13 +10,16 @@ import asyncio
import json
import os
import traceback
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from litellm.types.utils import StandardLoggingPayload
if TYPE_CHECKING:
from litellm.proxy._types import SpendLogsPayload
else:
SpendLogsPayload = Any
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.llms.custom_httpx.http_handler import (
@ -61,7 +64,7 @@ class GcsPubSubLogger(CustomBatchLogger):
self.flush_lock = asyncio.Lock()
super().__init__(**kwargs, flush_lock=self.flush_lock)
asyncio.create_task(self.periodic_flush())
self.log_queue: List[SpendLogsPayload] = []
self.log_queue: List[Union[SpendLogsPayload, StandardLoggingPayload]] = []
async def construct_request_headers(self) -> Dict[str, str]:
"""Construct authorization headers using Vertex AI auth"""
@ -115,13 +118,20 @@ class GcsPubSubLogger(CustomBatchLogger):
verbose_logger.debug(
"PubSub: Logging - Enters logging function for model %s", kwargs
)
spend_logs_payload = get_logging_payload(
kwargs=kwargs,
response_obj=response_obj,
start_time=start_time,
end_time=end_time,
)
self.log_queue.append(spend_logs_payload)
standard_logging_payload = kwargs.get("standard_logging_object", None)
# Backwards compatibility with old logging payload
if litellm.gcs_pub_sub_use_v1 is True:
spend_logs_payload = get_logging_payload(
kwargs=kwargs,
response_obj=response_obj,
start_time=start_time,
end_time=end_time,
)
self.log_queue.append(spend_logs_payload)
else:
# New logging payload, StandardLoggingPayload
self.log_queue.append(standard_logging_payload)
if len(self.log_queue) >= self.batch_size:
await self.async_send_batch()
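A hedged sketch of the toggle referenced above: the Pub/Sub logger now enqueues StandardLoggingPayload objects by default, and the old SpendLogsPayload format stays available behind `litellm.gcs_pub_sub_use_v1`:

import litellm

# Opt back into the legacy v1 spend-log payloads for the GCS Pub/Sub logger;
# leaving this unset (the default) sends the new StandardLoggingPayload.
litellm.gcs_pub_sub_use_v1 = True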
@ -155,7 +165,7 @@ class GcsPubSubLogger(CustomBatchLogger):
self.log_queue.clear()
async def publish_message(
self, message: SpendLogsPayload
self, message: Union[SpendLogsPayload, StandardLoggingPayload]
) -> Optional[Dict[str, Any]]:
"""
Publish message to Google Cloud Pub/Sub using REST API

View file

@ -35,6 +35,9 @@ from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.mlflow import MlflowLogger
from litellm.integrations.pagerduty.pagerduty import PagerDutyAlerting
from litellm.litellm_core_utils.get_litellm_params import get_litellm_params
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.litellm_core_utils.redact_messages import (
redact_message_input_output_from_custom_logger,
@ -60,6 +63,7 @@ from litellm.types.utils import (
ModelResponse,
ModelResponseStream,
RawRequestTypedDict,
StandardBuiltInToolsParams,
StandardCallbackDynamicParams,
StandardLoggingAdditionalHeaders,
StandardLoggingHiddenParams,
@ -264,7 +268,9 @@ class Logging(LiteLLMLoggingBaseClass):
self.standard_callback_dynamic_params: StandardCallbackDynamicParams = (
self.initialize_standard_callback_dynamic_params(kwargs)
)
self.standard_built_in_tools_params: StandardBuiltInToolsParams = (
self.initialize_standard_built_in_tools_params(kwargs)
)
## TIME TO FIRST TOKEN LOGGING ##
self.completion_start_time: Optional[datetime.datetime] = None
self._llm_caching_handler: Optional[LLMCachingHandler] = None
@ -369,6 +375,23 @@ class Logging(LiteLLMLoggingBaseClass):
"""
return _initialize_standard_callback_dynamic_params(kwargs)
def initialize_standard_built_in_tools_params(
self, kwargs: Optional[Dict] = None
) -> StandardBuiltInToolsParams:
"""
Initialize the standard built-in tools params from the kwargs
Checks for `web_search_options` in kwargs, or built-in tools in `tools`, and sets the corresponding attributes on StandardBuiltInToolsParams
"""
return StandardBuiltInToolsParams(
web_search_options=StandardBuiltInToolCostTracking._get_web_search_options(
kwargs or {}
),
file_search=StandardBuiltInToolCostTracking._get_file_search_tool_call(
kwargs or {}
),
)
def update_environment_variables(
self,
litellm_params: Dict,
@ -495,6 +518,16 @@ class Logging(LiteLLMLoggingBaseClass):
}
return data
def _get_masked_api_base(self, api_base: str) -> str:
if "key=" in api_base:
# Find the position of "key=" in the string
key_index = api_base.find("key=") + 4
# Mask the last 5 characters after "key="
masked_api_base = api_base[:key_index] + "*" * 5 + api_base[-4:]
else:
masked_api_base = api_base
return str(masked_api_base)
def _pre_call(self, input, api_key, model=None, additional_args={}):
"""
Common helper function across the sync + async pre-call function
@ -508,6 +541,9 @@ class Logging(LiteLLMLoggingBaseClass):
model
): # if model name was changes pre-call, overwrite the initial model call name with the new one
self.model_call_details["model"] = model
self.model_call_details["litellm_params"]["api_base"] = (
self._get_masked_api_base(additional_args.get("api_base", ""))
)
def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915
@ -691,15 +727,6 @@ class Logging(LiteLLMLoggingBaseClass):
headers = {}
data = additional_args.get("complete_input_dict", {})
api_base = str(additional_args.get("api_base", ""))
if "key=" in api_base:
# Find the position of "key=" in the string
key_index = api_base.find("key=") + 4
# Mask the last 5 characters after "key="
masked_api_base = api_base[:key_index] + "*" * 5 + api_base[-4:]
else:
masked_api_base = api_base
self.model_call_details["litellm_params"]["api_base"] = masked_api_base
curl_command = self._get_request_curl_command(
api_base=api_base,
headers=headers,
@ -714,11 +741,12 @@ class Logging(LiteLLMLoggingBaseClass):
def _get_request_curl_command(
self, api_base: str, headers: Optional[dict], additional_args: dict, data: dict
) -> str:
masked_api_base = self._get_masked_api_base(api_base)
if headers is None:
headers = {}
curl_command = "\n\nPOST Request Sent from LiteLLM:\n"
curl_command += "curl -X POST \\\n"
curl_command += f"{api_base} \\\n"
curl_command += f"{masked_api_base} \\\n"
masked_headers = self._get_masked_headers(headers)
formatted_headers = " ".join(
[f"-H '{k}: {v}'" for k, v in masked_headers.items()]
@ -903,6 +931,7 @@ class Logging(LiteLLMLoggingBaseClass):
"optional_params": self.optional_params,
"custom_pricing": custom_pricing,
"prompt": prompt,
"standard_built_in_tools_params": self.standard_built_in_tools_params,
}
except Exception as e: # error creating kwargs for cost calculation
debug_info = StandardLoggingModelCostFailureDebugInformation(
@ -1067,6 +1096,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
elif isinstance(result, dict): # pass-through endpoints
@ -1079,6 +1109,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
elif standard_logging_object is not None:
@ -1102,6 +1133,7 @@ class Logging(LiteLLMLoggingBaseClass):
prompt="",
completion=getattr(result, "content", ""),
total_time=float_diff,
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
return start_time, end_time, result
@ -1155,6 +1187,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
callbacks = self.get_combined_callback_list(
@ -1695,6 +1728,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
callbacks = self.get_combined_callback_list(
@ -1911,6 +1945,7 @@ class Logging(LiteLLMLoggingBaseClass):
status="failure",
error_str=str(exception),
original_exception=exception,
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
return start_time, end_time
@ -3367,6 +3402,7 @@ def get_standard_logging_object_payload(
status: StandardLoggingPayloadStatus,
error_str: Optional[str] = None,
original_exception: Optional[Exception] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> Optional[StandardLoggingPayload]:
try:
kwargs = kwargs or {}
@ -3542,6 +3578,7 @@ def get_standard_logging_object_payload(
guardrail_information=metadata.get(
"standard_logging_guardrail_information", None
),
standard_built_in_tools_params=standard_built_in_tools_params,
)
emit_standard_logging_payload(payload)

View file

@ -0,0 +1,199 @@
"""
Helper utilities for tracking the cost of built-in tools.
"""
from typing import Any, Dict, List, Optional
import litellm
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
from litellm.types.utils import (
ModelInfo,
ModelResponse,
SearchContextCostPerQuery,
StandardBuiltInToolsParams,
)
class StandardBuiltInToolCostTracking:
"""
Helper class for tracking the cost of built-in tools
Example: Web Search
"""
@staticmethod
def get_cost_for_built_in_tools(
model: str,
response_object: Any,
custom_llm_provider: Optional[str] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float:
"""
Get the cost of using built-in tools.
Supported tools:
- Web Search
- File Search
"""
if standard_built_in_tools_params is not None:
if (
standard_built_in_tools_params.get("web_search_options", None)
is not None
):
model_info = StandardBuiltInToolCostTracking._safe_get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
return StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=standard_built_in_tools_params.get(
"web_search_options", None
),
model_info=model_info,
)
if standard_built_in_tools_params.get("file_search", None) is not None:
return StandardBuiltInToolCostTracking.get_cost_for_file_search(
file_search=standard_built_in_tools_params.get("file_search", None),
)
if isinstance(response_object, ModelResponse):
if StandardBuiltInToolCostTracking.chat_completion_response_includes_annotations(
response_object
):
model_info = StandardBuiltInToolCostTracking._safe_get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
return StandardBuiltInToolCostTracking.get_default_cost_for_web_search(
model_info
)
return 0.0
@staticmethod
def _safe_get_model_info(
model: str, custom_llm_provider: Optional[str] = None
) -> Optional[ModelInfo]:
try:
return litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
except Exception:
return None
@staticmethod
def get_cost_for_web_search(
web_search_options: Optional[WebSearchOptions] = None,
model_info: Optional[ModelInfo] = None,
) -> float:
"""
If the request includes `web_search_options`, calculate the cost of the web search.
"""
if web_search_options is None:
return 0.0
if model_info is None:
return 0.0
search_context_pricing: SearchContextCostPerQuery = (
model_info.get("search_context_cost_per_query", {}) or {}
)
if web_search_options.get("search_context_size", None) == "low":
return search_context_pricing.get("search_context_size_low", 0.0)
elif web_search_options.get("search_context_size", None) == "medium":
return search_context_pricing.get("search_context_size_medium", 0.0)
elif web_search_options.get("search_context_size", None) == "high":
return search_context_pricing.get("search_context_size_high", 0.0)
return StandardBuiltInToolCostTracking.get_default_cost_for_web_search(
model_info
)
@staticmethod
def get_default_cost_for_web_search(
model_info: Optional[ModelInfo] = None,
) -> float:
"""
If no web search options are provided, use the `search_context_size_medium` pricing.
https://platform.openai.com/docs/pricing#web-search
"""
if model_info is None:
return 0.0
search_context_pricing: SearchContextCostPerQuery = (
model_info.get("search_context_cost_per_query", {}) or {}
) or {}
return search_context_pricing.get("search_context_size_medium", 0.0)
@staticmethod
def get_cost_for_file_search(
file_search: Optional[FileSearchTool] = None,
) -> float:
""" "
Charged at $2.50/1k calls
Doc: https://platform.openai.com/docs/pricing#built-in-tools
"""
if file_search is None:
return 0.0
return 2.5 / 1000
@staticmethod
def chat_completion_response_includes_annotations(
response_object: ModelResponse,
) -> bool:
for _choice in response_object.choices:
message = getattr(_choice, "message", None)
if (
message is not None
and hasattr(message, "annotations")
and message.annotations is not None
and len(message.annotations) > 0
):
return True
return False
@staticmethod
def _get_web_search_options(kwargs: Dict) -> Optional[WebSearchOptions]:
if "web_search_options" in kwargs:
return WebSearchOptions(**kwargs.get("web_search_options", {}))
tools = StandardBuiltInToolCostTracking._get_tools_from_kwargs(
kwargs, "web_search_preview"
)
if tools:
# Look for web search tool in the tools array
for tool in tools:
if isinstance(tool, dict):
if StandardBuiltInToolCostTracking._is_web_search_tool_call(tool):
return WebSearchOptions(**tool)
return None
@staticmethod
def _get_tools_from_kwargs(kwargs: Dict, tool_type: str) -> Optional[List[Dict]]:
if "tools" in kwargs:
tools = kwargs.get("tools", [])
return tools
return None
@staticmethod
def _get_file_search_tool_call(kwargs: Dict) -> Optional[FileSearchTool]:
tools = StandardBuiltInToolCostTracking._get_tools_from_kwargs(
kwargs, "file_search"
)
if tools:
for tool in tools:
if isinstance(tool, dict):
if StandardBuiltInToolCostTracking._is_file_search_tool_call(tool):
return FileSearchTool(**tool)
return None
@staticmethod
def _is_web_search_tool_call(tool: Dict) -> bool:
if tool.get("type", None) == "web_search_preview":
return True
if "search_context_size" in tool:
return True
return False
@staticmethod
def _is_file_search_tool_call(tool: Dict) -> bool:
if tool.get("type", None) == "file_search":
return True
return False
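A hedged usage sketch for the helpers above, using the flat file-search price from the docstring ($2.50 per 1k calls):

from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
    StandardBuiltInToolCostTracking,
)
from litellm.types.llms.openai import FileSearchTool

# File search is flat-priced: $2.50 / 1000 calls = 0.0025 USD per call.
cost = StandardBuiltInToolCostTracking.get_cost_for_file_search(
    file_search=FileSearchTool(type="file_search")
)
assert cost == 2.5 / 1000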

View file

@ -138,13 +138,22 @@ class ModelParamHelper:
TranscriptionCreateParamsNonStreaming,
TranscriptionCreateParamsStreaming,
)
non_streaming_kwargs = set(getattr(TranscriptionCreateParamsNonStreaming, "__annotations__", {}).keys())
streaming_kwargs = set(getattr(TranscriptionCreateParamsStreaming, "__annotations__", {}).keys())
non_streaming_kwargs = set(
getattr(
TranscriptionCreateParamsNonStreaming, "__annotations__", {}
).keys()
)
streaming_kwargs = set(
getattr(
TranscriptionCreateParamsStreaming, "__annotations__", {}
).keys()
)
all_transcription_kwargs = non_streaming_kwargs.union(streaming_kwargs)
return all_transcription_kwargs
except Exception as e:
verbose_logger.warning("Error getting transcription kwargs %s", str(e))
verbose_logger.debug("Error getting transcription kwargs %s", str(e))
return set()
@staticmethod

View file

@ -5304,6 +5304,17 @@
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"text-embedding-large-exp-03-07": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"output_vector_size": 3072,
"input_cost_per_character": 0.000000025,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"textembedding-gecko": {
"max_tokens": 3072,
"max_input_tokens": 3072,

View file

@ -5,7 +5,10 @@ model_list:
api_key: os.environ/AZURE_API_KEY
api_base: http://0.0.0.0:8090
rpm: 3
- model_name: "gpt-4o-mini-openai"
litellm_params:
model: gpt-4o-mini
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
num_retries: 0

View file

@ -542,13 +542,10 @@ async def vertex_proxy_route(
user_api_key_dict,
stream=is_streaming_request, # type: ignore
)
except Exception as e:
except ProxyException as e:
if headers_passed_through:
raise Exception(
f"No credentials found on proxy for this request. Headers were passed through directly but request failed with error: {str(e)}"
)
else:
raise e
e.message = f"No credentials found on proxy for project_name={vertex_project} + location={vertex_location}, check `/model/info` for allowed project + region combinations with `use_in_pass_through: true`. Headers were passed through directly but request failed with error: {e.message}"
raise e
return received_value

View file

@ -1788,9 +1788,6 @@ class ProxyConfig:
reset_color_code,
cache_password,
)
if cache_type == "redis-semantic":
# by default this should always be async
cache_params.update({"redis_semantic_cache_use_async": True})
# users can pass os.environ/ variables on the proxy - we should read them from the env
for key, value in cache_params.items():
@ -6181,18 +6178,18 @@ async def model_info_v1( # noqa: PLR0915
)
if len(all_models_str) > 0:
model_names = all_models_str
llm_model_list = llm_router.get_model_list()
_relevant_models = []
for model in all_models_str:
router_models = llm_router.get_model_list(model_name=model)
if router_models is not None:
_relevant_models.extend(router_models)
if llm_model_list is not None:
_relevant_models = [
m for m in llm_model_list if m["model_name"] in model_names
]
all_models = copy.deepcopy(_relevant_models) # type: ignore
else:
all_models = []
for model in all_models:
model = _get_proxy_model_info(model=model)
for in_place_model in all_models:
in_place_model = _get_proxy_model_info(model=in_place_model)
verbose_proxy_logger.debug("all_models: %s", all_models)
return {"data": all_models}

View file

@ -4924,6 +4924,11 @@ class Router:
and model_info["supports_function_calling"] is True # type: ignore
):
model_group_info.supports_function_calling = True
if (
model_info.get("supports_web_search", None) is not None
and model_info["supports_web_search"] is True # type: ignore
):
model_group_info.supports_web_search = True
if (
model_info.get("supported_openai_params", None) is not None
and model_info["supported_openai_params"] is not None
@ -5286,10 +5291,11 @@ class Router:
if len(returned_models) == 0: # check if wildcard route
potential_wildcard_models = self.pattern_router.route(model_name)
if potential_wildcard_models is not None:
returned_models.extend(
[DeploymentTypedDict(**m) for m in potential_wildcard_models] # type: ignore
)
if model_name is not None and potential_wildcard_models is not None:
for m in potential_wildcard_models:
deployment_typed_dict = DeploymentTypedDict(**m) # type: ignore
deployment_typed_dict["model_name"] = model_name
returned_models.append(deployment_typed_dict)
if model_name is None:
returned_models += self.model_list

View file

@ -382,6 +382,53 @@ class ChatCompletionThinkingBlock(TypedDict, total=False):
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class WebSearchOptionsUserLocationApproximate(TypedDict, total=False):
city: str
"""Free text input for the city of the user, e.g. `San Francisco`."""
country: str
"""
The two-letter [ISO country code](https://en.wikipedia.org/wiki/ISO_3166-1) of
the user, e.g. `US`.
"""
region: str
"""Free text input for the region of the user, e.g. `California`."""
timezone: str
"""
The [IANA timezone](https://timeapi.io/documentation/iana-timezones) of the
user, e.g. `America/Los_Angeles`.
"""
class WebSearchOptionsUserLocation(TypedDict, total=False):
approximate: Required[WebSearchOptionsUserLocationApproximate]
"""Approximate location parameters for the search."""
type: Required[Literal["approximate"]]
"""The type of location approximation. Always `approximate`."""
class WebSearchOptions(TypedDict, total=False):
search_context_size: Literal["low", "medium", "high"]
"""
High level guidance for the amount of context window space to use for the
search. One of `low`, `medium`, or `high`. `medium` is the default.
"""
user_location: Optional[WebSearchOptionsUserLocation]
"""Approximate location parameters for the search."""
class FileSearchTool(TypedDict, total=False):
type: Literal["file_search"]
"""The type of tool being defined: `file_search`"""
vector_store_ids: Optional[List[str]]
"""The IDs of the vector stores to search."""
class ChatCompletionAnnotationURLCitation(TypedDict, total=False):
end_index: int
"""The index of the last character of the URL citation in the message."""

View file

@ -559,6 +559,7 @@ class ModelGroupInfo(BaseModel):
rpm: Optional[int] = None
supports_parallel_function_calling: bool = Field(default=False)
supports_vision: bool = Field(default=False)
supports_web_search: bool = Field(default=False)
supports_function_calling: bool = Field(default=False)
supported_openai_params: Optional[List[str]] = Field(default=[])
configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None

View file

@ -32,7 +32,9 @@ from .llms.openai import (
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
ChatCompletionUsageBlock,
FileSearchTool,
OpenAIChatCompletionChunk,
WebSearchOptions,
)
from .rerank import RerankResponse
@ -97,6 +99,13 @@ class ProviderSpecificModelInfo(TypedDict, total=False):
supports_pdf_input: Optional[bool]
supports_native_streaming: Optional[bool]
supports_parallel_function_calling: Optional[bool]
supports_web_search: Optional[bool]
class SearchContextCostPerQuery(TypedDict, total=False):
search_context_size_low: float
search_context_size_medium: float
search_context_size_high: float
class ModelInfoBase(ProviderSpecificModelInfo, total=False):
@ -135,6 +144,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
output_cost_per_video_per_second: Optional[float] # only for vertex ai models
output_cost_per_audio_per_second: Optional[float] # only for vertex ai models
output_cost_per_second: Optional[float] # for OpenAI Speech models
search_context_cost_per_query: Optional[
SearchContextCostPerQuery
] # Cost for using web search tool
litellm_provider: Required[str]
mode: Required[
@ -586,6 +598,11 @@ class Message(OpenAIObject):
# OpenAI compatible APIs like mistral API will raise an error if audio is passed in
del self.audio
if annotations is None:
# ensure default response matches OpenAI spec
# Some OpenAI compatible APIs raise an error if annotations are passed in
del self.annotations
if reasoning_content is None:
# ensure default response matches OpenAI spec
del self.reasoning_content
@ -1612,6 +1629,19 @@ class StandardLoggingUserAPIKeyMetadata(TypedDict):
user_api_key_end_user_id: Optional[str]
class StandardBuiltInToolsParams(TypedDict, total=False):
"""
Standard built-in OpenAI tools parameters.
Used to calculate the cost of built-in tools; add any standard built-in tool parameters here.
OpenAI charges users based on the `web_search_options` parameter
"""
web_search_options: Optional[WebSearchOptions]
file_search: Optional[FileSearchTool]
class StandardLoggingPromptManagementMetadata(TypedDict):
prompt_id: str
prompt_variables: Optional[dict]
@ -1729,6 +1759,7 @@ class StandardLoggingPayload(TypedDict):
model_parameters: dict
hidden_params: StandardLoggingHiddenParams
guardrail_information: Optional[StandardLoggingGuardrailInformation]
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams]
from typing import AsyncIterator, Iterator

View file

@ -1975,7 +1975,7 @@ def supports_system_messages(model: str, custom_llm_provider: Optional[str]) ->
)
def supports_web_search(model: str, custom_llm_provider: Optional[str]) -> bool:
def supports_web_search(model: str, custom_llm_provider: Optional[str] = None) -> bool:
"""
Check if the given model supports web search and return a boolean value.
@ -4544,6 +4544,10 @@ def _get_model_info_helper( # noqa: PLR0915
supports_native_streaming=_model_info.get(
"supports_native_streaming", None
),
supports_web_search=_model_info.get("supports_web_search", False),
search_context_cost_per_query=_model_info.get(
"search_context_cost_per_query", None
),
tpm=_model_info.get("tpm", None),
rpm=_model_info.get("rpm", None),
)
@ -4612,6 +4616,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
supports_audio_input: Optional[bool]
supports_audio_output: Optional[bool]
supports_pdf_input: Optional[bool]
supports_web_search: Optional[bool]
Raises:
Exception: If the model is not mapped yet.
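A hedged usage sketch for the fields surfaced above (model name illustrative, taken from the tests later in this diff):

import litellm

# supports_web_search no longer requires custom_llm_provider; pricing for the
# web search tool is exposed via search_context_cost_per_query in model info.
if litellm.utils.supports_web_search("gpt-4o-search-preview"):
    info = litellm.get_model_info("gpt-4o-search-preview")
    pricing = info.get("search_context_cost_per_query") or {}
    print(pricing.get("search_context_size_medium"))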

View file

@ -5304,6 +5304,17 @@
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"text-embedding-large-exp-03-07": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"output_vector_size": 3072,
"input_cost_per_character": 0.000000025,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"textembedding-gecko": {
"max_tokens": 3072,
"max_input_tokens": 3072,

poetry.lock (generated)
View file

@ -810,15 +810,15 @@ test = ["pytest (>=6)"]
[[package]]
name = "fastapi"
version = "0.115.11"
version = "0.115.12"
description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
optional = true
python-versions = ">=3.8"
groups = ["main"]
markers = "extra == \"proxy\""
files = [
{file = "fastapi-0.115.11-py3-none-any.whl", hash = "sha256:32e1541b7b74602e4ef4a0260ecaf3aadf9d4f19590bba3e1bf2ac4666aa2c64"},
{file = "fastapi-0.115.11.tar.gz", hash = "sha256:cc81f03f688678b92600a65a5e618b93592c65005db37157147204d8924bf94f"},
{file = "fastapi-0.115.12-py3-none-any.whl", hash = "sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d"},
{file = "fastapi-0.115.12.tar.gz", hash = "sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681"},
]
[package.dependencies]
@ -1445,14 +1445,14 @@ type = ["pytest-mypy"]
[[package]]
name = "iniconfig"
version = "2.0.0"
version = "2.1.0"
description = "brain-dead simple config-ini parsing"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
groups = ["dev"]
files = [
{file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
{file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
{file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"},
{file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"},
]
[[package]]
@ -2137,14 +2137,14 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
[[package]]
name = "openai"
version = "1.66.3"
version = "1.68.2"
description = "The official Python library for the openai API"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "openai-1.66.3-py3-none-any.whl", hash = "sha256:a427c920f727711877ab17c11b95f1230b27767ba7a01e5b66102945141ceca9"},
{file = "openai-1.66.3.tar.gz", hash = "sha256:8dde3aebe2d081258d4159c4cb27bdc13b5bb3f7ea2201d9bd940b9a89faf0c9"},
{file = "openai-1.68.2-py3-none-any.whl", hash = "sha256:24484cb5c9a33b58576fdc5acf0e5f92603024a4e39d0b99793dfa1eb14c2b36"},
{file = "openai-1.68.2.tar.gz", hash = "sha256:b720f0a95a1dbe1429c0d9bb62096a0d98057bcda82516f6e8af10284bdd5b19"},
]
[package.dependencies]
@ -2160,6 +2160,7 @@ typing-extensions = ">=4.11,<5"
[package.extras]
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
realtime = ["websockets (>=13,<15)"]
voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]
[[package]]
name = "orjson"
@ -2477,24 +2478,24 @@ testing = ["google-api-core (>=1.31.5)"]
[[package]]
name = "protobuf"
version = "5.29.3"
version = "5.29.4"
description = ""
optional = true
python-versions = ">=3.8"
groups = ["main"]
markers = "extra == \"extra-proxy\""
files = [
{file = "protobuf-5.29.3-cp310-abi3-win32.whl", hash = "sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888"},
{file = "protobuf-5.29.3-cp310-abi3-win_amd64.whl", hash = "sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a"},
{file = "protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a8434404bbf139aa9e1300dbf989667a83d42ddda9153d8ab76e0d5dcaca484e"},
{file = "protobuf-5.29.3-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:daaf63f70f25e8689c072cfad4334ca0ac1d1e05a92fc15c54eb9cf23c3efd84"},
{file = "protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:c027e08a08be10b67c06bf2370b99c811c466398c357e615ca88c91c07f0910f"},
{file = "protobuf-5.29.3-cp38-cp38-win32.whl", hash = "sha256:84a57163a0ccef3f96e4b6a20516cedcf5bb3a95a657131c5c3ac62200d23252"},
{file = "protobuf-5.29.3-cp38-cp38-win_amd64.whl", hash = "sha256:b89c115d877892a512f79a8114564fb435943b59067615894c3b13cd3e1fa107"},
{file = "protobuf-5.29.3-cp39-cp39-win32.whl", hash = "sha256:0eb32bfa5219fc8d4111803e9a690658aa2e6366384fd0851064b963b6d1f2a7"},
{file = "protobuf-5.29.3-cp39-cp39-win_amd64.whl", hash = "sha256:6ce8cc3389a20693bfde6c6562e03474c40851b44975c9b2bf6df7d8c4f864da"},
{file = "protobuf-5.29.3-py3-none-any.whl", hash = "sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f"},
{file = "protobuf-5.29.3.tar.gz", hash = "sha256:5da0f41edaf117bde316404bad1a486cb4ededf8e4a54891296f648e8e076620"},
{file = "protobuf-5.29.4-cp310-abi3-win32.whl", hash = "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7"},
{file = "protobuf-5.29.4-cp310-abi3-win_amd64.whl", hash = "sha256:bcefcdf3976233f8a502d265eb65ea740c989bacc6c30a58290ed0e519eb4b8d"},
{file = "protobuf-5.29.4-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:307ecba1d852ec237e9ba668e087326a67564ef83e45a0189a772ede9e854dd0"},
{file = "protobuf-5.29.4-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:aec4962f9ea93c431d5714ed1be1c93f13e1a8618e70035ba2b0564d9e633f2e"},
{file = "protobuf-5.29.4-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:d7d3f7d1d5a66ed4942d4fefb12ac4b14a29028b209d4bfb25c68ae172059922"},
{file = "protobuf-5.29.4-cp38-cp38-win32.whl", hash = "sha256:1832f0515b62d12d8e6ffc078d7e9eb06969aa6dc13c13e1036e39d73bebc2de"},
{file = "protobuf-5.29.4-cp38-cp38-win_amd64.whl", hash = "sha256:476cb7b14914c780605a8cf62e38c2a85f8caff2e28a6a0bad827ec7d6c85d68"},
{file = "protobuf-5.29.4-cp39-cp39-win32.whl", hash = "sha256:fd32223020cb25a2cc100366f1dedc904e2d71d9322403224cdde5fdced0dabe"},
{file = "protobuf-5.29.4-cp39-cp39-win_amd64.whl", hash = "sha256:678974e1e3a9b975b8bc2447fca458db5f93a2fb6b0c8db46b6675b5b5346812"},
{file = "protobuf-5.29.4-py3-none-any.whl", hash = "sha256:3fde11b505e1597f71b875ef2fc52062b6a9740e5f7c8997ce878b6009145862"},
{file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"},
]
[[package]]
@ -2809,6 +2810,25 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-asyncio"
version = "0.21.2"
description = "Pytest support for asyncio"
optional = false
python-versions = ">=3.7"
groups = ["dev"]
files = [
{file = "pytest_asyncio-0.21.2-py3-none-any.whl", hash = "sha256:ab664c88bb7998f711d8039cacd4884da6430886ae8bbd4eded552ed2004f16b"},
{file = "pytest_asyncio-0.21.2.tar.gz", hash = "sha256:d67738fc232b94b326b9d060750beb16e0074210b98dd8b58a5239fa2a154f45"},
]
[package.dependencies]
pytest = ">=7.0.0"
[package.extras]
docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"]
testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
[[package]]
name = "pytest-mock"
version = "3.14.0"
@ -3279,15 +3299,15 @@ files = [
[[package]]
name = "rq"
version = "2.1.0"
version = "2.2.0"
description = "RQ is a simple, lightweight, library for creating background jobs, and processing them."
optional = true
python-versions = ">=3.8"
groups = ["main"]
markers = "extra == \"proxy\""
files = [
{file = "rq-2.1.0-py3-none-any.whl", hash = "sha256:3c6892c6ca848e5fb47c1875399a66f13656bf0e123bf725d9aa9a12718e2fdf"},
{file = "rq-2.1.0.tar.gz", hash = "sha256:764585b6cab69ef1412f4aee523347e5aa7ece3ca175c118b1d92223dd8c2826"},
{file = "rq-2.2.0-py3-none-any.whl", hash = "sha256:dacbfe1ccb79a45c8cd95dec7951620679fa0195570b63da3f9347622d33accc"},
{file = "rq-2.2.0.tar.gz", hash = "sha256:b636760f1e4c183022031c142faa0483e687885824e9732ba2953f994104e203"},
]
[package.dependencies]
@ -3606,15 +3626,15 @@ files = [
[[package]]
name = "tzdata"
version = "2025.1"
version = "2025.2"
description = "Provider of IANA time zone data"
optional = true
python-versions = ">=2"
groups = ["main"]
markers = "extra == \"proxy\" and platform_system == \"Windows\""
files = [
{file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"},
{file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"},
{file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"},
{file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"},
]
[[package]]
@ -3985,4 +4005,4 @@ proxy = ["PyJWT", "apscheduler", "backoff", "boto3", "cryptography", "fastapi",
[metadata]
lock-version = "2.1"
python-versions = ">=3.8.1,<4.0, !=3.9.7"
content-hash = "f7c21b3d659e4a15cd46bb42fb905ad039028f4f6b82507fd1278ac05c412569"
content-hash = "9c863b11189227a035a9130c8872de44fe7c5e1e32b47569a56af86e3f6570c5"

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.63.14"
version = "1.64.0"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -98,13 +98,14 @@ black = "^23.12.0"
mypy = "^1.0"
pytest = "^7.4.3"
pytest-mock = "^3.12.0"
pytest-asyncio = "^0.21.1"
[build-system]
requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.63.14"
version = "1.64.0"
version_files = [
"pyproject.toml:^version"
]

View file

@ -9,8 +9,8 @@ uvicorn==0.29.0 # server dep
gunicorn==23.0.0 # server dep
uvloop==0.21.0 # uvicorn dep, gives us much better performance under load
boto3==1.34.34 # aws bedrock/sagemaker calls
redis==5.0.0 # caching
numpy==2.1.1 # semantic caching
redis==5.2.1 # redis caching
redisvl==0.4.1 # semantic caching
prisma==0.11.0 # for db
mangum==0.17.0 # for aws lambda functions
pynacl==1.5.0 # for encrypting keys

View file

@ -1,13 +1,8 @@
import asyncio
import json
import os
import sys
import time
from unittest.mock import MagicMock, patch
import httpx
import pytest
import respx
from fastapi.testclient import TestClient
sys.path.insert(
@ -18,9 +13,18 @@ from unittest.mock import AsyncMock
from litellm.caching.redis_cache import RedisCache
@pytest.fixture
def redis_no_ping():
"""Patch RedisCache initialization to prevent async ping tasks from being created"""
with patch('asyncio.get_running_loop') as mock_get_loop:
# Either raise an exception or return a mock that will handle the task creation
mock_get_loop.side_effect = RuntimeError("No running event loop")
yield
@pytest.mark.parametrize("namespace", [None, "test"])
@pytest.mark.asyncio
async def test_redis_cache_async_increment(namespace, monkeypatch):
async def test_redis_cache_async_increment(namespace, monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "https://my-test-host")
redis_cache = RedisCache(namespace=namespace)
# Create an AsyncMock for the Redis client
@ -47,10 +51,46 @@ async def test_redis_cache_async_increment(namespace, monkeypatch):
@pytest.mark.asyncio
async def test_redis_client_init_with_socket_timeout(monkeypatch):
async def test_redis_client_init_with_socket_timeout(monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "my-fake-host")
redis_cache = RedisCache(socket_timeout=1.0)
assert redis_cache.redis_kwargs["socket_timeout"] == 1.0
client = redis_cache.init_async_client()
assert client is not None
assert client.connection_pool.connection_kwargs["socket_timeout"] == 1.0
@pytest.mark.asyncio
async def test_redis_cache_async_batch_get_cache(monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "https://my-test-host")
redis_cache = RedisCache()
# Create an AsyncMock for the Redis client
mock_redis_instance = AsyncMock()
# Make sure the mock can be used as an async context manager
mock_redis_instance.__aenter__.return_value = mock_redis_instance
mock_redis_instance.__aexit__.return_value = None
# Setup the return value for mget
mock_redis_instance.mget.return_value = [
b'{"key1": "value1"}',
None,
b'{"key3": "value3"}'
]
test_keys = ["key1", "key2", "key3"]
with patch.object(
redis_cache, "init_async_client", return_value=mock_redis_instance
):
# Call async_batch_get_cache
result = await redis_cache.async_batch_get_cache(key_list=test_keys)
# Verify mget was called with the correct keys
mock_redis_instance.mget.assert_called_once()
# Check that results were properly decoded
assert result["key1"] == {"key1": "value1"}
assert result["key2"] is None
assert result["key3"] == {"key3": "value3"}

View file

@ -0,0 +1,130 @@
import os
import sys
from unittest.mock import MagicMock, patch, AsyncMock
import pytest
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
# Tests for RedisSemanticCache
def test_redis_semantic_cache_initialization(monkeypatch):
# Mock the redisvl import
semantic_cache_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=MagicMock())
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize the cache with a similarity threshold
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Verify the semantic cache was initialized with correct parameters
assert redis_semantic_cache.similarity_threshold == 0.8
# Use pytest.approx for floating point comparison to handle precision issues
assert redis_semantic_cache.distance_threshold == pytest.approx(0.2, abs=1e-10)
assert redis_semantic_cache.embedding_model == "text-embedding-ada-002"
# Test initialization with missing similarity_threshold
with pytest.raises(ValueError, match="similarity_threshold must be provided"):
RedisSemanticCache()
def test_redis_semantic_cache_get_cache(monkeypatch):
# Mock the redisvl import and embedding function
semantic_cache_mock = MagicMock()
custom_vectorizer_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=custom_vectorizer_mock)
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize cache
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Mock the llmcache.check method to return a result
mock_result = [
{
"prompt": "What is the capital of France?",
"response": '{"content": "Paris is the capital of France."}',
"vector_distance": 0.1 # Distance of 0.1 means similarity of 0.9
}
]
redis_semantic_cache.llmcache.check = MagicMock(return_value=mock_result)
# Mock the embedding function
with patch("litellm.embedding", return_value={"data": [{"embedding": [0.1, 0.2, 0.3]}]}):
# Test get_cache with a message
result = redis_semantic_cache.get_cache(
key="test_key",
messages=[{"content": "What is the capital of France?"}]
)
# Verify result is properly parsed
assert result == {"content": "Paris is the capital of France."}
# Verify llmcache.check was called
redis_semantic_cache.llmcache.check.assert_called_once()
@pytest.mark.asyncio
async def test_redis_semantic_cache_async_get_cache(monkeypatch):
# Mock the redisvl import
semantic_cache_mock = MagicMock()
custom_vectorizer_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=custom_vectorizer_mock)
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize cache
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Mock the async methods
mock_result = [
{
"prompt": "What is the capital of France?",
"response": '{"content": "Paris is the capital of France."}',
"vector_distance": 0.1 # Distance of 0.1 means similarity of 0.9
}
]
redis_semantic_cache.llmcache.acheck = AsyncMock(return_value=mock_result)
redis_semantic_cache._get_async_embedding = AsyncMock(return_value=[0.1, 0.2, 0.3])
# Test async_get_cache with a message
result = await redis_semantic_cache.async_get_cache(
key="test_key",
messages=[{"content": "What is the capital of France?"}],
metadata={}
)
# Verify result is properly parsed
assert result == {"content": "Paris is the capital of France."}
# Verify methods were called
redis_semantic_cache._get_async_embedding.assert_called_once()
redis_semantic_cache.llmcache.acheck.assert_called_once()

View file

@ -0,0 +1,113 @@
import json
import os
import sys
import pytest
from fastapi.testclient import TestClient
import litellm
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
from litellm.types.utils import ModelInfo, ModelResponse, StandardBuiltInToolsParams
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
# Test basic web search cost calculations
def test_web_search_cost_low():
web_search_options = WebSearchOptions(search_context_size="low")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost == model_info["search_context_cost_per_query"]["search_context_size_low"]
)
def test_web_search_cost_medium():
web_search_options = WebSearchOptions(search_context_size="medium")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost
== model_info["search_context_cost_per_query"]["search_context_size_medium"]
)
def test_web_search_cost_high():
web_search_options = WebSearchOptions(search_context_size="high")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost == model_info["search_context_cost_per_query"]["search_context_size_high"]
)
# Test file search cost calculation
def test_file_search_cost():
file_search = FileSearchTool(type="file_search")
cost = StandardBuiltInToolCostTracking.get_cost_for_file_search(
file_search=file_search
)
assert cost == 0.0025 # $2.50/1000 calls = 0.0025 per call
# Test edge cases
def test_none_inputs():
# Test with None inputs
assert (
StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=None, model_info=None
)
== 0.0
)
assert (
StandardBuiltInToolCostTracking.get_cost_for_file_search(file_search=None)
== 0.0
)
# Test the main get_cost_for_built_in_tools method
def test_get_cost_for_built_in_tools_web_search():
model = "gpt-4"
standard_built_in_tools_params = StandardBuiltInToolsParams(
web_search_options=WebSearchOptions(search_context_size="medium")
)
cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=None,
standard_built_in_tools_params=standard_built_in_tools_params,
)
assert isinstance(cost, float)
def test_get_cost_for_built_in_tools_file_search():
model = "gpt-4"
standard_built_in_tools_params = StandardBuiltInToolsParams(
file_search=FileSearchTool(type="file_search")
)
cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=None,
standard_built_in_tools_params=standard_built_in_tools_params,
)
assert cost == 0.0025

View file

@ -0,0 +1,34 @@
import json
import os
import sys
from unittest.mock import MagicMock, patch
import pytest
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
import time
from litellm.litellm_core_utils.litellm_logging import Logging as LitellmLogging
@pytest.fixture
def logging_obj():
return LitellmLogging(
model="bedrock/claude-3-5-sonnet-20240620-v1:0",
messages=[{"role": "user", "content": "Hey"}],
stream=True,
call_type="completion",
start_time=time.time(),
litellm_call_id="12345",
function_id="1245",
)
def test_get_masked_api_base(logging_obj):
api_base = "https://api.openai.com/v1"
masked_api_base = logging_obj._get_masked_api_base(api_base)
assert masked_api_base == "https://api.openai.com/v1"
assert type(masked_api_base) == str
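A hedged companion check (not in the diff) for the "key=" masking branch of `_get_masked_api_base`, reusing the fixture above; the URL and key value are illustrative:

def test_get_masked_api_base_with_key(logging_obj):
    # Everything after "key=" should be replaced with "*****" plus the last
    # four characters of the api_base.
    masked = logging_obj._get_masked_api_base("https://example.com/v1?key=sk-secret-1234")
    assert masked == "https://example.com/v1?key=*****1234"
    assert "sk-secret" not in masked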

View file

@ -1,3 +1,4 @@
import asyncio
import datetime
import json
import os
@ -11,7 +12,13 @@ sys.path.insert(
0, os.path.abspath("../../../..")
) # Adds the parent directory to the system path
from unittest.mock import MagicMock, patch
import litellm
from litellm.proxy._types import SpendLogsPayload
from litellm.proxy.hooks.proxy_track_cost_callback import _ProxyDBLogger
from litellm.proxy.proxy_server import app, prisma_client
from litellm.router import Router
@pytest.fixture
@ -400,3 +407,270 @@ async def test_ui_view_spend_logs_unauthorized(client):
headers={"Authorization": "Bearer invalid-token"},
)
assert response.status_code == 401 or response.status_code == 403
class TestSpendLogsPayload:
@pytest.mark.asyncio
async def test_spend_logs_payload_e2e(self):
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(litellm.proxy.proxy_server, "prisma_client"):
response = await litellm.acompletion(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello, world!"}],
mock_response="Hello, world!",
metadata={"user_api_key_end_user_id": "test_user_1"},
)
assert response.choices[0].message.content == "Hello, world!"
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "gpt-4o",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": null}}',
"cache_key": "Cache OFF",
"spend": 0.00022500000000000002,
"total_tokens": 30,
"prompt_tokens": 10,
"completion_tokens": 20,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "",
"model_group": "",
"model_id": "",
"requester_ip_address": None,
"custom_llm_provider": "openai",
"messages": "{}",
"response": "{}",
}
)
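# dynamic fields (request id and timestamps) are only checked for presence; all other fields must match exactly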
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"
def mock_anthropic_response(*args, **kwargs):
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.headers = {"Content-Type": "application/json"}
mock_response.json.return_value = {
"content": [{"text": "Hi! My name is Claude.", "type": "text"}],
"id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
"model": "claude-3-7-sonnet-20250219",
"role": "assistant",
"stop_reason": "end_turn",
"stop_sequence": None,
"type": "message",
"usage": {"input_tokens": 2095, "output_tokens": 503},
}
return mock_response
@pytest.mark.asyncio
async def test_spend_logs_payload_success_log_with_api_base(self):
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
client = AsyncHTTPHandler()
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(
litellm.proxy.proxy_server, "prisma_client"
), patch.object(
client, "post", side_effect=self.mock_anthropic_response
):
response = await litellm.acompletion(
model="claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "Hello, world!"}],
metadata={"user_api_key_end_user_id": "test_user_1"},
client=client,
)
assert response.choices[0].message.content == "Hi! My name is Claude."
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "claude-3-7-sonnet-20250219",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": {"audio_tokens": null, "cached_tokens": 0, "text_tokens": null, "image_tokens": null}, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}',
"cache_key": "Cache OFF",
"spend": 0.01383,
"total_tokens": 2598,
"prompt_tokens": 2095,
"completion_tokens": 503,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "https://api.anthropic.com/v1/messages",
"model_group": "",
"model_id": "",
"requester_ip_address": None,
"custom_llm_provider": "anthropic",
"messages": "{}",
"response": "{}",
}
)
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"
@pytest.mark.asyncio
async def test_spend_logs_payload_success_log_with_router(self):
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
client = AsyncHTTPHandler()
router = Router(
model_list=[
{
"model_name": "my-anthropic-model-group",
"litellm_params": {
"model": "claude-3-7-sonnet-20250219",
},
"model_info": {
"id": "my-unique-model-id",
},
}
]
)
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(
litellm.proxy.proxy_server, "prisma_client"
), patch.object(
client, "post", side_effect=self.mock_anthropic_response
):
response = await router.acompletion(
model="my-anthropic-model-group",
messages=[{"role": "user", "content": "Hello, world!"}],
metadata={"user_api_key_end_user_id": "test_user_1"},
client=client,
)
assert response.choices[0].message.content == "Hi! My name is Claude."
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "claude-3-7-sonnet-20250219",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": {"audio_tokens": null, "cached_tokens": 0, "text_tokens": null, "image_tokens": null}, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}',
"cache_key": "Cache OFF",
"spend": 0.01383,
"total_tokens": 2598,
"prompt_tokens": 2095,
"completion_tokens": 503,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "https://api.anthropic.com/v1/messages",
"model_group": "my-anthropic-model-group",
"model_id": "my-unique-model-id",
"requester_ip_address": None,
"custom_llm_provider": "anthropic",
"messages": "{}",
"response": "{}",
}
)
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"

View file

@@ -477,6 +477,25 @@ def test_supports_function_calling(model, expected_bool):
pytest.fail(f"Error occurred: {e}")
@pytest.mark.parametrize(
"model, expected_bool",
[
("gpt-4o-mini-search-preview", True),
("openai/gpt-4o-mini-search-preview", True),
("gpt-4o-search-preview", True),
("openai/gpt-4o-search-preview", True),
("groq/deepseek-r1-distill-llama-70b", False),
("groq/llama-3.3-70b-versatile", False),
("codestral/codestral-latest", False),
],
)
def test_supports_web_search(model, expected_bool):
try:
assert litellm.supports_web_search(model=model) == expected_bool
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_get_max_token_unit_test():
"""
More complete testing in `test_completion_cost.py`

View file

@@ -794,7 +794,7 @@ def test_redis_cache_completion():
response3 = completion(
model="gpt-3.5-turbo", messages=messages, caching=True, temperature=0.5
)
response4 = completion(model="azure/chatgpt-v-2", messages=messages, caching=True)
response4 = completion(model="gpt-4o-mini", messages=messages, caching=True)
print("\nresponse 1", response1)
print("\nresponse 2", response2)
@@ -1690,20 +1690,12 @@ def test_cache_context_managers():
print("VARS of litellm.cache", vars(litellm.cache))
# test_cache_context_managers()
@pytest.mark.skip(reason="beta test - new redis semantic cache")
def test_redis_semantic_cache_completion():
litellm.set_verbose = True
import logging
logging.basicConfig(level=logging.DEBUG)
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding /reading from cache
print("testing semantic caching")
litellm.cache = Cache(
type="redis-semantic",
@@ -1718,33 +1710,30 @@ def test_redis_semantic_cache_completion():
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
"content": "write a one sentence poem about summer",
}
],
max_tokens=20,
)
print(f"response1: {response1}")
random_number = random.randint(1, 100000)
response2 = completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
"content": "write a one sentence poem about summertime",
}
],
max_tokens=20,
)
print(f"response2: {response1}")
print(f"response2: {response2}")
assert response1.id == response2.id
# test_redis_cache_completion()
@pytest.mark.skip(reason="beta test - new redis semantic cache")
@pytest.mark.asyncio
async def test_redis_semantic_cache_acompletion():
litellm.set_verbose = True
@@ -1752,38 +1741,32 @@ async def test_redis_semantic_cache_acompletion():
logging.basicConfig(level=logging.DEBUG)
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding / reading from cache
print("testing semantic caching")
litellm.cache = Cache(
type="redis-semantic",
host=os.environ["REDIS_HOST"],
port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"],
similarity_threshold=0.8,
redis_semantic_cache_use_async=True,
similarity_threshold=0.7,
)
response1 = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
"content": "write a one sentence poem about summer",
}
],
max_tokens=5,
)
print(f"response1: {response1}")
random_number = random.randint(1, 100000)
response2 = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
"content": "write a one sentence poem about summertime",
}
],
max_tokens=5,

View file

@@ -0,0 +1,175 @@
{
"id": "chatcmpl-2299b6a2-82a3-465a-b47c-04e685a2227f",
"trace_id": null,
"call_type": "acompletion",
"cache_hit": null,
"stream": true,
"status": "success",
"custom_llm_provider": "openai",
"saved_cache_cost": 0.0,
"startTime": "2025-01-24 09:20:46.847371",
"endTime": "2025-01-24 09:20:46.851954",
"completionStartTime": "2025-01-24 09:20:46.851954",
"response_time": 0.007394075393676758,
"model": "gpt-4o",
"metadata": {
"user_api_key_hash": null,
"user_api_key_alias": null,
"user_api_key_team_id": null,
"user_api_key_org_id": null,
"user_api_key_user_id": null,
"user_api_key_team_alias": null,
"user_api_key_user_email": null,
"spend_logs_metadata": null,
"requester_ip_address": null,
"requester_metadata": null,
"user_api_key_end_user_id": null,
"prompt_management_metadata": null,
"applied_guardrails": []
},
"cache_key": null,
"response_cost": 0.00022500000000000002,
"total_tokens": 30,
"prompt_tokens": 10,
"completion_tokens": 20,
"request_tags": [],
"end_user": "",
"api_base": "",
"model_group": "",
"model_id": "",
"requester_ip_address": null,
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"response": {
"id": "chatcmpl-2299b6a2-82a3-465a-b47c-04e685a2227f",
"created": 1742855151,
"model": "gpt-4o",
"object": "chat.completion",
"system_fingerprint": null,
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "hi",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"usage": {
"completion_tokens": 20,
"prompt_tokens": 10,
"total_tokens": 30,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
},
"model_parameters": {},
"hidden_params": {
"model_id": null,
"cache_key": null,
"api_base": "https://api.openai.com",
"response_cost": 0.00022500000000000002,
"additional_headers": {},
"litellm_overhead_time_ms": null,
"batch_models": null,
"litellm_model_name": "gpt-4o"
},
"model_map_information": {
"model_map_key": "gpt-4o",
"model_map_value": {
"key": "gpt-4o",
"max_tokens": 16384,
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"input_cost_per_token": 2.5e-06,
"cache_creation_input_token_cost": null,
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_character": null,
"input_cost_per_token_above_128k_tokens": null,
"input_cost_per_query": null,
"input_cost_per_second": null,
"input_cost_per_audio_token": null,
"input_cost_per_token_batches": 1.25e-06,
"output_cost_per_token_batches": 5e-06,
"output_cost_per_token": 1e-05,
"output_cost_per_audio_token": null,
"output_cost_per_character": null,
"output_cost_per_token_above_128k_tokens": null,
"output_cost_per_character_above_128k_tokens": null,
"output_cost_per_second": null,
"output_cost_per_image": null,
"output_vector_size": null,
"litellm_provider": "openai",
"mode": "chat",
"supports_system_messages": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"supports_assistant_prefill": false,
"supports_prompt_caching": true,
"supports_audio_input": false,
"supports_audio_output": false,
"supports_pdf_input": false,
"supports_embedding_image_input": false,
"supports_native_streaming": null,
"supports_web_search": true,
"search_context_cost_per_query": {
"search_context_size_low": 0.03,
"search_context_size_medium": 0.035,
"search_context_size_high": 0.05
},
"tpm": null,
"rpm": null,
"supported_openai_params": [
"frequency_penalty",
"logit_bias",
"logprobs",
"top_logprobs",
"max_tokens",
"max_completion_tokens",
"modalities",
"prediction",
"n",
"presence_penalty",
"seed",
"stop",
"stream",
"stream_options",
"temperature",
"top_p",
"tools",
"tool_choice",
"function_call",
"functions",
"max_retries",
"extra_headers",
"parallel_tool_calls",
"audio",
"response_format",
"user"
]
}
},
"error_str": null,
"error_information": {
"error_code": "",
"error_class": "",
"llm_provider": "",
"traceback": "",
"error_message": ""
},
"response_cost_failure_debug_info": null,
"guardrail_information": null,
"standard_built_in_tools_params": {
"web_search_options": null,
"file_search": null
}
}

View file

@@ -0,0 +1,151 @@
import os
import sys
import traceback
import uuid
import pytest
from dotenv import load_dotenv
from fastapi import Request
from fastapi.routing import APIRoute
load_dotenv()
import io
import os
import time
import json
# this file tests litellm/proxy
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
import asyncio
from typing import Optional
from litellm.types.utils import StandardLoggingPayload, Usage, ModelInfoBase
from litellm.integrations.custom_logger import CustomLogger
class TestCustomLogger(CustomLogger):
def __init__(self):
self.recorded_usage: Optional[Usage] = None
self.standard_logging_payload: Optional[StandardLoggingPayload] = None
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
standard_logging_payload = kwargs.get("standard_logging_object")
self.standard_logging_payload = standard_logging_payload
print(
"standard_logging_payload",
json.dumps(standard_logging_payload, indent=4, default=str),
)
self.recorded_usage = Usage(
prompt_tokens=standard_logging_payload.get("prompt_tokens"),
completion_tokens=standard_logging_payload.get("completion_tokens"),
total_tokens=standard_logging_payload.get("total_tokens"),
)
async def _setup_web_search_test():
"""Helper function to setup common test requirements"""
litellm._turn_on_debug()
test_custom_logger = TestCustomLogger()
litellm.callbacks = [test_custom_logger]
return test_custom_logger
async def _verify_web_search_cost(test_custom_logger, expected_context_size):
"""Helper function to verify web search costs"""
await asyncio.sleep(1)
standard_logging_payload = test_custom_logger.standard_logging_payload
response_cost = standard_logging_payload.get("response_cost")
assert response_cost is not None
# Calculate token cost
model_map_information = standard_logging_payload["model_map_information"]
model_map_value: ModelInfoBase = model_map_information["model_map_value"]
total_token_cost = (
standard_logging_payload["prompt_tokens"]
* model_map_value["input_cost_per_token"]
) + (
standard_logging_payload["completion_tokens"]
* model_map_value["output_cost_per_token"]
)
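# expected cost = per-token cost plus the flat per-query web search charge for the given context size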
# Verify total cost
assert (
response_cost
== total_token_cost
+ model_map_value["search_context_cost_per_query"][expected_context_size]
)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"web_search_options,expected_context_size",
[
(None, "search_context_size_medium"),
({"search_context_size": "low"}, "search_context_size_low"),
({"search_context_size": "high"}, "search_context_size_high"),
],
)
async def test_openai_web_search_logging_cost_tracking(
web_search_options, expected_context_size
):
"""Test web search cost tracking with different search context sizes"""
test_custom_logger = await _setup_web_search_test()
request_kwargs = {
"model": "openai/gpt-4o-search-preview",
"messages": [
{"role": "user", "content": "What was a positive news story from today?"}
],
}
if web_search_options is not None:
request_kwargs["web_search_options"] = web_search_options
response = await litellm.acompletion(**request_kwargs)
await _verify_web_search_cost(test_custom_logger, expected_context_size)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"tools_config,expected_context_size,stream",
[
(
[{"type": "web_search_preview", "search_context_size": "low"}],
"search_context_size_low",
True,
),
(
[{"type": "web_search_preview", "search_context_size": "low"}],
"search_context_size_low",
False,
),
([{"type": "web_search_preview"}], "search_context_size_medium", True),
([{"type": "web_search_preview"}], "search_context_size_medium", False),
],
)
async def test_openai_responses_api_web_search_cost_tracking(
tools_config, expected_context_size, stream
):
"""Test web search cost tracking with different search context sizes and streaming options"""
test_custom_logger = await _setup_web_search_test()
response = await litellm.aresponses(
model="openai/gpt-4o",
input=[
{"role": "user", "content": "What was a positive news story from today?"}
],
tools=tools_config,
stream=stream,
)
if stream is True:
async for chunk in response:
print("chunk", chunk)
else:
print("response", response)
await _verify_web_search_cost(test_custom_logger, expected_context_size)

View file

@@ -6,6 +6,7 @@ import sys
sys.path.insert(0, os.path.abspath("../.."))
import asyncio
import litellm
import gzip
import json
import logging
@@ -48,8 +49,15 @@ def assert_gcs_pubsub_request_matches_expected(
expected_request_body = json.load(f)
# Replace dynamic values in actual request body
time_fields = ["startTime", "endTime", "completionStartTime", "request_id"]
for field in time_fields:
dynamic_fields = [
"startTime",
"endTime",
"completionStartTime",
"request_id",
"id",
"response_time",
]
for field in dynamic_fields:
if field in actual_request_body:
actual_request_body[field] = expected_request_body[field]
@@ -59,6 +67,55 @@
), f"Difference in request bodies: {json.dumps(actual_request_body, indent=2)} != {json.dumps(expected_request_body, indent=2)}"
def assert_gcs_pubsub_request_matches_expected_standard_logging_payload(
actual_request_body: dict,
expected_file_name: str,
):
"""
Helper function to compare actual GCS PubSub request body with expected JSON file.
Args:
actual_request_body (dict): The actual request body received from the API call
expected_file_name (str): Name of the JSON file containing expected request body
"""
# Get the current directory and read the expected request body
pwd = os.path.dirname(os.path.realpath(__file__))
expected_body_path = os.path.join(pwd, "gcs_pub_sub_body", expected_file_name)
with open(expected_body_path, "r") as f:
expected_request_body = json.load(f)
# Fields checked for presence in the actual request body; dynamic response values are normalized first
FIELDS_TO_VALIDATE = [
"custom_llm_provider",
"hidden_params",
"messages",
"response",
"model",
"status",
"stream",
]
actual_request_body["response"]["id"] = expected_request_body["response"]["id"]
actual_request_body["response"]["created"] = expected_request_body["response"][
"created"
]
for field in FIELDS_TO_VALIDATE:
assert field in actual_request_body
FIELDS_EXISTENCE_CHECKS = [
"response_cost",
"response_time",
"completion_tokens",
"prompt_tokens",
"total_tokens",
]
for field in FIELDS_EXISTENCE_CHECKS:
assert field in actual_request_body
@pytest.mark.asyncio
async def test_async_gcs_pub_sub():
# Create a mock for the async_httpx_client's post method
@@ -102,6 +159,61 @@ async def test_async_gcs_pub_sub():
decoded_message = base64.b64decode(encoded_message).decode("utf-8")
# Parse the JSON string into a dictionary
actual_request = json.loads(decoded_message)
print("##########\n")
print(json.dumps(actual_request, indent=4))
print("##########\n")
# Verify the request body matches expected format
assert_gcs_pubsub_request_matches_expected_standard_logging_payload(
actual_request, "standard_logging_payload.json"
)
@pytest.mark.asyncio
async def test_async_gcs_pub_sub_v1():
# Create a mock for the async_httpx_client's post method
litellm.gcs_pub_sub_use_v1 = True
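# v1 mode publishes directly to the Pub/Sub REST endpoint (the publish URL is asserted below)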
mock_post = AsyncMock()
mock_post.return_value.status_code = 202
mock_post.return_value.text = "Accepted"
# Initialize the GcsPubSubLogger and set the mock
gcs_pub_sub_logger = GcsPubSubLogger(flush_interval=1)
gcs_pub_sub_logger.async_httpx_client.post = mock_post
mock_construct_request_headers = AsyncMock()
mock_construct_request_headers.return_value = {"Authorization": "Bearer mock_token"}
gcs_pub_sub_logger.construct_request_headers = mock_construct_request_headers
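# auth header construction is mocked, so the test should not need real GCP credentials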
litellm.callbacks = [gcs_pub_sub_logger]
# Make the completion call
response = await litellm.acompletion(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello, world!"}],
mock_response="hi",
)
await asyncio.sleep(3) # Wait for async flush
# Assert httpx post was called
mock_post.assert_called_once()
# Get the actual request body from the mock
actual_url = mock_post.call_args[1]["url"]
print("sent to url", actual_url)
assert (
actual_url
== "https://pubsub.googleapis.com/v1/projects/reliableKeys/topics/litellmDB:publish"
)
actual_request = mock_post.call_args[1]["json"]
# Extract and decode the base64 encoded message
encoded_message = actual_request["messages"][0]["data"]
import base64
decoded_message = base64.b64decode(encoded_message).decode("utf-8")
# Parse the JSON string into a dictionary
actual_request = json.loads(decoded_message)
print("##########\n")

View file

@@ -21,16 +21,18 @@ sys.path.insert(
import litellm
import asyncio
from typing import Optional
from litellm.types.utils import StandardLoggingPayload, Usage
from litellm.types.utils import StandardLoggingPayload, Usage, ModelInfoBase
from litellm.integrations.custom_logger import CustomLogger
class TestCustomLogger(CustomLogger):
def __init__(self):
self.recorded_usage: Optional[Usage] = None
self.standard_logging_payload: Optional[StandardLoggingPayload] = None
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
standard_logging_payload = kwargs.get("standard_logging_object")
self.standard_logging_payload = standard_logging_payload
print(
"standard_logging_payload",
json.dumps(standard_logging_payload, indent=4, default=str),