Merge branch 'main' into litellm_exp_mcp_server

Ishaan Jaff 2025-03-24 19:03:56 -07:00
commit c6424d6246
58 changed files with 2991 additions and 627 deletions

View file

@ -1855,7 +1855,7 @@ jobs:
command: | command: |
docker run -d \ docker run -d \
-p 4000:4000 \ -p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \ -e DATABASE_URL=$CLEAN_STORE_MODEL_IN_DB_DATABASE_URL \
-e STORE_MODEL_IN_DB="True" \ -e STORE_MODEL_IN_DB="True" \
-e LITELLM_MASTER_KEY="sk-1234" \ -e LITELLM_MASTER_KEY="sk-1234" \
-e LITELLM_LICENSE=$LITELLM_LICENSE \ -e LITELLM_LICENSE=$LITELLM_LICENSE \

View file

@ -4,7 +4,8 @@ python-dotenv
tiktoken tiktoken
importlib_metadata importlib_metadata
cohere cohere
redis redis==5.2.1
redisvl==0.4.1
anthropic anthropic
orjson==3.9.15 orjson==3.9.15
pydantic==2.10.2 pydantic==2.10.2

1
.gitignore vendored
View file

@ -1,3 +1,4 @@
.python-version
.venv .venv
.env .env
.newenv .newenv

View file

@ -37,9 +37,6 @@ RUN pip install dist/*.whl
# install dependencies as wheels # install dependencies as wheels
RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps
# ensure pyjwt is used, not jwt # ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y RUN pip uninstall PyJWT -y

View file

@ -1,35 +1,5 @@
version: "3.11" version: "3.11"
services: services:
litellm:
build:
context: .
args:
target: runtime
image: ghcr.io/berriai/litellm:main-stable
#########################################
## Uncomment these lines to start proxy with a config.yaml file ##
# volumes:
# - ./config.yaml:/app/config.yaml <<- this is missing in the docker-compose file currently
# command:
# - "--config=/app/config.yaml"
##############################################
ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary
environment:
DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
env_file:
- .env # Load local .env file
depends_on:
- db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
healthcheck: # Defines the health check configuration for the container
test: [ "CMD", "curl", "-f", "http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
interval: 30s # Perform health check every 30 seconds
timeout: 10s # Health check command times out after 10 seconds
retries: 3 # Retry up to 3 times if health check fails
start_period: 40s # Wait 40 seconds after container start before beginning health checks
db: db:
image: postgres:16 image: postgres:16
restart: always restart: always
@ -46,25 +16,3 @@ services:
interval: 1s interval: 1s
timeout: 5s timeout: 5s
retries: 10 retries: 10
prometheus:
image: prom/prometheus
volumes:
- prometheus_data:/prometheus
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
restart: always
volumes:
prometheus_data:
driver: local
postgres_data:
name: litellm_postgres_data # Named volume for Postgres data persistence
# ...rest of your docker-compose config if any

View file

@ -59,9 +59,6 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file # Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps
# ensure pyjwt is used, not jwt # ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y RUN pip uninstall PyJWT -y

View file

@ -14,7 +14,7 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Install build dependencies # Install build dependencies
RUN apt-get clean && apt-get update && \ RUN apt-get clean && apt-get update && \
apt-get install -y gcc python3-dev && \ apt-get install -y gcc g++ python3-dev && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir --upgrade pip && \ RUN pip install --no-cache-dir --upgrade pip && \
@ -56,10 +56,8 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file # Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
# ensure pyjwt is used, not jwt # ensure pyjwt is used, not jwt
RUN pip install redisvl==0.0.7 --no-deps --no-cache-dir && \ RUN pip uninstall jwt -y && \
pip uninstall jwt -y && \
pip uninstall PyJWT -y && \ pip uninstall PyJWT -y && \
pip install PyJWT==2.9.0 --no-cache-dir pip install PyJWT==2.9.0 --no-cache-dir

View file

@ -26,7 +26,7 @@ Install redis
pip install redis pip install redis
``` ```
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ For the hosted version you can setup your own Redis DB here: https://redis.io/try-free/
```python ```python
import litellm import litellm
@ -91,12 +91,12 @@ response2 = completion(
<TabItem value="redis-sem" label="redis-semantic cache"> <TabItem value="redis-sem" label="redis-semantic cache">
Install redis Install redisvl client
```shell ```shell
pip install redisvl==0.0.7 pip install redisvl==0.4.1
``` ```
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ For the hosted version you can setup your own Redis DB here: https://redis.io/try-free/
```python ```python
import litellm import litellm
@ -114,6 +114,7 @@ litellm.cache = Cache(
port=os.environ["REDIS_PORT"], port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"], password=os.environ["REDIS_PASSWORD"],
similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity
ttl=120,
redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
) )
response1 = completion( response1 = completion(
@ -471,11 +472,13 @@ def __init__(
password: Optional[str] = None, password: Optional[str] = None,
namespace: Optional[str] = None, namespace: Optional[str] = None,
default_in_redis_ttl: Optional[float] = None, default_in_redis_ttl: Optional[float] = None,
similarity_threshold: Optional[float] = None,
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
redis_flush_size=None, redis_flush_size=None,
# redis semantic cache params
similarity_threshold: Optional[float] = None,
redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
redis_semantic_cache_index_name: Optional[str] = None,
# s3 Bucket, boto3 configuration # s3 Bucket, boto3 configuration
s3_bucket_name: Optional[str] = None, s3_bucket_name: Optional[str] = None,
s3_region_name: Optional[str] = None, s3_region_name: Optional[str] = None,

View file

@ -200,3 +200,92 @@ Expected Response
</TabItem> </TabItem>
</Tabs> </Tabs>
## OpenAI 'file' message type
This is currently only supported for OpenAI models.
This will be supported for all providers soon.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import base64
from litellm import completion
with open("draconomicon.pdf", "rb") as f:
data = f.read()
base64_string = base64.b64encode(data).decode("utf-8")
response = completion(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
},
{
"type": "text",
"text": "What is the first dragon in the book?",
}
],
},
],
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: openai-model
litellm_params:
model: gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "openai-model",
"messages": [
{"role": "user", "content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
}
]}
]
}'
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,308 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Using Web Search
Use web search with LiteLLM
| Feature | Details |
|---------|---------|
| Supported Endpoints | - `/chat/completions` <br/> - `/responses` |
| Supported Providers | `openai` |
| LiteLLM Cost Tracking | ✅ Supported |
| LiteLLM Version | `v1.63.15-nightly` or higher |
## `/chat/completions` (litellm.completion)
### Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import completion
response = completion(
model="openai/gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?",
}
],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o-search-preview
litellm_params:
model: openai/gpt-4o-search-preview
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
]
)
```
</TabItem>
</Tabs>
### Search context size
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import completion
# Customize search context size
response = completion(
model="openai/gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?",
}
],
web_search_options={
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
# Customize search context size
response = client.chat.completions.create(
model="gpt-4o-search-preview",
messages=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
web_search_options={
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}
)
```
</TabItem>
</Tabs>
## `/responses` (litellm.responses)
### Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import responses
response = responses(
model="openai/gpt-4o",
input=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
tools=[{
"type": "web_search_preview" # enables web search with default medium context size
}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4o
litellm_params:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
response = client.responses.create(
model="gpt-4o",
tools=[{
"type": "web_search_preview"
}],
input="What was a positive news story from today?",
)
print(response.output_text)
```
</TabItem>
</Tabs>
### Search context size
<Tabs>
<TabItem value="sdk" label="SDK">
```python showLineNumbers
from litellm import responses
# Customize search context size
response = responses(
model="openai/gpt-4o",
input=[
{
"role": "user",
"content": "What was a positive news story from today?"
}
],
tools=[{
"type": "web_search_preview",
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```python showLineNumbers
from openai import OpenAI
# Point to your proxy server
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
# Customize search context size
response = client.responses.create(
model="gpt-4o",
tools=[{
"type": "web_search_preview",
"search_context_size": "low" # Options: "low", "medium" (default), "high"
}],
input="What was a positive news story from today?",
)
print(response.output_text)
```
</TabItem>
</Tabs>
## Checking if a model supports web search
<Tabs>
<TabItem label="SDK" value="sdk">
Use `litellm.supports_web_search(model="openai/gpt-4o-search-preview")` -> returns `True` if model can perform web searches
```python showLineNumbers
assert litellm.supports_web_search(model="openai/gpt-4o-search-preview") == True
```
</TabItem>
<TabItem label="PROXY" value="proxy">
1. Define OpenAI models in config.yaml
```yaml
model_list:
- model_name: gpt-4o-search-preview
litellm_params:
model: openai/gpt-4o-search-preview
api_key: os.environ/OPENAI_API_KEY
model_info:
supports_web_search: True
```
2. Run proxy server
```bash
litellm --config config.yaml
```
3. Call `/model_group/info` to check if a model supports web search
```shell
curl -X 'GET' \
'http://localhost:4000/model_group/info' \
-H 'accept: application/json' \
-H 'x-api-key: sk-1234'
```
Expected Response
```json showLineNumbers
{
"data": [
{
"model_group": "gpt-4o-search-preview",
"providers": ["openai"],
"max_tokens": 128000,
"supports_web_search": true, # 👈 supports_web_search is true
}
]
}
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,66 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# SSL Security Settings
If you're in an environment using an older TLS bundle with older encryption, follow this guide.
LiteLLM uses HTTPX for network requests, unless otherwise specified.
1. Disable SSL verification
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.ssl_verify = False
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
ssl_verify: false
```
</TabItem>
<TabItem value="env_var" label="Environment Variables">
```bash
export SSL_VERIFY="False"
```
</TabItem>
</Tabs>
2. Lower security settings
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.ssl_security_level = 1
litellm.ssl_certificate = "/path/to/certificate.pem"
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
ssl_security_level: 1
ssl_certificate: "/path/to/certificate.pem"
```
</TabItem>
<TabItem value="env_var" label="Environment Variables">
```bash
export SSL_SECURITY_LEVEL="1"
export SSL_CERTIFICATE="/path/to/certificate.pem"
```
</TabItem>
</Tabs>

View file

@ -1,4 +1,7 @@
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Arize AI # Arize AI
@ -11,6 +14,8 @@ https://github.com/BerriAI/litellm
::: :::
<Image img={require('../../img/arize.png')} />
## Pre-Requisites ## Pre-Requisites
@ -24,7 +29,9 @@ You can also use the instrumentor option instead of the callback, which you can
```python ```python
litellm.callbacks = ["arize"] litellm.callbacks = ["arize"]
``` ```
```python ```python
import litellm import litellm
import os import os
@ -48,7 +55,7 @@ response = litellm.completion(
### Using with LiteLLM Proxy ### Using with LiteLLM Proxy
1. Setup config.yaml
```yaml ```yaml
model_list: model_list:
- model_name: gpt-4 - model_name: gpt-4
@ -60,13 +67,134 @@ model_list:
litellm_settings: litellm_settings:
callbacks: ["arize"] callbacks: ["arize"]
general_settings:
master_key: "sk-1234" # can also be set as an environment variable
environment_variables: environment_variables:
ARIZE_SPACE_KEY: "d0*****" ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****" ARIZE_API_KEY: "141a****"
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT or Neither (defaults to https://otlp.arize.com/v1 on grpc)
``` ```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{ "model": "gpt-4", "messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}]}'
```
## Pass Arize Space/Key per-request
Supported parameters:
- `arize_api_key`
- `arize_space_key`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set arize as a callback, litellm will send the data to arize
litellm.callbacks = ["arize"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
arize_api_key=os.getenv("ARIZE_SPACE_2_API_KEY"),
arize_space_key=os.getenv("ARIZE_SPACE_2_KEY"),
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
general_settings:
master_key: "sk-1234" # can also be set as an environment variable
```
2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
<Tabs>
<TabItem value="curl" label="CURL">
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}],
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
"arize_space_key": "ARIZE_SPACE_2_KEY"
}'
```
</TabItem>
<TabItem value="openai_python" label="OpenAI Python">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
"arize_space_key": "ARIZE_SPACE_2_KEY"
}
)
print(response)
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Support & Talk to Founders ## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) - [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)

View file

@ -291,14 +291,15 @@ response = completion(
) )
``` ```
## Azure O1 Models ## O-Series Models
| Model Name | Function Call | Azure OpenAI O-Series models are supported on LiteLLM.
|---------------------|----------------------------------------------------|
| o1-mini | `response = completion(model="azure/<your deployment name>", messages=messages)` |
| o1-preview | `response = completion(model="azure/<your deployment name>", messages=messages)` |
Set `litellm.enable_preview_features = True` to use Azure O1 Models with streaming support. LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
**Automatic Routing**
<Tabs> <Tabs>
<TabItem value="sdk" label="SDK"> <TabItem value="sdk" label="SDK">
@ -306,32 +307,88 @@ Set `litellm.enable_preview_features = True` to use Azure O1 Models with streami
```python ```python
import litellm import litellm
litellm.enable_preview_features = True # 👈 KEY CHANGE litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
response = litellm.completion(
model="azure/<your deployment name>",
messages=[{"role": "user", "content": "What is the weather like in Boston?"}],
stream=True
)
for chunk in response:
print(chunk)
``` ```
</TabItem> </TabItem>
<TabItem value="proxy" label="Proxy"> <TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml ```yaml
model_list: model_list:
- model_name: o1-mini - model_name: o3-mini
litellm_params: litellm_params:
model: azure/o1-mini model: azure/o3-model
api_base: "os.environ/AZURE_API_BASE" api_base: os.environ/AZURE_API_BASE
api_key: "os.environ/AZURE_API_KEY" api_key: os.environ/AZURE_API_KEY
api_version: "os.environ/AZURE_API_VERSION" ```
litellm_settings: </TabItem>
enable_preview_features: true # 👈 KEY CHANGE </Tabs>
**Explicit Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o_series/my-random-deployment-name
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
## Azure Audio Model
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
response = completion(
model="azure/azure-openai-4o-audio",
messages=[
{
"role": "user",
"content": "I want to try out speech to speech"
}
],
modalities=["text","audio"],
audio={"voice": "alloy", "format": "wav"}
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: azure-openai-4o-audio
litellm_params:
model: azure/azure-openai-4o-audio
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
``` ```
2. Start proxy 2. Start proxy
@ -340,26 +397,22 @@ litellm_settings:
litellm --config /path/to/config.yaml litellm --config /path/to/config.yaml
``` ```
3. Test it 3. Test it!
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="o1-mini", messages = [ ```bash
{ curl http://localhost:4000/v1/chat/completions \
"role": "user", -H "Authorization: Bearer $LITELLM_API_KEY" \
"content": "this is a test request, write a short poem" -H "Content-Type: application/json" \
} -d '{
], "model": "azure-openai-4o-audio",
stream=True) "messages": [{"role": "user", "content": "I want to try out speech to speech"}],
"modalities": ["text","audio"],
for chunk in response: "audio": {"voice": "alloy", "format": "wav"}
print(chunk) }'
``` ```
</TabItem> </TabItem>
</Tabs> </Tabs>
@ -948,62 +1001,9 @@ Expected Response:
{"data":[{"id":"batch_R3V...} {"data":[{"id":"batch_R3V...}
``` ```
## O-Series Models
Azure OpenAI O-Series models are supported on LiteLLM.
LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
**Automatic Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o3-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>
**Explicit Routing**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: o3-mini
litellm_params:
model: azure/o_series/my-random-deployment-name
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
```
</TabItem>
</Tabs>

View file

@ -1428,10 +1428,14 @@ response = litellm.embedding(
## Supported AWS Bedrock Models ## Supported AWS Bedrock Models
LiteLLM supports ALL Bedrock models.
Here's an example of using a bedrock model with LiteLLM. For a complete list, refer to the [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) Here's an example of using a bedrock model with LiteLLM. For a complete list, refer to the [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
| Model Name | Command | | Model Name | Command |
|----------------------------|------------------------------------------------------------------| |----------------------------|------------------------------------------------------------------|
| Deepseek R1 | `completion(model='bedrock/us.deepseek.r1-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3.5 Sonnet | `completion(model='bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` | | Anthropic Claude-V3.5 Sonnet | `completion(model='bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 sonnet | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` | | Anthropic Claude-V3 sonnet | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 Haiku | `completion(model='bedrock/anthropic.claude-3-haiku-20240307-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` | | Anthropic Claude-V3 Haiku | `completion(model='bedrock/anthropic.claude-3-haiku-20240307-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
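For example, here is a minimal sketch of calling the newly added Deepseek R1 entry (the AWS credential and region values below are placeholders; any standard boto3 auth mechanism should also work):

```python
import os
from litellm import completion

# Placeholder AWS credentials/region - replace with your own values
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = "us-east-1"

response = completion(
    model="bedrock/us.deepseek.r1-v1:0",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
print(response.choices[0].message.content)
```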

View file

@ -202,6 +202,67 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
</TabItem> </TabItem>
</Tabs> </Tabs>
## Using Ollama FIM on `/v1/completions`
LiteLLM supports calling Ollama's `/api/generate` endpoint on `/v1/completions` requests.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
litellm._turn_on_debug() # turn on debug to see the request
from litellm import completion
response = completion(
model="ollama/llama3.1",
prompt="Hello, world!",
api_base="http://localhost:11434"
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: "llama3.1"
litellm_params:
model: "ollama/llama3.1"
api_base: "http://localhost:11434"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml --detailed_debug
# RUNNING ON http://0.0.0.0:4000
```
3. Test it!
```python
from openai import OpenAI
client = OpenAI(
api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set)
base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL
)
response = client.completions.create(
model="ollama/llama3.1",
prompt="Hello, world!",
api_base="http://localhost:11434"
)
print(response)
```
</TabItem>
</Tabs>
## Using ollama `api/chat` ## Using ollama `api/chat`
In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat` In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat`

View file

@ -228,6 +228,92 @@ response = completion(
``` ```
## PDF File Parsing
OpenAI has a new `file` message type that allows you to pass in a PDF file and have it parsed into a structured output. [Read more](https://platform.openai.com/docs/guides/pdf-files?api-mode=chat&lang=python)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import base64
from litellm import completion
with open("draconomicon.pdf", "rb") as f:
data = f.read()
base64_string = base64.b64encode(data).decode("utf-8")
response = completion(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
},
{
"type": "text",
"text": "What is the first dragon in the book?",
}
],
},
],
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: openai-model
litellm_params:
model: gpt-4o
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "openai-model",
"messages": [
{"role": "user", "content": [
{
"type": "file",
"file": {
"filename": "draconomicon.pdf",
"file_data": f"data:application/pdf;base64,{base64_string}",
}
}
]}
]
}'
```
</TabItem>
</Tabs>
## OpenAI Fine Tuned Models ## OpenAI Fine Tuned Models
| Model Name | Function Call | | Model Name | Function Call |
@ -449,26 +535,6 @@ response = litellm.acompletion(
) )
``` ```
### Using Helicone Proxy with LiteLLM
```python
import os
import litellm
from litellm import completion
os.environ["OPENAI_API_KEY"] = ""
# os.environ["OPENAI_API_BASE"] = ""
litellm.api_base = "https://oai.hconeai.com/v1"
litellm.headers = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",
"Helicone-Cache-Enabled": "true",
}
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = completion("gpt-3.5-turbo", messages)
```
### Using OpenAI Proxy with LiteLLM ### Using OpenAI Proxy with LiteLLM
```python ```python

View file

@ -10,9 +10,11 @@ LiteLLM supports all the text / chat / vision models from [OpenRouter](https://o
import os import os
from litellm import completion from litellm import completion
os.environ["OPENROUTER_API_KEY"] = "" os.environ["OPENROUTER_API_KEY"] = ""
os.environ["OPENROUTER_API_BASE"] = "" # [OPTIONAL] defaults to https://openrouter.ai/api/v1
os.environ["OR_SITE_URL"] = "" # optional
os.environ["OR_APP_NAME"] = "" # optional os.environ["OR_SITE_URL"] = "" # [OPTIONAL]
os.environ["OR_APP_NAME"] = "" # [OPTIONAL]
response = completion( response = completion(
model="openrouter/google/palm-2-chat-bison", model="openrouter/google/palm-2-chat-bison",

View file

@ -70,6 +70,21 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit
response: str, response: str,
): ):
pass pass
async def async_post_call_streaming_iterator_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_data: dict,
) -> AsyncGenerator[ModelResponseStream, None]:
"""
Passes the entire stream to the guardrail
This is useful for plugins that need to see the entire stream.
"""
async for item in response:
yield item
proxy_handler_instance = MyCustomHandler() proxy_handler_instance = MyCustomHandler()
``` ```

View file

@ -147,6 +147,7 @@ general_settings:
|------|------|-------------| |------|------|-------------|
| completion_model | string | The default model to use for completions when `model` is not specified in the request | | completion_model | string | The default model to use for completions when `model` is not specified in the request |
| disable_spend_logs | boolean | If true, turns off writing each transaction to the database | | disable_spend_logs | boolean | If true, turns off writing each transaction to the database |
| disable_spend_updates | boolean | If true, turns off all spend updates to the DB. Including key/user/team spend updates. |
| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) | | disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) |
| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached | | disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached |
| disable_reset_budget | boolean | If true, turns off reset budget scheduled task | | disable_reset_budget | boolean | If true, turns off reset budget scheduled task |

View file

@ -10,10 +10,12 @@ Use this is you want to write code to run a custom guardrail
### 1. Write a `CustomGuardrail` Class ### 1. Write a `CustomGuardrail` Class
A CustomGuardrail has 3 methods to enforce guardrails A CustomGuardrail has 4 methods to enforce guardrails
- `async_pre_call_hook` - (Optional) modify input or reject request before making LLM API call - `async_pre_call_hook` - (Optional) modify input or reject request before making LLM API call
- `async_moderation_hook` - (Optional) reject request, runs while making LLM API call (help to lower latency) - `async_moderation_hook` - (Optional) reject request, runs while making LLM API call (help to lower latency)
- `async_post_call_success_hook`- (Optional) apply guardrail on input/output, runs after making LLM API call - `async_post_call_success_hook`- (Optional) apply guardrail on input/output, runs after making LLM API call
- `async_post_call_streaming_iterator_hook` - (Optional) pass the entire stream to the guardrail
**[See detailed spec of methods here](#customguardrail-methods)** **[See detailed spec of methods here](#customguardrail-methods)**
@ -128,6 +130,23 @@ class myCustomGuardrail(CustomGuardrail):
): ):
raise ValueError("Guardrail failed Coffee Detected") raise ValueError("Guardrail failed Coffee Detected")
async def async_post_call_streaming_iterator_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_data: dict,
) -> AsyncGenerator[ModelResponseStream, None]:
"""
Passes the entire stream to the guardrail
This is useful for guardrails that need to see the entire response, such as PII masking.
See Aim guardrail implementation for an example - https://github.com/BerriAI/litellm/blob/d0e022cfacb8e9ebc5409bb652059b6fd97b45c0/litellm/proxy/guardrails/guardrail_hooks/aim.py#L168
Triggered by mode: 'post_call'
"""
async for item in response:
yield item
``` ```

View file

@ -79,6 +79,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
| `response_cost` | `Optional[str]` | Optional response cost | | `response_cost` | `Optional[str]` | Optional response cost |
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers | | `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation | | `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
| `litellm_model_name` | `Optional[str]` | Model name sent in request |
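A minimal sketch of reading the new field from a custom callback, assuming the standard logging payload is exposed to callbacks as `kwargs["standard_logging_object"]`:

```python
from litellm.integrations.custom_logger import CustomLogger


class ModelNameLogger(CustomLogger):
    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        # standard_logging_object carries the StandardLoggingPayload for this request
        payload = kwargs.get("standard_logging_object") or {}
        hidden_params = payload.get("hidden_params") or {}
        # litellm_model_name = the model name that was actually sent to the provider
        print("litellm_model_name:", hidden_params.get("litellm_model_name"))
```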
## StandardLoggingModelInformation ## StandardLoggingModelInformation

View file

@ -43,19 +43,19 @@ These headers are useful for clients to understand the current rate limit status
| `x-litellm-max-fallbacks` | int | Maximum number of fallback attempts allowed | | `x-litellm-max-fallbacks` | int | Maximum number of fallback attempts allowed |
## Cost Tracking Headers ## Cost Tracking Headers
| Header | Type | Description | | Header | Type | Description | Available on Pass-Through Endpoints |
|--------|------|-------------| |--------|------|-------------|-------------|
| `x-litellm-response-cost` | float | Cost of the API call | | `x-litellm-response-cost` | float | Cost of the API call | |
| `x-litellm-key-spend` | float | Total spend for the API key | | `x-litellm-key-spend` | float | Total spend for the API key | ✅ |
## LiteLLM Specific Headers ## LiteLLM Specific Headers
| Header | Type | Description | | Header | Type | Description | Available on Pass-Through Endpoints |
|--------|------|-------------| |--------|------|-------------|-------------|
| `x-litellm-call-id` | string | Unique identifier for the API call | | `x-litellm-call-id` | string | Unique identifier for the API call | ✅ |
| `x-litellm-model-id` | string | Unique identifier for the model used | | `x-litellm-model-id` | string | Unique identifier for the model used | |
| `x-litellm-model-api-base` | string | Base URL of the API endpoint | | `x-litellm-model-api-base` | string | Base URL of the API endpoint | ✅ |
| `x-litellm-version` | string | Version of LiteLLM being used | | `x-litellm-version` | string | Version of LiteLLM being used | |
| `x-litellm-model-group` | string | Model group identifier | | `x-litellm-model-group` | string | Model group identifier | |
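As a minimal sketch, these headers can be read off the raw proxy response (the proxy URL and key below are placeholders; `httpx` is used here since LiteLLM itself relies on HTTPX):

```python
import httpx

resp = httpx.post(
    "http://0.0.0.0:4000/chat/completions",       # placeholder proxy URL
    headers={"Authorization": "Bearer sk-1234"},  # placeholder proxy key
    json={"model": "gpt-4o", "messages": [{"role": "user", "content": "hi"}]},
)
# LiteLLM-specific headers documented above
print(resp.headers.get("x-litellm-call-id"))
print(resp.headers.get("x-litellm-response-cost"))
print(resp.headers.get("x-litellm-model-api-base"))
```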
## Response headers from LLM providers ## Response headers from LLM providers

Binary image file added (707 KiB); not shown.

View file

@ -26,14 +26,6 @@ This release is primarily focused on:
- UI - Credential Management, re-use credentials when adding new models - UI - Credential Management, re-use credentials when adding new models
- UI - Test Connection to LLM Provider before adding a model - UI - Test Connection to LLM Provider before adding a model
:::info
This release will be live on 03/16/2025
:::
<!-- <Image img={require('../../img/release_notes/v16311_release.jpg')} /> -->
## Known Issues ## Known Issues
- 🚨 Known issue on Azure OpenAI - We don't recommend upgrading if you use Azure OpenAI. This version failed our Azure OpenAI load test - 🚨 Known issue on Azure OpenAI - We don't recommend upgrading if you use Azure OpenAI. This version failed our Azure OpenAI load test

View file

@ -0,0 +1,130 @@
---
title: v1.63.14-stable
slug: v1.63.14-stable
date: 2025-03-22T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: [credential management, thinking content, responses api, snowflake]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
These are the changes since `v1.63.11-stable`.
This release brings:
- LLM Translation Improvements (MCP Support and Bedrock Application Profiles)
- Perf improvements for Usage-based Routing
- Streaming guardrail support via websockets
## Docker Run LiteLLM Proxy
```shell
docker run \
-e STORE_MODEL_IN_DB=True \
-p 4000:4000 \
ghcr.io/berriai/litellm:main-v1.63.14-stable
```
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
- Azure gpt-4o - fixed pricing to latest global pricing - [PR](https://github.com/BerriAI/litellm/pull/9361)
- O1-Pro - add pricing + model information - [PR](https://github.com/BerriAI/litellm/pull/9397)
- Azure AI - mistral 3.1 small pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453)
- Azure - gpt-4.5-preview pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453)
## LLM Translation
1. **New LLM Features**
- Bedrock: Support bedrock application inference profiles [Docs](https://docs.litellm.ai/docs/providers/bedrock#bedrock-application-inference-profile)
- Infer aws region from bedrock application profile id - (`arn:aws:bedrock:us-east-1:...`)
- Ollama - support calling via `/v1/completions` [Get Started](../../docs/providers/ollama#using-ollama-fim-on-v1completions)
- Bedrock - support `us.deepseek.r1-v1:0` model name [Docs](../../docs/providers/bedrock#supported-aws-bedrock-models)
- OpenRouter - `OPENROUTER_API_BASE` env var support [Docs](../../docs/providers/openrouter.md)
- Azure - add audio model parameter support - [Docs](../../docs/providers/azure#azure-audio-model)
- OpenAI - PDF File support [Docs](../../docs/completion/document_understanding#openai-file-message-type)
- OpenAI - o1-pro Responses API streaming support [Docs](../../docs/response_api.md#streaming)
- [BETA] MCP - Use MCP Tools with LiteLLM SDK [Docs](../../docs/mcp)
2. **Bug Fixes**
- Voyage: prompt token on embedding tracking fix - [PR](https://github.com/BerriAI/litellm/commit/56d3e75b330c3c3862dc6e1c51c1210e48f1068e)
- Sagemaker - Fix "Too little data for declared Content-Length" error - [PR](https://github.com/BerriAI/litellm/pull/9326)
- OpenAI-compatible models - fix issue when calling openai-compatible models w/ custom_llm_provider set - [PR](https://github.com/BerriAI/litellm/pull/9355)
- VertexAI - Embedding outputDimensionality support - [PR](https://github.com/BerriAI/litellm/commit/437dbe724620675295f298164a076cbd8019d304)
- Anthropic - return consistent json response format on streaming/non-streaming - [PR](https://github.com/BerriAI/litellm/pull/9437)
## Spend Tracking Improvements
- `litellm_proxy/` - support reading litellm response cost header from proxy, when using client sdk
- Reset Budget Job - fix budget reset error on keys/teams/users [PR](https://github.com/BerriAI/litellm/pull/9329)
- Streaming - Prevents final chunk w/ usage from being ignored (impacted bedrock streaming + cost tracking) [PR](https://github.com/BerriAI/litellm/pull/9314)
## UI
1. Users Page
- Feature: Control default internal user settings [PR](https://github.com/BerriAI/litellm/pull/9328)
2. Icons:
- Feature: Replace external "artificialanalysis.ai" icons by local svg [PR](https://github.com/BerriAI/litellm/pull/9374)
3. Sign In/Sign Out
- Fix: Default login when `default_user_id` user does not exist in DB [PR](https://github.com/BerriAI/litellm/pull/9395)
## Logging Integrations
- Support post-call guardrails for streaming responses [Get Started](../../docs/proxy/guardrails/custom_guardrail#1-write-a-customguardrail-class)
- Arize [Get Started](../../docs/observability/arize_integration)
- fix invalid package import [PR](https://github.com/BerriAI/litellm/pull/9338)
- migrate to using standardloggingpayload for metadata, ensures spans land successfully [PR](https://github.com/BerriAI/litellm/pull/9338)
- fix logging to just log the LLM I/O [PR](https://github.com/BerriAI/litellm/pull/9353)
- Dynamic API Key/Space param support [Get Started](../../docs/observability/arize_integration#pass-arize-spacekey-per-request)
- StandardLoggingPayload - Log litellm_model_name in payload. Allows knowing which model name was sent to the API provider [Get Started](../../docs/proxy/logging_spec#standardlogginghiddenparams)
- Prompt Management - Allow building custom prompt management integration [Get Started](../../docs/proxy/custom_prompt_management.md)
## Performance / Reliability improvements
- Redis Caching - add 5s default timeout, prevents hanging redis connection from impacting llm calls [PR](https://github.com/BerriAI/litellm/commit/db92956ae33ed4c4e3233d7e1b0c7229817159bf)
- Allow disabling all spend updates / writes to DB - patch to allow disabling all spend updates to DB with a flag [PR](https://github.com/BerriAI/litellm/pull/9331)
- Azure OpenAI - correctly re-use azure openai client, fixes perf issue from previous Stable release [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413)
- Azure OpenAI - uses litellm.ssl_verify on Azure/OpenAI clients [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413)
- Usage-based routing - Wildcard model support [Get Started](../../docs/proxy/usage_based_routing#wildcard-model-support)
- Usage-based routing - Support batch writing increments to redis - reduces latency to same as simple-shuffle [PR](https://github.com/BerriAI/litellm/pull/9357)
- Router - show reason for model cooldown on no healthy deployments available error [PR](https://github.com/BerriAI/litellm/pull/9438)
- Caching - add max value limit to an item in in-memory cache (1MB) - prevents OOM errors on large image urls being sent through proxy [PR](https://github.com/BerriAI/litellm/pull/9448)
## General Improvements
- Passthrough Endpoints - support returning api-base on pass-through endpoints Response Headers [Docs](../../docs/proxy/response_headers#litellm-specific-headers)
- SSL - support reading ssl security level from env var - Allows user to specify lower security settings [Get Started](../../docs/guides/security_settings)
- Credentials - only poll Credentials table when `STORE_MODEL_IN_DB` is True [PR](https://github.com/BerriAI/litellm/pull/9376)
- Image URL Handling - new architecture doc on image url handling [Docs](../../docs/proxy/image_handling)
- OpenAI - bump to pip install "openai==1.68.2" [PR](https://github.com/BerriAI/litellm/commit/e85e3bc52a9de86ad85c3dbb12d87664ee567a5a)
- Gunicorn - security fix - bump gunicorn==23.0.0 [PR](https://github.com/BerriAI/litellm/commit/7e9fc92f5c7fea1e7294171cd3859d55384166eb)
## Complete Git Diff
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.63.11-stable...v1.63.14.rc)

View file

@ -243,7 +243,9 @@ const sidebars = {
"exception_mapping", "exception_mapping",
"completion/provider_specific_params", "completion/provider_specific_params",
"guides/finetuned_models", "guides/finetuned_models",
"guides/security_settings",
"completion/audio", "completion/audio",
"completion/web_search",
"completion/document_understanding", "completion/document_understanding",
"completion/vision", "completion/vision",
"completion/json_mode", "completion/json_mode",

View file

@ -122,6 +122,9 @@ langsmith_batch_size: Optional[int] = None
prometheus_initialize_budget_metrics: Optional[bool] = False prometheus_initialize_budget_metrics: Optional[bool] = False
argilla_batch_size: Optional[int] = None argilla_batch_size: Optional[int] = None
datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
gcs_pub_sub_use_v1: Optional[bool] = (
False # if you want to use v1 gcs pubsub logged payload
)
argilla_transformation_object: Optional[Dict[str, Any]] = None argilla_transformation_object: Optional[Dict[str, Any]] = None
_async_input_callback: List[Union[str, Callable, CustomLogger]] = ( _async_input_callback: List[Union[str, Callable, CustomLogger]] = (
[] []
@ -756,6 +759,7 @@ from .utils import (
create_pretrained_tokenizer, create_pretrained_tokenizer,
create_tokenizer, create_tokenizer,
supports_function_calling, supports_function_calling,
supports_web_search,
supports_response_schema, supports_response_schema,
supports_parallel_function_calling, supports_parallel_function_calling,
supports_vision, supports_vision,

View file

@ -88,16 +88,16 @@ class Cache:
s3_aws_session_token: Optional[str] = None, s3_aws_session_token: Optional[str] = None,
s3_config: Optional[Any] = None, s3_config: Optional[Any] = None,
s3_path: Optional[str] = None, s3_path: Optional[str] = None,
redis_semantic_cache_use_async=False, redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
redis_semantic_cache_embedding_model="text-embedding-ada-002", redis_semantic_cache_index_name: Optional[str] = None,
redis_flush_size: Optional[int] = None, redis_flush_size: Optional[int] = None,
redis_startup_nodes: Optional[List] = None, redis_startup_nodes: Optional[List] = None,
disk_cache_dir=None, disk_cache_dir: Optional[str] = None,
qdrant_api_base: Optional[str] = None, qdrant_api_base: Optional[str] = None,
qdrant_api_key: Optional[str] = None, qdrant_api_key: Optional[str] = None,
qdrant_collection_name: Optional[str] = None, qdrant_collection_name: Optional[str] = None,
qdrant_quantization_config: Optional[str] = None, qdrant_quantization_config: Optional[str] = None,
qdrant_semantic_cache_embedding_model="text-embedding-ada-002", qdrant_semantic_cache_embedding_model: str = "text-embedding-ada-002",
**kwargs, **kwargs,
): ):
""" """
@ -170,8 +170,8 @@ class Cache:
port=port, port=port,
password=password, password=password,
similarity_threshold=similarity_threshold, similarity_threshold=similarity_threshold,
use_async=redis_semantic_cache_use_async,
embedding_model=redis_semantic_cache_embedding_model, embedding_model=redis_semantic_cache_embedding_model,
index_name=redis_semantic_cache_index_name,
**kwargs, **kwargs,
) )
elif type == LiteLLMCacheType.QDRANT_SEMANTIC: elif type == LiteLLMCacheType.QDRANT_SEMANTIC:
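For reference, a minimal usage sketch of the updated semantic-cache parameters (the Redis connection values are placeholders; the import path is an assumption and may differ across LiteLLM versions):

```python
import litellm
from litellm import Cache  # assumed top-level export

litellm.cache = Cache(
    type="redis-semantic",
    host="localhost",          # placeholder Redis connection details
    port="6379",
    password="my-password",
    similarity_threshold=0.8,  # 1.0 = exact matches only, 0.0 = accept anything
    redis_semantic_cache_embedding_model="text-embedding-ada-002",
    redis_semantic_cache_index_name="litellm_semantic_cache_index",
)
```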

View file

@ -1,337 +1,437 @@
""" """
Redis Semantic Cache implementation Redis Semantic Cache implementation for LiteLLM
Has 4 methods: The RedisSemanticCache provides semantic caching functionality using Redis as a backend.
- set_cache This cache stores responses based on the semantic similarity of prompts rather than
- get_cache exact matching, allowing for more flexible caching of LLM responses.
- async_set_cache
- async_get_cache This implementation uses RedisVL's SemanticCache to find semantically similar prompts
and their cached responses.
""" """
import ast import ast
import asyncio import asyncio
import json import json
from typing import Any import os
from typing import Any, Dict, List, Optional, Tuple
import litellm import litellm
from litellm._logging import print_verbose from litellm._logging import print_verbose
from litellm.litellm_core_utils.prompt_templates.common_utils import get_str_from_messages
from .base_cache import BaseCache from .base_cache import BaseCache
class RedisSemanticCache(BaseCache): class RedisSemanticCache(BaseCache):
"""
Redis-backed semantic cache for LLM responses.
This cache uses vector similarity to find semantically similar prompts that have been
previously sent to the LLM, allowing for cache hits even when prompts are not identical
but carry similar meaning.
"""
DEFAULT_REDIS_INDEX_NAME: str = "litellm_semantic_cache_index"
def __init__( def __init__(
self, self,
host=None, host: Optional[str] = None,
port=None, port: Optional[str] = None,
password=None, password: Optional[str] = None,
redis_url=None, redis_url: Optional[str] = None,
similarity_threshold=None, similarity_threshold: Optional[float] = None,
use_async=False, embedding_model: str = "text-embedding-ada-002",
embedding_model="text-embedding-ada-002", index_name: Optional[str] = None,
**kwargs, **kwargs,
): ):
from redisvl.index import SearchIndex
print_verbose(
"redis semantic-cache initializing INDEX - litellm_semantic_cache_index"
)
if similarity_threshold is None:
raise Exception("similarity_threshold must be provided, passed None")
self.similarity_threshold = similarity_threshold
self.embedding_model = embedding_model
schema = {
"index": {
"name": "litellm_semantic_cache_index",
"prefix": "litellm",
"storage_type": "hash",
},
"fields": {
"text": [{"name": "response"}],
"vector": [
{
"name": "litellm_embedding",
"dims": 1536,
"distance_metric": "cosine",
"algorithm": "flat",
"datatype": "float32",
}
],
},
}
if redis_url is None:
# if no url passed, check if host, port and password are passed, if not raise an Exception
if host is None or port is None or password is None:
# try checking env for host, port and password
import os
host = os.getenv("REDIS_HOST")
port = os.getenv("REDIS_PORT")
password = os.getenv("REDIS_PASSWORD")
if host is None or port is None or password is None:
raise Exception("Redis host, port, and password must be provided")
redis_url = "redis://:" + password + "@" + host + ":" + port
print_verbose(f"redis semantic-cache redis_url: {redis_url}")
if use_async is False:
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url)
try:
self.index.create(overwrite=False) # don't overwrite existing index
except Exception as e:
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
elif use_async is True:
schema["index"]["name"] = "litellm_semantic_cache_index_async"
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url, use_async=True)
(The SearchIndex / VectorQuery based implementation above is removed; RedisSemanticCache is rewritten on top of redisvl's SemanticCache. The remainder of the new __init__ and the rewritten methods follow.)

        """
        Initialize the Redis Semantic Cache.

        Args:
            host: Redis host address
            port: Redis port
            password: Redis password
            redis_url: Full Redis URL (alternative to separate host/port/password)
            similarity_threshold: Threshold for semantic similarity (0.0 to 1.0)
                where 1.0 requires exact matches and 0.0 accepts any match
            embedding_model: Model to use for generating embeddings
            index_name: Name for the Redis index
            ttl: Default time-to-live for cache entries in seconds
            **kwargs: Additional arguments passed to the Redis client

        Raises:
            Exception: If similarity_threshold is not provided or required Redis
                connection information is missing
        """
        from redisvl.extensions.llmcache import SemanticCache
        from redisvl.utils.vectorize import CustomTextVectorizer

        if index_name is None:
            index_name = self.DEFAULT_REDIS_INDEX_NAME

        print_verbose(f"Redis semantic-cache initializing index - {index_name}")

        # Validate similarity threshold
        if similarity_threshold is None:
            raise ValueError("similarity_threshold must be provided, passed None")

        # Store configuration
        self.similarity_threshold = similarity_threshold

        # Convert similarity threshold [0,1] to distance threshold [0,2]
        # For cosine distance: 0 = most similar, 2 = least similar
        # While similarity: 1 = most similar, 0 = least similar
        self.distance_threshold = 1 - similarity_threshold
        self.embedding_model = embedding_model

        # Set up Redis connection
        if redis_url is None:
            try:
                # Attempt to use provided parameters or fallback to environment variables
                host = host or os.environ['REDIS_HOST']
                port = port or os.environ['REDIS_PORT']
                password = password or os.environ['REDIS_PASSWORD']
            except KeyError as e:
                # Raise a more informative exception if any of the required keys are missing
                missing_var = e.args[0]
                raise ValueError(f"Missing required Redis configuration: {missing_var}. "
                                 f"Provide {missing_var} or redis_url.") from e

            redis_url = f"redis://:{password}@{host}:{port}"

        print_verbose(f"Redis semantic-cache redis_url: {redis_url}")

        # Initialize the Redis vectorizer and cache
        cache_vectorizer = CustomTextVectorizer(self._get_embedding)

        self.llmcache = SemanticCache(
            name=index_name,
            redis_url=redis_url,
            vectorizer=cache_vectorizer,
            distance_threshold=self.distance_threshold,
            overwrite=False,
        )

    def _get_ttl(self, **kwargs) -> Optional[int]:
        """
        Get the TTL (time-to-live) value for cache entries.

        Args:
            **kwargs: Keyword arguments that may contain a custom TTL

        Returns:
            Optional[int]: The TTL value in seconds, or None if no TTL should be applied
        """
        ttl = kwargs.get("ttl")
        if ttl is not None:
            ttl = int(ttl)
        return ttl

    def _get_embedding(self, prompt: str) -> List[float]:
        """
        Generate an embedding vector for the given prompt using the configured embedding model.

        Args:
            prompt: The text to generate an embedding for

        Returns:
            List[float]: The embedding vector
        """
        # Create an embedding from prompt
        embedding_response = litellm.embedding(
            model=self.embedding_model,
            input=prompt,
            cache={"no-store": True, "no-cache": True},
        )
        embedding = embedding_response["data"][0]["embedding"]
        return embedding

    def _get_cache_logic(self, cached_response: Any) -> Any:
        """
        Process the cached response to prepare it for use.

        Args:
            cached_response: The raw cached response

        Returns:
            The processed cache response, or None if input was None
        """
        if cached_response is None:
            return cached_response

        # Convert bytes to string if needed
        if isinstance(cached_response, bytes):
            cached_response = cached_response.decode("utf-8")

        # Convert string representation to Python object
        try:
            cached_response = json.loads(cached_response)
        except json.JSONDecodeError:
            try:
                cached_response = ast.literal_eval(cached_response)
            except (ValueError, SyntaxError) as e:
                print_verbose(f"Error parsing cached response: {str(e)}")
                return None
        return cached_response

    def set_cache(self, key: str, value: Any, **kwargs) -> None:
        """
        Store a value in the semantic cache.

        Args:
            key: The cache key (not directly used in semantic caching)
            value: The response value to cache
            **kwargs: Additional arguments including 'messages' for the prompt
                and optional 'ttl' for time-to-live
        """
        print_verbose(f"Redis semantic-cache set_cache, kwargs: {kwargs}")

        try:
            # Extract the prompt from messages
            messages = kwargs.get("messages", [])
            if not messages:
                print_verbose("No messages provided for semantic caching")
                return

            prompt = get_str_from_messages(messages)
            value_str = str(value)

            # Get TTL and store in Redis semantic cache
            ttl = self._get_ttl(**kwargs)
            if ttl is not None:
                self.llmcache.store(prompt, value_str, ttl=int(ttl))
            else:
                self.llmcache.store(prompt, value_str)
        except Exception as e:
            print_verbose(f"Error setting {value_str} in the Redis semantic cache: {str(e)}")

    def get_cache(self, key: str, **kwargs) -> Any:
        """
        Retrieve a semantically similar cached response.

        Args:
            key: The cache key (not directly used in semantic caching)
            **kwargs: Additional arguments including 'messages' for the prompt

        Returns:
            The cached response if a semantically similar prompt is found, else None
        """
        print_verbose(f"Redis semantic-cache get_cache, kwargs: {kwargs}")

        try:
            # Extract the prompt from messages
            messages = kwargs.get("messages", [])
            if not messages:
                print_verbose("No messages provided for semantic cache lookup")
                return None

            prompt = get_str_from_messages(messages)
            # Check the cache for semantically similar prompts
            results = self.llmcache.check(prompt=prompt)

            # Return None if no similar prompts found
            if not results:
                return None

            # Process the best matching result
            cache_hit = results[0]
            vector_distance = float(cache_hit["vector_distance"])

            # Convert vector distance back to similarity score
            # For cosine distance: 0 = most similar, 2 = least similar
            # While similarity: 1 = most similar, 0 = least similar
            similarity = 1 - vector_distance

            cached_prompt = cache_hit["prompt"]
            cached_response = cache_hit["response"]

            print_verbose(
                f"Cache hit: similarity threshold: {self.similarity_threshold}, "
                f"actual similarity: {similarity}, "
                f"current prompt: {prompt}, "
                f"cached prompt: {cached_prompt}"
            )

            return self._get_cache_logic(cached_response=cached_response)
        except Exception as e:
            print_verbose(f"Error retrieving from Redis semantic cache: {str(e)}")

    async def _get_async_embedding(self, prompt: str, **kwargs) -> List[float]:
        """
        Asynchronously generate an embedding for the given prompt.

        Args:
            prompt: The text to generate an embedding for
            **kwargs: Additional arguments that may contain metadata

        Returns:
            List[float]: The embedding vector
        """
        from litellm.proxy.proxy_server import llm_model_list, llm_router

        # Route the embedding request through the proxy if appropriate
        router_model_names = (
            [m["model_name"] for m in llm_model_list]
            if llm_model_list is not None
            else []
        )

        try:
            if llm_router is not None and self.embedding_model in router_model_names:
                # Use the router for embedding generation
                user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
                embedding_response = await llm_router.aembedding(
                    model=self.embedding_model,
                    input=prompt,
                    cache={"no-store": True, "no-cache": True},
                    metadata={
                        "user_api_key": user_api_key,
                        "semantic-cache-embedding": True,
                        "trace_id": kwargs.get("metadata", {}).get("trace_id", None),
                    },
                )
            else:
                # Generate embedding directly
                embedding_response = await litellm.aembedding(
                    model=self.embedding_model,
                    input=prompt,
                    cache={"no-store": True, "no-cache": True},
                )

            # Extract and return the embedding vector
            return embedding_response["data"][0]["embedding"]
        except Exception as e:
            print_verbose(f"Error generating async embedding: {str(e)}")
            raise ValueError(f"Failed to generate embedding: {str(e)}") from e

    async def async_set_cache(self, key: str, value: Any, **kwargs) -> None:
        """
        Asynchronously store a value in the semantic cache.

        Args:
            key: The cache key (not directly used in semantic caching)
            value: The response value to cache
            **kwargs: Additional arguments including 'messages' for the prompt
                and optional 'ttl' for time-to-live
        """
        print_verbose(f"Async Redis semantic-cache set_cache, kwargs: {kwargs}")

        try:
            # Extract the prompt from messages
            messages = kwargs.get("messages", [])
            if not messages:
                print_verbose("No messages provided for semantic caching")
                return

            prompt = get_str_from_messages(messages)
            value_str = str(value)

            # Generate embedding for the value (response) to cache
            prompt_embedding = await self._get_async_embedding(prompt, **kwargs)

            # Get TTL and store in Redis semantic cache
            ttl = self._get_ttl(**kwargs)
            if ttl is not None:
                await self.llmcache.astore(
                    prompt,
                    value_str,
                    vector=prompt_embedding,  # Pass through custom embedding
                    ttl=ttl
                )
            else:
                await self.llmcache.astore(
                    prompt,
                    value_str,
                    vector=prompt_embedding  # Pass through custom embedding
                )
        except Exception as e:
            print_verbose(f"Error in async_set_cache: {str(e)}")

    async def async_get_cache(self, key: str, **kwargs) -> Any:
        """
        Asynchronously retrieve a semantically similar cached response.

        Args:
            key: The cache key (not directly used in semantic caching)
            **kwargs: Additional arguments including 'messages' for the prompt

        Returns:
            The cached response if a semantically similar prompt is found, else None
        """
        print_verbose(f"Async Redis semantic-cache get_cache, kwargs: {kwargs}")

        try:
            # Extract the prompt from messages
            messages = kwargs.get("messages", [])
            if not messages:
                print_verbose("No messages provided for semantic cache lookup")
                kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
                return None

            prompt = get_str_from_messages(messages)

            # Generate embedding for the prompt
            prompt_embedding = await self._get_async_embedding(prompt, **kwargs)

            # Check the cache for semantically similar prompts
            results = await self.llmcache.acheck(
                prompt=prompt,
                vector=prompt_embedding
            )

            # handle results / cache hit
            if not results:
                kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
                return None

            cache_hit = results[0]
            vector_distance = float(cache_hit["vector_distance"])

            # Convert vector distance back to similarity
            # For cosine distance: 0 = most similar, 2 = least similar
            # While similarity: 1 = most similar, 0 = least similar
            similarity = 1 - vector_distance

            cached_prompt = cache_hit["prompt"]
            cached_response = cache_hit["response"]

            # update kwargs["metadata"] with similarity, don't rewrite the original metadata
            kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity

            print_verbose(
                f"Cache hit: similarity threshold: {self.similarity_threshold}, "
                f"actual similarity: {similarity}, "
                f"current prompt: {prompt}, "
                f"cached prompt: {cached_prompt}"
            )

            return self._get_cache_logic(cached_response=cached_response)
        except Exception as e:
            print_verbose(f"Error in async_get_cache: {str(e)}")
            kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0

    async def _index_info(self) -> Dict[str, Any]:
        """
        Get information about the Redis index.

        Returns:
            Dict[str, Any]: Information about the Redis index
        """
        aindex = await self.llmcache._get_async_index()
        return await aindex.info()

    async def async_set_cache_pipeline(self, cache_list: List[Tuple[str, Any]], **kwargs) -> None:
        """
        Asynchronously store multiple values in the semantic cache.

        Args:
            cache_list: List of (key, value) tuples to cache
            **kwargs: Additional arguments
        """
        try:
            tasks = []
            for val in cache_list:
                tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
            await asyncio.gather(*tasks)
        except Exception as e:
            print_verbose(f"Error in async_set_cache_pipeline: {str(e)}")

View file

@ -9,6 +9,9 @@ from pydantic import BaseModel
import litellm import litellm
import litellm._logging import litellm._logging
from litellm import verbose_logger from litellm import verbose_logger
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
from litellm.llms.anthropic.cost_calculation import ( from litellm.llms.anthropic.cost_calculation import (
cost_per_token as anthropic_cost_per_token, cost_per_token as anthropic_cost_per_token,
@ -57,6 +60,7 @@ from litellm.types.utils import (
LlmProvidersSet, LlmProvidersSet,
ModelInfo, ModelInfo,
PassthroughCallTypes, PassthroughCallTypes,
StandardBuiltInToolsParams,
Usage, Usage,
) )
from litellm.utils import ( from litellm.utils import (
@ -524,6 +528,7 @@ def completion_cost( # noqa: PLR0915
optional_params: Optional[dict] = None, optional_params: Optional[dict] = None,
custom_pricing: Optional[bool] = None, custom_pricing: Optional[bool] = None,
base_model: Optional[str] = None, base_model: Optional[str] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float: ) -> float:
""" """
Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, any litellm supported llm.
@ -802,6 +807,12 @@ def completion_cost( # noqa: PLR0915
rerank_billed_units=rerank_billed_units, rerank_billed_units=rerank_billed_units,
) )
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
_final_cost += StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=completion_response,
standard_built_in_tools_params=standard_built_in_tools_params,
custom_llm_provider=custom_llm_provider,
)
return _final_cost return _final_cost
except Exception as e: except Exception as e:
@ -861,6 +872,7 @@ def response_cost_calculator(
base_model: Optional[str] = None, base_model: Optional[str] = None,
custom_pricing: Optional[bool] = None, custom_pricing: Optional[bool] = None,
prompt: str = "", prompt: str = "",
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float: ) -> float:
""" """
Returns Returns
@ -890,6 +902,7 @@ def response_cost_calculator(
custom_pricing=custom_pricing, custom_pricing=custom_pricing,
base_model=base_model, base_model=base_model,
prompt=prompt, prompt=prompt,
standard_built_in_tools_params=standard_built_in_tools_params,
) )
return response_cost return response_cost
except Exception as e: except Exception as e:

View file

@ -10,13 +10,16 @@ import asyncio
import json import json
import os import os
import traceback import traceback
from typing import TYPE_CHECKING, Any, Dict, List, Optional from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from litellm.types.utils import StandardLoggingPayload
if TYPE_CHECKING: if TYPE_CHECKING:
from litellm.proxy._types import SpendLogsPayload from litellm.proxy._types import SpendLogsPayload
else: else:
SpendLogsPayload = Any SpendLogsPayload = Any
import litellm
from litellm._logging import verbose_logger from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.llms.custom_httpx.http_handler import ( from litellm.llms.custom_httpx.http_handler import (
@ -61,7 +64,7 @@ class GcsPubSubLogger(CustomBatchLogger):
self.flush_lock = asyncio.Lock() self.flush_lock = asyncio.Lock()
super().__init__(**kwargs, flush_lock=self.flush_lock) super().__init__(**kwargs, flush_lock=self.flush_lock)
asyncio.create_task(self.periodic_flush()) asyncio.create_task(self.periodic_flush())
self.log_queue: List[SpendLogsPayload] = [] self.log_queue: List[Union[SpendLogsPayload, StandardLoggingPayload]] = []
async def construct_request_headers(self) -> Dict[str, str]: async def construct_request_headers(self) -> Dict[str, str]:
"""Construct authorization headers using Vertex AI auth""" """Construct authorization headers using Vertex AI auth"""
@ -115,13 +118,20 @@ class GcsPubSubLogger(CustomBatchLogger):
verbose_logger.debug( verbose_logger.debug(
"PubSub: Logging - Enters logging function for model %s", kwargs "PubSub: Logging - Enters logging function for model %s", kwargs
) )
        standard_logging_payload = kwargs.get("standard_logging_object", None)

        # Backwards compatibility with old logging payload
        if litellm.gcs_pub_sub_use_v1 is True:
            spend_logs_payload = get_logging_payload(
                kwargs=kwargs,
                response_obj=response_obj,
                start_time=start_time,
                end_time=end_time,
            )
            self.log_queue.append(spend_logs_payload)
        else:
            # New logging payload, StandardLoggingPayload
            self.log_queue.append(standard_logging_payload)
if len(self.log_queue) >= self.batch_size: if len(self.log_queue) >= self.batch_size:
await self.async_send_batch() await self.async_send_batch()
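For deployments whose Pub/Sub consumers still expect the old SpendLogs schema, the legacy path can be kept by setting the module-level flag referenced above (a sketch; the flag name is taken from this diff):

import litellm

# Keep publishing the legacy SpendLogsPayload instead of StandardLoggingPayload
litellm.gcs_pub_sub_use_v1 = True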
@ -155,7 +165,7 @@ class GcsPubSubLogger(CustomBatchLogger):
self.log_queue.clear() self.log_queue.clear()
async def publish_message( async def publish_message(
self, message: SpendLogsPayload self, message: Union[SpendLogsPayload, StandardLoggingPayload]
) -> Optional[Dict[str, Any]]: ) -> Optional[Dict[str, Any]]:
""" """
Publish message to Google Cloud Pub/Sub using REST API Publish message to Google Cloud Pub/Sub using REST API

View file

@ -35,6 +35,9 @@ from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.mlflow import MlflowLogger from litellm.integrations.mlflow import MlflowLogger
from litellm.integrations.pagerduty.pagerduty import PagerDutyAlerting from litellm.integrations.pagerduty.pagerduty import PagerDutyAlerting
from litellm.litellm_core_utils.get_litellm_params import get_litellm_params from litellm.litellm_core_utils.get_litellm_params import get_litellm_params
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.litellm_core_utils.redact_messages import ( from litellm.litellm_core_utils.redact_messages import (
redact_message_input_output_from_custom_logger, redact_message_input_output_from_custom_logger,
@ -60,6 +63,7 @@ from litellm.types.utils import (
ModelResponse, ModelResponse,
ModelResponseStream, ModelResponseStream,
RawRequestTypedDict, RawRequestTypedDict,
StandardBuiltInToolsParams,
StandardCallbackDynamicParams, StandardCallbackDynamicParams,
StandardLoggingAdditionalHeaders, StandardLoggingAdditionalHeaders,
StandardLoggingHiddenParams, StandardLoggingHiddenParams,
@ -264,7 +268,9 @@ class Logging(LiteLLMLoggingBaseClass):
self.standard_callback_dynamic_params: StandardCallbackDynamicParams = ( self.standard_callback_dynamic_params: StandardCallbackDynamicParams = (
self.initialize_standard_callback_dynamic_params(kwargs) self.initialize_standard_callback_dynamic_params(kwargs)
) )
self.standard_built_in_tools_params: StandardBuiltInToolsParams = (
self.initialize_standard_built_in_tools_params(kwargs)
)
## TIME TO FIRST TOKEN LOGGING ## ## TIME TO FIRST TOKEN LOGGING ##
self.completion_start_time: Optional[datetime.datetime] = None self.completion_start_time: Optional[datetime.datetime] = None
self._llm_caching_handler: Optional[LLMCachingHandler] = None self._llm_caching_handler: Optional[LLMCachingHandler] = None
@ -369,6 +375,23 @@ class Logging(LiteLLMLoggingBaseClass):
""" """
return _initialize_standard_callback_dynamic_params(kwargs) return _initialize_standard_callback_dynamic_params(kwargs)
def initialize_standard_built_in_tools_params(
self, kwargs: Optional[Dict] = None
) -> StandardBuiltInToolsParams:
"""
Initialize the standard built-in tools params from the kwargs
checks if web_search_options in kwargs or tools and sets the corresponding attribute in StandardBuiltInToolsParams
"""
return StandardBuiltInToolsParams(
web_search_options=StandardBuiltInToolCostTracking._get_web_search_options(
kwargs or {}
),
file_search=StandardBuiltInToolCostTracking._get_file_search_tool_call(
kwargs or {}
),
)
def update_environment_variables( def update_environment_variables(
self, self,
litellm_params: Dict, litellm_params: Dict,
@ -495,6 +518,16 @@ class Logging(LiteLLMLoggingBaseClass):
} }
return data return data
def _get_masked_api_base(self, api_base: str) -> str:
if "key=" in api_base:
# Find the position of "key=" in the string
key_index = api_base.find("key=") + 4
# Mask the last 5 characters after "key="
masked_api_base = api_base[:key_index] + "*" * 5 + api_base[-4:]
else:
masked_api_base = api_base
return str(masked_api_base)
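A quick worked example of the masking above, using a hypothetical URL and key:

# api_base = "https://generativelanguage.googleapis.com/v1beta?key=abcd1234"
# "key=" is present, so the result is api_base[:key_index] + "*****" + api_base[-4:]
#   -> "https://generativelanguage.googleapis.com/v1beta?key=*****1234"
# URLs without "key=" are returned unchanged.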
def _pre_call(self, input, api_key, model=None, additional_args={}): def _pre_call(self, input, api_key, model=None, additional_args={}):
""" """
Common helper function across the sync + async pre-call function Common helper function across the sync + async pre-call function
@ -508,6 +541,9 @@ class Logging(LiteLLMLoggingBaseClass):
model model
): # if model name was changes pre-call, overwrite the initial model call name with the new one ): # if model name was changes pre-call, overwrite the initial model call name with the new one
self.model_call_details["model"] = model self.model_call_details["model"] = model
self.model_call_details["litellm_params"]["api_base"] = (
self._get_masked_api_base(additional_args.get("api_base", ""))
)
def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915 def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915
@ -691,15 +727,6 @@ class Logging(LiteLLMLoggingBaseClass):
headers = {} headers = {}
data = additional_args.get("complete_input_dict", {}) data = additional_args.get("complete_input_dict", {})
api_base = str(additional_args.get("api_base", "")) api_base = str(additional_args.get("api_base", ""))
if "key=" in api_base:
# Find the position of "key=" in the string
key_index = api_base.find("key=") + 4
# Mask the last 5 characters after "key="
masked_api_base = api_base[:key_index] + "*" * 5 + api_base[-4:]
else:
masked_api_base = api_base
self.model_call_details["litellm_params"]["api_base"] = masked_api_base
curl_command = self._get_request_curl_command( curl_command = self._get_request_curl_command(
api_base=api_base, api_base=api_base,
headers=headers, headers=headers,
@ -714,11 +741,12 @@ class Logging(LiteLLMLoggingBaseClass):
def _get_request_curl_command( def _get_request_curl_command(
self, api_base: str, headers: Optional[dict], additional_args: dict, data: dict self, api_base: str, headers: Optional[dict], additional_args: dict, data: dict
) -> str: ) -> str:
masked_api_base = self._get_masked_api_base(api_base)
if headers is None: if headers is None:
headers = {} headers = {}
curl_command = "\n\nPOST Request Sent from LiteLLM:\n" curl_command = "\n\nPOST Request Sent from LiteLLM:\n"
curl_command += "curl -X POST \\\n" curl_command += "curl -X POST \\\n"
curl_command += f"{api_base} \\\n" curl_command += f"{masked_api_base} \\\n"
masked_headers = self._get_masked_headers(headers) masked_headers = self._get_masked_headers(headers)
formatted_headers = " ".join( formatted_headers = " ".join(
[f"-H '{k}: {v}'" for k, v in masked_headers.items()] [f"-H '{k}: {v}'" for k, v in masked_headers.items()]
@ -903,6 +931,7 @@ class Logging(LiteLLMLoggingBaseClass):
"optional_params": self.optional_params, "optional_params": self.optional_params,
"custom_pricing": custom_pricing, "custom_pricing": custom_pricing,
"prompt": prompt, "prompt": prompt,
"standard_built_in_tools_params": self.standard_built_in_tools_params,
} }
except Exception as e: # error creating kwargs for cost calculation except Exception as e: # error creating kwargs for cost calculation
debug_info = StandardLoggingModelCostFailureDebugInformation( debug_info = StandardLoggingModelCostFailureDebugInformation(
@ -1067,6 +1096,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time, end_time=end_time,
logging_obj=self, logging_obj=self,
status="success", status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
) )
elif isinstance(result, dict): # pass-through endpoints elif isinstance(result, dict): # pass-through endpoints
@ -1079,6 +1109,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time, end_time=end_time,
logging_obj=self, logging_obj=self,
status="success", status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
) )
elif standard_logging_object is not None: elif standard_logging_object is not None:
@ -1102,6 +1133,7 @@ class Logging(LiteLLMLoggingBaseClass):
prompt="", prompt="",
completion=getattr(result, "content", ""), completion=getattr(result, "content", ""),
total_time=float_diff, total_time=float_diff,
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
return start_time, end_time, result return start_time, end_time, result
@ -1155,6 +1187,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time, end_time=end_time,
logging_obj=self, logging_obj=self,
status="success", status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
) )
callbacks = self.get_combined_callback_list( callbacks = self.get_combined_callback_list(
@ -1695,6 +1728,7 @@ class Logging(LiteLLMLoggingBaseClass):
end_time=end_time, end_time=end_time,
logging_obj=self, logging_obj=self,
status="success", status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
) )
callbacks = self.get_combined_callback_list( callbacks = self.get_combined_callback_list(
@ -1911,6 +1945,7 @@ class Logging(LiteLLMLoggingBaseClass):
status="failure", status="failure",
error_str=str(exception), error_str=str(exception),
original_exception=exception, original_exception=exception,
standard_built_in_tools_params=self.standard_built_in_tools_params,
) )
) )
return start_time, end_time return start_time, end_time
@ -3367,6 +3402,7 @@ def get_standard_logging_object_payload(
status: StandardLoggingPayloadStatus, status: StandardLoggingPayloadStatus,
error_str: Optional[str] = None, error_str: Optional[str] = None,
original_exception: Optional[Exception] = None, original_exception: Optional[Exception] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> Optional[StandardLoggingPayload]: ) -> Optional[StandardLoggingPayload]:
try: try:
kwargs = kwargs or {} kwargs = kwargs or {}
@ -3542,6 +3578,7 @@ def get_standard_logging_object_payload(
guardrail_information=metadata.get( guardrail_information=metadata.get(
"standard_logging_guardrail_information", None "standard_logging_guardrail_information", None
), ),
standard_built_in_tools_params=standard_built_in_tools_params,
) )
emit_standard_logging_payload(payload) emit_standard_logging_payload(payload)

View file

@ -0,0 +1,199 @@
"""
Helper utilities for tracking the cost of built-in tools.
"""
from typing import Any, Dict, List, Optional
import litellm
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
from litellm.types.utils import (
ModelInfo,
ModelResponse,
SearchContextCostPerQuery,
StandardBuiltInToolsParams,
)
class StandardBuiltInToolCostTracking:
"""
Helper class for tracking the cost of built-in tools
Example: Web Search
"""
@staticmethod
def get_cost_for_built_in_tools(
model: str,
response_object: Any,
custom_llm_provider: Optional[str] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float:
"""
Get the cost of using built-in tools.
Supported tools:
- Web Search
"""
if standard_built_in_tools_params is not None:
if (
standard_built_in_tools_params.get("web_search_options", None)
is not None
):
model_info = StandardBuiltInToolCostTracking._safe_get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
return StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=standard_built_in_tools_params.get(
"web_search_options", None
),
model_info=model_info,
)
if standard_built_in_tools_params.get("file_search", None) is not None:
return StandardBuiltInToolCostTracking.get_cost_for_file_search(
file_search=standard_built_in_tools_params.get("file_search", None),
)
if isinstance(response_object, ModelResponse):
if StandardBuiltInToolCostTracking.chat_completion_response_includes_annotations(
response_object
):
model_info = StandardBuiltInToolCostTracking._safe_get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
return StandardBuiltInToolCostTracking.get_default_cost_for_web_search(
model_info
)
return 0.0
@staticmethod
def _safe_get_model_info(
model: str, custom_llm_provider: Optional[str] = None
) -> Optional[ModelInfo]:
try:
return litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
except Exception:
return None
@staticmethod
def get_cost_for_web_search(
web_search_options: Optional[WebSearchOptions] = None,
model_info: Optional[ModelInfo] = None,
) -> float:
"""
If request includes `web_search_options`, calculate the cost of the web search.
"""
if web_search_options is None:
return 0.0
if model_info is None:
return 0.0
search_context_pricing: SearchContextCostPerQuery = (
model_info.get("search_context_cost_per_query", {}) or {}
)
if web_search_options.get("search_context_size", None) == "low":
return search_context_pricing.get("search_context_size_low", 0.0)
elif web_search_options.get("search_context_size", None) == "medium":
return search_context_pricing.get("search_context_size_medium", 0.0)
elif web_search_options.get("search_context_size", None) == "high":
return search_context_pricing.get("search_context_size_high", 0.0)
return StandardBuiltInToolCostTracking.get_default_cost_for_web_search(
model_info
)
@staticmethod
def get_default_cost_for_web_search(
model_info: Optional[ModelInfo] = None,
) -> float:
"""
If no web search options are provided, use the `search_context_size_medium` pricing.
https://platform.openai.com/docs/pricing#web-search
"""
if model_info is None:
return 0.0
search_context_pricing: SearchContextCostPerQuery = (
model_info.get("search_context_cost_per_query", {}) or {}
) or {}
return search_context_pricing.get("search_context_size_medium", 0.0)
@staticmethod
def get_cost_for_file_search(
file_search: Optional[FileSearchTool] = None,
) -> float:
""" "
Charged at $2.50/1k calls
Doc: https://platform.openai.com/docs/pricing#built-in-tools
"""
if file_search is None:
return 0.0
return 2.5 / 1000
@staticmethod
def chat_completion_response_includes_annotations(
response_object: ModelResponse,
) -> bool:
for _choice in response_object.choices:
message = getattr(_choice, "message", None)
if (
message is not None
and hasattr(message, "annotations")
and message.annotations is not None
and len(message.annotations) > 0
):
return True
return False
@staticmethod
def _get_web_search_options(kwargs: Dict) -> Optional[WebSearchOptions]:
if "web_search_options" in kwargs:
return WebSearchOptions(**kwargs.get("web_search_options", {}))
tools = StandardBuiltInToolCostTracking._get_tools_from_kwargs(
kwargs, "web_search_preview"
)
if tools:
# Look for web search tool in the tools array
for tool in tools:
if isinstance(tool, dict):
if StandardBuiltInToolCostTracking._is_web_search_tool_call(tool):
return WebSearchOptions(**tool)
return None
@staticmethod
def _get_tools_from_kwargs(kwargs: Dict, tool_type: str) -> Optional[List[Dict]]:
if "tools" in kwargs:
tools = kwargs.get("tools", [])
return tools
return None
@staticmethod
def _get_file_search_tool_call(kwargs: Dict) -> Optional[FileSearchTool]:
tools = StandardBuiltInToolCostTracking._get_tools_from_kwargs(
kwargs, "file_search"
)
if tools:
for tool in tools:
if isinstance(tool, dict):
if StandardBuiltInToolCostTracking._is_file_search_tool_call(tool):
return FileSearchTool(**tool)
return None
@staticmethod
def _is_web_search_tool_call(tool: Dict) -> bool:
if tool.get("type", None) == "web_search_preview":
return True
if "search_context_size" in tool:
return True
return False
@staticmethod
def _is_file_search_tool_call(tool: Dict) -> bool:
if tool.get("type", None) == "file_search":
return True
return False
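A small sketch of how this helper is driven (the model name is an example; the returned value comes from the model's `search_context_cost_per_query` entry in the cost map, or 0.0 if none exists):

from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
    StandardBuiltInToolCostTracking,
)

# Web search: cost is looked up per search_context_size tier
web_search_cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
    model="gpt-4o-search-preview",  # example model
    response_object=None,
    standard_built_in_tools_params={
        "web_search_options": {"search_context_size": "medium"},
    },
)

# File search is flat-rate: $2.50 per 1k calls, i.e. 2.5 / 1000 = $0.0025 per call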

View file

@ -138,13 +138,22 @@ class ModelParamHelper:
TranscriptionCreateParamsNonStreaming, TranscriptionCreateParamsNonStreaming,
TranscriptionCreateParamsStreaming, TranscriptionCreateParamsStreaming,
) )
            non_streaming_kwargs = set(
                getattr(
                    TranscriptionCreateParamsNonStreaming, "__annotations__", {}
                ).keys()
            )
            streaming_kwargs = set(
                getattr(
                    TranscriptionCreateParamsStreaming, "__annotations__", {}
                ).keys()
            )

            all_transcription_kwargs = non_streaming_kwargs.union(streaming_kwargs)
            return all_transcription_kwargs
        except Exception as e:
            verbose_logger.debug("Error getting transcription kwargs %s", str(e))
            return set()
@staticmethod @staticmethod

View file

@ -5304,6 +5304,17 @@
"mode": "embedding", "mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models" "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
}, },
"text-embedding-large-exp-03-07": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"output_vector_size": 3072,
"input_cost_per_character": 0.000000025,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"textembedding-gecko": { "textembedding-gecko": {
"max_tokens": 3072, "max_tokens": 3072,
"max_input_tokens": 3072, "max_input_tokens": 3072,

View file

@ -5,7 +5,10 @@ model_list:
api_key: os.environ/AZURE_API_KEY api_key: os.environ/AZURE_API_KEY
api_base: http://0.0.0.0:8090 api_base: http://0.0.0.0:8090
rpm: 3 rpm: 3
- model_name: "gpt-4o-mini-openai"
litellm_params:
model: gpt-4o-mini
api_key: os.environ/OPENAI_API_KEY
litellm_settings: litellm_settings:
num_retries: 0 num_retries: 0

View file

@ -542,13 +542,10 @@ async def vertex_proxy_route(
user_api_key_dict, user_api_key_dict,
stream=is_streaming_request, # type: ignore stream=is_streaming_request, # type: ignore
) )
    except ProxyException as e:
        if headers_passed_through:
            e.message = f"No credentials found on proxy for project_name={vertex_project} + location={vertex_location}, check `/model/info` for allowed project + region combinations with `use_in_pass_through: true`. Headers were passed through directly but request failed with error: {e.message}"
        raise e
return received_value return received_value

View file

@ -1788,9 +1788,6 @@ class ProxyConfig:
reset_color_code, reset_color_code,
cache_password, cache_password,
) )
if cache_type == "redis-semantic":
# by default this should always be async
cache_params.update({"redis_semantic_cache_use_async": True})
# users can pass os.environ/ variables on the proxy - we should read them from the env # users can pass os.environ/ variables on the proxy - we should read them from the env
for key, value in cache_params.items(): for key, value in cache_params.items():
@ -6181,18 +6178,18 @@ async def model_info_v1( # noqa: PLR0915
) )
if len(all_models_str) > 0: if len(all_models_str) > 0:
model_names = all_models_str _relevant_models = []
llm_model_list = llm_router.get_model_list() for model in all_models_str:
router_models = llm_router.get_model_list(model_name=model)
if router_models is not None:
_relevant_models.extend(router_models)
if llm_model_list is not None: if llm_model_list is not None:
_relevant_models = [
m for m in llm_model_list if m["model_name"] in model_names
]
all_models = copy.deepcopy(_relevant_models) # type: ignore all_models = copy.deepcopy(_relevant_models) # type: ignore
else: else:
all_models = [] all_models = []
for model in all_models: for in_place_model in all_models:
model = _get_proxy_model_info(model=model) in_place_model = _get_proxy_model_info(model=in_place_model)
verbose_proxy_logger.debug("all_models: %s", all_models) verbose_proxy_logger.debug("all_models: %s", all_models)
return {"data": all_models} return {"data": all_models}

View file

@ -4924,6 +4924,11 @@ class Router:
and model_info["supports_function_calling"] is True # type: ignore and model_info["supports_function_calling"] is True # type: ignore
): ):
model_group_info.supports_function_calling = True model_group_info.supports_function_calling = True
if (
model_info.get("supports_web_search", None) is not None
and model_info["supports_web_search"] is True # type: ignore
):
model_group_info.supports_web_search = True
if ( if (
model_info.get("supported_openai_params", None) is not None model_info.get("supported_openai_params", None) is not None
and model_info["supported_openai_params"] is not None and model_info["supported_openai_params"] is not None
@ -5286,10 +5291,11 @@ class Router:
if len(returned_models) == 0: # check if wildcard route if len(returned_models) == 0: # check if wildcard route
potential_wildcard_models = self.pattern_router.route(model_name) potential_wildcard_models = self.pattern_router.route(model_name)
            if model_name is not None and potential_wildcard_models is not None:
                for m in potential_wildcard_models:
                    deployment_typed_dict = DeploymentTypedDict(**m)  # type: ignore
                    deployment_typed_dict["model_name"] = model_name
                    returned_models.append(deployment_typed_dict)
if model_name is None: if model_name is None:
returned_models += self.model_list returned_models += self.model_list

View file

@ -382,6 +382,53 @@ class ChatCompletionThinkingBlock(TypedDict, total=False):
cache_control: Optional[Union[dict, ChatCompletionCachedContent]] cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class WebSearchOptionsUserLocationApproximate(TypedDict, total=False):
city: str
"""Free text input for the city of the user, e.g. `San Francisco`."""
country: str
"""
The two-letter [ISO country code](https://en.wikipedia.org/wiki/ISO_3166-1) of
the user, e.g. `US`.
"""
region: str
"""Free text input for the region of the user, e.g. `California`."""
timezone: str
"""
The [IANA timezone](https://timeapi.io/documentation/iana-timezones) of the
user, e.g. `America/Los_Angeles`.
"""
class WebSearchOptionsUserLocation(TypedDict, total=False):
approximate: Required[WebSearchOptionsUserLocationApproximate]
"""Approximate location parameters for the search."""
type: Required[Literal["approximate"]]
"""The type of location approximation. Always `approximate`."""
class WebSearchOptions(TypedDict, total=False):
search_context_size: Literal["low", "medium", "high"]
"""
High level guidance for the amount of context window space to use for the
search. One of `low`, `medium`, or `high`. `medium` is the default.
"""
user_location: Optional[WebSearchOptionsUserLocation]
"""Approximate location parameters for the search."""
class FileSearchTool(TypedDict, total=False):
type: Literal["file_search"]
"""The type of tool being defined: `file_search`"""
vector_store_ids: Optional[List[str]]
"""The IDs of the vector stores to search."""
class ChatCompletionAnnotationURLCitation(TypedDict, total=False): class ChatCompletionAnnotationURLCitation(TypedDict, total=False):
end_index: int end_index: int
"""The index of the last character of the URL citation in the message.""" """The index of the last character of the URL citation in the message."""

View file

@ -559,6 +559,7 @@ class ModelGroupInfo(BaseModel):
rpm: Optional[int] = None rpm: Optional[int] = None
supports_parallel_function_calling: bool = Field(default=False) supports_parallel_function_calling: bool = Field(default=False)
supports_vision: bool = Field(default=False) supports_vision: bool = Field(default=False)
supports_web_search: bool = Field(default=False)
supports_function_calling: bool = Field(default=False) supports_function_calling: bool = Field(default=False)
supported_openai_params: Optional[List[str]] = Field(default=[]) supported_openai_params: Optional[List[str]] = Field(default=[])
configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None

View file

@ -32,7 +32,9 @@ from .llms.openai import (
ChatCompletionThinkingBlock, ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk, ChatCompletionToolCallChunk,
ChatCompletionUsageBlock, ChatCompletionUsageBlock,
FileSearchTool,
OpenAIChatCompletionChunk, OpenAIChatCompletionChunk,
WebSearchOptions,
) )
from .rerank import RerankResponse from .rerank import RerankResponse
@ -97,6 +99,13 @@ class ProviderSpecificModelInfo(TypedDict, total=False):
supports_pdf_input: Optional[bool] supports_pdf_input: Optional[bool]
supports_native_streaming: Optional[bool] supports_native_streaming: Optional[bool]
supports_parallel_function_calling: Optional[bool] supports_parallel_function_calling: Optional[bool]
supports_web_search: Optional[bool]
class SearchContextCostPerQuery(TypedDict, total=False):
search_context_size_low: float
search_context_size_medium: float
search_context_size_high: float
class ModelInfoBase(ProviderSpecificModelInfo, total=False): class ModelInfoBase(ProviderSpecificModelInfo, total=False):
@ -135,6 +144,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
output_cost_per_video_per_second: Optional[float] # only for vertex ai models output_cost_per_video_per_second: Optional[float] # only for vertex ai models
output_cost_per_audio_per_second: Optional[float] # only for vertex ai models output_cost_per_audio_per_second: Optional[float] # only for vertex ai models
output_cost_per_second: Optional[float] # for OpenAI Speech models output_cost_per_second: Optional[float] # for OpenAI Speech models
search_context_cost_per_query: Optional[
SearchContextCostPerQuery
] # Cost for using web search tool
litellm_provider: Required[str] litellm_provider: Required[str]
mode: Required[ mode: Required[
@ -586,6 +598,11 @@ class Message(OpenAIObject):
# OpenAI compatible APIs like mistral API will raise an error if audio is passed in # OpenAI compatible APIs like mistral API will raise an error if audio is passed in
del self.audio del self.audio
if annotations is None:
# ensure default response matches OpenAI spec
# Some OpenAI compatible APIs raise an error if annotations are passed in
del self.annotations
if reasoning_content is None: if reasoning_content is None:
# ensure default response matches OpenAI spec # ensure default response matches OpenAI spec
del self.reasoning_content del self.reasoning_content
@ -1612,6 +1629,19 @@ class StandardLoggingUserAPIKeyMetadata(TypedDict):
user_api_key_end_user_id: Optional[str] user_api_key_end_user_id: Optional[str]
class StandardBuiltInToolsParams(TypedDict, total=False):
"""
    Standard built-in OpenAI tools parameters.

    This is used to calculate the cost of built-in tools; insert any standard built-in tools parameters here.

    OpenAI charges users based on the `web_search_options` parameter.
"""
web_search_options: Optional[WebSearchOptions]
file_search: Optional[FileSearchTool]
class StandardLoggingPromptManagementMetadata(TypedDict): class StandardLoggingPromptManagementMetadata(TypedDict):
prompt_id: str prompt_id: str
prompt_variables: Optional[dict] prompt_variables: Optional[dict]
@ -1729,6 +1759,7 @@ class StandardLoggingPayload(TypedDict):
model_parameters: dict model_parameters: dict
hidden_params: StandardLoggingHiddenParams hidden_params: StandardLoggingHiddenParams
guardrail_information: Optional[StandardLoggingGuardrailInformation] guardrail_information: Optional[StandardLoggingGuardrailInformation]
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams]
from typing import AsyncIterator, Iterator from typing import AsyncIterator, Iterator

View file

@ -1975,7 +1975,7 @@ def supports_system_messages(model: str, custom_llm_provider: Optional[str]) ->
) )
def supports_web_search(model: str, custom_llm_provider: Optional[str]) -> bool: def supports_web_search(model: str, custom_llm_provider: Optional[str] = None) -> bool:
""" """
Check if the given model supports web search and return a boolean value. Check if the given model supports web search and return a boolean value.
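With the new default, the provider argument can now be omitted; a minimal sketch (the import path and model name are assumptions for illustration):

from litellm.utils import supports_web_search

# custom_llm_provider now defaults to None
has_web_search = supports_web_search("gpt-4o-search-preview")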
@ -4544,6 +4544,10 @@ def _get_model_info_helper( # noqa: PLR0915
supports_native_streaming=_model_info.get( supports_native_streaming=_model_info.get(
"supports_native_streaming", None "supports_native_streaming", None
), ),
supports_web_search=_model_info.get("supports_web_search", False),
search_context_cost_per_query=_model_info.get(
"search_context_cost_per_query", None
),
tpm=_model_info.get("tpm", None), tpm=_model_info.get("tpm", None),
rpm=_model_info.get("rpm", None), rpm=_model_info.get("rpm", None),
) )
@ -4612,6 +4616,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
supports_audio_input: Optional[bool] supports_audio_input: Optional[bool]
supports_audio_output: Optional[bool] supports_audio_output: Optional[bool]
supports_pdf_input: Optional[bool] supports_pdf_input: Optional[bool]
supports_web_search: Optional[bool]
Raises: Raises:
Exception: If the model is not mapped yet. Exception: If the model is not mapped yet.

View file

@ -5304,6 +5304,17 @@
"mode": "embedding", "mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models" "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
}, },
"text-embedding-large-exp-03-07": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"output_vector_size": 3072,
"input_cost_per_character": 0.000000025,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"textembedding-gecko": { "textembedding-gecko": {
"max_tokens": 3072, "max_tokens": 3072,
"max_input_tokens": 3072, "max_input_tokens": 3072,

78
poetry.lock generated
View file

@ -810,15 +810,15 @@ test = ["pytest (>=6)"]
[[package]] [[package]]
name = "fastapi" name = "fastapi"
version = "0.115.11" version = "0.115.12"
description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
optional = true optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
markers = "extra == \"proxy\"" markers = "extra == \"proxy\""
files = [ files = [
{file = "fastapi-0.115.11-py3-none-any.whl", hash = "sha256:32e1541b7b74602e4ef4a0260ecaf3aadf9d4f19590bba3e1bf2ac4666aa2c64"}, {file = "fastapi-0.115.12-py3-none-any.whl", hash = "sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d"},
{file = "fastapi-0.115.11.tar.gz", hash = "sha256:cc81f03f688678b92600a65a5e618b93592c65005db37157147204d8924bf94f"}, {file = "fastapi-0.115.12.tar.gz", hash = "sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681"},
] ]
[package.dependencies] [package.dependencies]
@ -1445,14 +1445,14 @@ type = ["pytest-mypy"]
[[package]] [[package]]
name = "iniconfig" name = "iniconfig"
version = "2.0.0" version = "2.1.0"
description = "brain-dead simple config-ini parsing" description = "brain-dead simple config-ini parsing"
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.8"
groups = ["dev"] groups = ["dev"]
files = [ files = [
{file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"},
{file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"},
] ]
[[package]] [[package]]
@ -2137,14 +2137,14 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
[[package]] [[package]]
name = "openai" name = "openai"
version = "1.66.3" version = "1.68.2"
description = "The official Python library for the openai API" description = "The official Python library for the openai API"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
files = [ files = [
{file = "openai-1.66.3-py3-none-any.whl", hash = "sha256:a427c920f727711877ab17c11b95f1230b27767ba7a01e5b66102945141ceca9"}, {file = "openai-1.68.2-py3-none-any.whl", hash = "sha256:24484cb5c9a33b58576fdc5acf0e5f92603024a4e39d0b99793dfa1eb14c2b36"},
{file = "openai-1.66.3.tar.gz", hash = "sha256:8dde3aebe2d081258d4159c4cb27bdc13b5bb3f7ea2201d9bd940b9a89faf0c9"}, {file = "openai-1.68.2.tar.gz", hash = "sha256:b720f0a95a1dbe1429c0d9bb62096a0d98057bcda82516f6e8af10284bdd5b19"},
] ]
[package.dependencies] [package.dependencies]
@ -2160,6 +2160,7 @@ typing-extensions = ">=4.11,<5"
[package.extras] [package.extras]
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
realtime = ["websockets (>=13,<15)"] realtime = ["websockets (>=13,<15)"]
voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]
[[package]] [[package]]
name = "orjson" name = "orjson"
@ -2477,24 +2478,24 @@ testing = ["google-api-core (>=1.31.5)"]
[[package]] [[package]]
name = "protobuf" name = "protobuf"
version = "5.29.3" version = "5.29.4"
description = "" description = ""
optional = true optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
markers = "extra == \"extra-proxy\"" markers = "extra == \"extra-proxy\""
files = [ files = [
{file = "protobuf-5.29.3-cp310-abi3-win32.whl", hash = "sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888"}, {file = "protobuf-5.29.4-cp310-abi3-win32.whl", hash = "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7"},
{file = "protobuf-5.29.3-cp310-abi3-win_amd64.whl", hash = "sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a"}, {file = "protobuf-5.29.4-cp310-abi3-win_amd64.whl", hash = "sha256:bcefcdf3976233f8a502d265eb65ea740c989bacc6c30a58290ed0e519eb4b8d"},
{file = "protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a8434404bbf139aa9e1300dbf989667a83d42ddda9153d8ab76e0d5dcaca484e"}, {file = "protobuf-5.29.4-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:307ecba1d852ec237e9ba668e087326a67564ef83e45a0189a772ede9e854dd0"},
{file = "protobuf-5.29.3-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:daaf63f70f25e8689c072cfad4334ca0ac1d1e05a92fc15c54eb9cf23c3efd84"}, {file = "protobuf-5.29.4-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:aec4962f9ea93c431d5714ed1be1c93f13e1a8618e70035ba2b0564d9e633f2e"},
{file = "protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:c027e08a08be10b67c06bf2370b99c811c466398c357e615ca88c91c07f0910f"}, {file = "protobuf-5.29.4-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:d7d3f7d1d5a66ed4942d4fefb12ac4b14a29028b209d4bfb25c68ae172059922"},
{file = "protobuf-5.29.3-cp38-cp38-win32.whl", hash = "sha256:84a57163a0ccef3f96e4b6a20516cedcf5bb3a95a657131c5c3ac62200d23252"}, {file = "protobuf-5.29.4-cp38-cp38-win32.whl", hash = "sha256:1832f0515b62d12d8e6ffc078d7e9eb06969aa6dc13c13e1036e39d73bebc2de"},
{file = "protobuf-5.29.3-cp38-cp38-win_amd64.whl", hash = "sha256:b89c115d877892a512f79a8114564fb435943b59067615894c3b13cd3e1fa107"}, {file = "protobuf-5.29.4-cp38-cp38-win_amd64.whl", hash = "sha256:476cb7b14914c780605a8cf62e38c2a85f8caff2e28a6a0bad827ec7d6c85d68"},
{file = "protobuf-5.29.3-cp39-cp39-win32.whl", hash = "sha256:0eb32bfa5219fc8d4111803e9a690658aa2e6366384fd0851064b963b6d1f2a7"}, {file = "protobuf-5.29.4-cp39-cp39-win32.whl", hash = "sha256:fd32223020cb25a2cc100366f1dedc904e2d71d9322403224cdde5fdced0dabe"},
{file = "protobuf-5.29.3-cp39-cp39-win_amd64.whl", hash = "sha256:6ce8cc3389a20693bfde6c6562e03474c40851b44975c9b2bf6df7d8c4f864da"}, {file = "protobuf-5.29.4-cp39-cp39-win_amd64.whl", hash = "sha256:678974e1e3a9b975b8bc2447fca458db5f93a2fb6b0c8db46b6675b5b5346812"},
{file = "protobuf-5.29.3-py3-none-any.whl", hash = "sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f"}, {file = "protobuf-5.29.4-py3-none-any.whl", hash = "sha256:3fde11b505e1597f71b875ef2fc52062b6a9740e5f7c8997ce878b6009145862"},
{file = "protobuf-5.29.3.tar.gz", hash = "sha256:5da0f41edaf117bde316404bad1a486cb4ededf8e4a54891296f648e8e076620"}, {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"},
] ]
[[package]] [[package]]
@ -2809,6 +2810,25 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras] [package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-asyncio"
version = "0.21.2"
description = "Pytest support for asyncio"
optional = false
python-versions = ">=3.7"
groups = ["dev"]
files = [
{file = "pytest_asyncio-0.21.2-py3-none-any.whl", hash = "sha256:ab664c88bb7998f711d8039cacd4884da6430886ae8bbd4eded552ed2004f16b"},
{file = "pytest_asyncio-0.21.2.tar.gz", hash = "sha256:d67738fc232b94b326b9d060750beb16e0074210b98dd8b58a5239fa2a154f45"},
]
[package.dependencies]
pytest = ">=7.0.0"
[package.extras]
docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"]
testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
[[package]] [[package]]
name = "pytest-mock" name = "pytest-mock"
version = "3.14.0" version = "3.14.0"
@ -3279,15 +3299,15 @@ files = [
[[package]] [[package]]
name = "rq" name = "rq"
version = "2.1.0" version = "2.2.0"
description = "RQ is a simple, lightweight, library for creating background jobs, and processing them." description = "RQ is a simple, lightweight, library for creating background jobs, and processing them."
optional = true optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
markers = "extra == \"proxy\"" markers = "extra == \"proxy\""
files = [ files = [
{file = "rq-2.1.0-py3-none-any.whl", hash = "sha256:3c6892c6ca848e5fb47c1875399a66f13656bf0e123bf725d9aa9a12718e2fdf"}, {file = "rq-2.2.0-py3-none-any.whl", hash = "sha256:dacbfe1ccb79a45c8cd95dec7951620679fa0195570b63da3f9347622d33accc"},
{file = "rq-2.1.0.tar.gz", hash = "sha256:764585b6cab69ef1412f4aee523347e5aa7ece3ca175c118b1d92223dd8c2826"}, {file = "rq-2.2.0.tar.gz", hash = "sha256:b636760f1e4c183022031c142faa0483e687885824e9732ba2953f994104e203"},
] ]
[package.dependencies] [package.dependencies]
@ -3606,15 +3626,15 @@ files = [
[[package]] [[package]]
name = "tzdata" name = "tzdata"
version = "2025.1" version = "2025.2"
description = "Provider of IANA time zone data" description = "Provider of IANA time zone data"
optional = true optional = true
python-versions = ">=2" python-versions = ">=2"
groups = ["main"] groups = ["main"]
markers = "extra == \"proxy\" and platform_system == \"Windows\"" markers = "extra == \"proxy\" and platform_system == \"Windows\""
files = [ files = [
{file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"}, {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"},
{file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"}, {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"},
] ]
[[package]] [[package]]
@ -3985,4 +4005,4 @@ proxy = ["PyJWT", "apscheduler", "backoff", "boto3", "cryptography", "fastapi",
[metadata] [metadata]
lock-version = "2.1" lock-version = "2.1"
python-versions = ">=3.8.1,<4.0, !=3.9.7" python-versions = ">=3.8.1,<4.0, !=3.9.7"
content-hash = "f7c21b3d659e4a15cd46bb42fb905ad039028f4f6b82507fd1278ac05c412569" content-hash = "9c863b11189227a035a9130c8872de44fe7c5e1e32b47569a56af86e3f6570c5"

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "litellm" name = "litellm"
version = "1.63.14" version = "1.64.0"
description = "Library to easily interface with LLM API providers" description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"] authors = ["BerriAI"]
license = "MIT" license = "MIT"
@ -98,13 +98,14 @@ black = "^23.12.0"
mypy = "^1.0" mypy = "^1.0"
pytest = "^7.4.3" pytest = "^7.4.3"
pytest-mock = "^3.12.0" pytest-mock = "^3.12.0"
pytest-asyncio = "^0.21.1"
[build-system] [build-system]
requires = ["poetry-core", "wheel"] requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"
[tool.commitizen] [tool.commitizen]
version = "1.63.14" version = "1.64.0"
version_files = [ version_files = [
"pyproject.toml:^version" "pyproject.toml:^version"
] ]

View file

@ -9,8 +9,8 @@ uvicorn==0.29.0 # server dep
gunicorn==23.0.0 # server dep gunicorn==23.0.0 # server dep
uvloop==0.21.0 # uvicorn dep, gives us much better performance under load uvloop==0.21.0 # uvicorn dep, gives us much better performance under load
boto3==1.34.34 # aws bedrock/sagemaker calls boto3==1.34.34 # aws bedrock/sagemaker calls
redis==5.0.0 # caching redis==5.2.1 # redis caching
numpy==2.1.1 # semantic caching redisvl==0.4.1 # semantic caching
prisma==0.11.0 # for db prisma==0.11.0 # for db
mangum==0.17.0 # for aws lambda functions mangum==0.17.0 # for aws lambda functions
pynacl==1.5.0 # for encrypting keys pynacl==1.5.0 # for encrypting keys
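For reference, these two pins back LiteLLM's Redis caches: redis==5.2.1 for the exact-match cache and redisvl==0.4.1 for the semantic cache. A minimal sketch of wiring them up, mirroring the semantic-cache tests later in this diff (the Cache import path is an assumption):

import os
import litellm
from litellm import completion
from litellm.caching.caching import Cache  # import path is an assumption

# Semantic caching backed by redisvl: a cached answer is reused when a new
# prompt falls within similarity_threshold of a previously cached prompt.
# For exact-match caching on redis, use type="redis" without a threshold.
litellm.cache = Cache(
    type="redis-semantic",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
    similarity_threshold=0.8,
)

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "write a one sentence poem about summer"}],
    caching=True,
)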

View file

@ -1,13 +1,8 @@
import asyncio
import json
import os import os
import sys import sys
import time
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
import httpx
import pytest import pytest
import respx
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
sys.path.insert( sys.path.insert(
@ -18,9 +13,18 @@ from unittest.mock import AsyncMock
from litellm.caching.redis_cache import RedisCache from litellm.caching.redis_cache import RedisCache
@pytest.fixture
def redis_no_ping():
"""Patch RedisCache initialization to prevent async ping tasks from being created"""
with patch('asyncio.get_running_loop') as mock_get_loop:
# Simulate the absence of a running event loop so RedisCache skips scheduling its async ping task
mock_get_loop.side_effect = RuntimeError("No running event loop")
yield
@pytest.mark.parametrize("namespace", [None, "test"]) @pytest.mark.parametrize("namespace", [None, "test"])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_redis_cache_async_increment(namespace, monkeypatch): async def test_redis_cache_async_increment(namespace, monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "https://my-test-host") monkeypatch.setenv("REDIS_HOST", "https://my-test-host")
redis_cache = RedisCache(namespace=namespace) redis_cache = RedisCache(namespace=namespace)
# Create an AsyncMock for the Redis client # Create an AsyncMock for the Redis client
@ -47,10 +51,46 @@ async def test_redis_cache_async_increment(namespace, monkeypatch):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_redis_client_init_with_socket_timeout(monkeypatch): async def test_redis_client_init_with_socket_timeout(monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "my-fake-host") monkeypatch.setenv("REDIS_HOST", "my-fake-host")
redis_cache = RedisCache(socket_timeout=1.0) redis_cache = RedisCache(socket_timeout=1.0)
assert redis_cache.redis_kwargs["socket_timeout"] == 1.0 assert redis_cache.redis_kwargs["socket_timeout"] == 1.0
client = redis_cache.init_async_client() client = redis_cache.init_async_client()
assert client is not None assert client is not None
assert client.connection_pool.connection_kwargs["socket_timeout"] == 1.0 assert client.connection_pool.connection_kwargs["socket_timeout"] == 1.0
@pytest.mark.asyncio
async def test_redis_cache_async_batch_get_cache(monkeypatch, redis_no_ping):
monkeypatch.setenv("REDIS_HOST", "https://my-test-host")
redis_cache = RedisCache()
# Create an AsyncMock for the Redis client
mock_redis_instance = AsyncMock()
# Make sure the mock can be used as an async context manager
mock_redis_instance.__aenter__.return_value = mock_redis_instance
mock_redis_instance.__aexit__.return_value = None
# Set up the return value for mget
mock_redis_instance.mget.return_value = [
b'{"key1": "value1"}',
None,
b'{"key3": "value3"}'
]
test_keys = ["key1", "key2", "key3"]
with patch.object(
redis_cache, "init_async_client", return_value=mock_redis_instance
):
# Call async_batch_get_cache
result = await redis_cache.async_batch_get_cache(key_list=test_keys)
# Verify mget was called with the correct keys
mock_redis_instance.mget.assert_called_once()
# Check that results were properly decoded
assert result["key1"] == {"key1": "value1"}
assert result["key2"] is None
assert result["key3"] == {"key3": "value3"}

View file

@ -0,0 +1,130 @@
import os
import sys
from unittest.mock import MagicMock, patch, AsyncMock
import pytest
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
# Tests for RedisSemanticCache
def test_redis_semantic_cache_initialization(monkeypatch):
# Mock the redisvl import
semantic_cache_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=MagicMock())
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize the cache with a similarity threshold
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Verify the semantic cache was initialized with correct parameters
assert redis_semantic_cache.similarity_threshold == 0.8
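# Note: the cache stores a vector-distance threshold derived as 1 - similarity_threshold,
# so a similarity of 0.8 corresponds to the distance of 0.2 asserted below
# (cf. the "vector_distance" comments in the mocked results further down).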
# Use pytest.approx for floating point comparison to handle precision issues
assert redis_semantic_cache.distance_threshold == pytest.approx(0.2, abs=1e-10)
assert redis_semantic_cache.embedding_model == "text-embedding-ada-002"
# Test initialization with missing similarity_threshold
with pytest.raises(ValueError, match="similarity_threshold must be provided"):
RedisSemanticCache()
def test_redis_semantic_cache_get_cache(monkeypatch):
# Mock the redisvl import and embedding function
semantic_cache_mock = MagicMock()
custom_vectorizer_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=custom_vectorizer_mock)
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize cache
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Mock the llmcache.check method to return a result
mock_result = [
{
"prompt": "What is the capital of France?",
"response": '{"content": "Paris is the capital of France."}',
"vector_distance": 0.1 # Distance of 0.1 means similarity of 0.9
}
]
redis_semantic_cache.llmcache.check = MagicMock(return_value=mock_result)
# Mock the embedding function
with patch("litellm.embedding", return_value={"data": [{"embedding": [0.1, 0.2, 0.3]}]}):
# Test get_cache with a message
result = redis_semantic_cache.get_cache(
key="test_key",
messages=[{"content": "What is the capital of France?"}]
)
# Verify result is properly parsed
assert result == {"content": "Paris is the capital of France."}
# Verify llmcache.check was called
redis_semantic_cache.llmcache.check.assert_called_once()
@pytest.mark.asyncio
async def test_redis_semantic_cache_async_get_cache(monkeypatch):
# Mock the redisvl import
semantic_cache_mock = MagicMock()
custom_vectorizer_mock = MagicMock()
with patch.dict("sys.modules", {
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=custom_vectorizer_mock)
}):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
# Set environment variables
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
# Initialize cache
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
# Mock the async methods
mock_result = [
{
"prompt": "What is the capital of France?",
"response": '{"content": "Paris is the capital of France."}',
"vector_distance": 0.1 # Distance of 0.1 means similarity of 0.9
}
]
redis_semantic_cache.llmcache.acheck = AsyncMock(return_value=mock_result)
redis_semantic_cache._get_async_embedding = AsyncMock(return_value=[0.1, 0.2, 0.3])
# Test async_get_cache with a message
result = await redis_semantic_cache.async_get_cache(
key="test_key",
messages=[{"content": "What is the capital of France?"}],
metadata={}
)
# Verify result is properly parsed
assert result == {"content": "Paris is the capital of France."}
# Verify methods were called
redis_semantic_cache._get_async_embedding.assert_called_once()
redis_semantic_cache.llmcache.acheck.assert_called_once()

View file

@ -0,0 +1,113 @@
import json
import os
import sys
import pytest
from fastapi.testclient import TestClient
import litellm
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
from litellm.types.utils import ModelInfo, ModelResponse, StandardBuiltInToolsParams
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
# Test basic web search cost calculations
def test_web_search_cost_low():
web_search_options = WebSearchOptions(search_context_size="low")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost == model_info["search_context_cost_per_query"]["search_context_size_low"]
)
def test_web_search_cost_medium():
web_search_options = WebSearchOptions(search_context_size="medium")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost
== model_info["search_context_cost_per_query"]["search_context_size_medium"]
)
def test_web_search_cost_high():
web_search_options = WebSearchOptions(search_context_size="high")
model_info = litellm.get_model_info("gpt-4o-search-preview")
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=web_search_options, model_info=model_info
)
assert (
cost == model_info["search_context_cost_per_query"]["search_context_size_high"]
)
# Test file search cost calculation
def test_file_search_cost():
file_search = FileSearchTool(type="file_search")
cost = StandardBuiltInToolCostTracking.get_cost_for_file_search(
file_search=file_search
)
assert cost == 0.0025 # $2.50/1000 calls = 0.0025 per call
# Test edge cases
def test_none_inputs():
# Test with None inputs
assert (
StandardBuiltInToolCostTracking.get_cost_for_web_search(
web_search_options=None, model_info=None
)
== 0.0
)
assert (
StandardBuiltInToolCostTracking.get_cost_for_file_search(file_search=None)
== 0.0
)
# Test the main get_cost_for_built_in_tools method
def test_get_cost_for_built_in_tools_web_search():
model = "gpt-4"
standard_built_in_tools_params = StandardBuiltInToolsParams(
web_search_options=WebSearchOptions(search_context_size="medium")
)
cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=None,
standard_built_in_tools_params=standard_built_in_tools_params,
)
assert isinstance(cost, float)
def test_get_cost_for_built_in_tools_file_search():
model = "gpt-4"
standard_built_in_tools_params = StandardBuiltInToolsParams(
file_search=FileSearchTool(type="file_search")
)
cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=None,
standard_built_in_tools_params=standard_built_in_tools_params,
)
assert cost == 0.0025

View file

@ -0,0 +1,34 @@
import json
import os
import sys
from unittest.mock import MagicMock, patch
import pytest
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
import time
from litellm.litellm_core_utils.litellm_logging import Logging as LitellmLogging
@pytest.fixture
def logging_obj():
return LitellmLogging(
model="bedrock/claude-3-5-sonnet-20240620-v1:0",
messages=[{"role": "user", "content": "Hey"}],
stream=True,
call_type="completion",
start_time=time.time(),
litellm_call_id="12345",
function_id="1245",
)
def test_get_masked_api_base(logging_obj):
api_base = "https://api.openai.com/v1"
masked_api_base = logging_obj._get_masked_api_base(api_base)
assert masked_api_base == "https://api.openai.com/v1"
assert type(masked_api_base) == str

View file

@ -1,3 +1,4 @@
import asyncio
import datetime import datetime
import json import json
import os import os
@ -11,7 +12,13 @@ sys.path.insert(
0, os.path.abspath("../../../..") 0, os.path.abspath("../../../..")
) # Adds the parent directory to the system path ) # Adds the parent directory to the system path
from unittest.mock import MagicMock, patch
import litellm
from litellm.proxy._types import SpendLogsPayload
from litellm.proxy.hooks.proxy_track_cost_callback import _ProxyDBLogger
from litellm.proxy.proxy_server import app, prisma_client from litellm.proxy.proxy_server import app, prisma_client
from litellm.router import Router
@pytest.fixture @pytest.fixture
@ -400,3 +407,270 @@ async def test_ui_view_spend_logs_unauthorized(client):
headers={"Authorization": "Bearer invalid-token"}, headers={"Authorization": "Bearer invalid-token"},
) )
assert response.status_code == 401 or response.status_code == 403 assert response.status_code == 401 or response.status_code == 403
class TestSpendLogsPayload:
@pytest.mark.asyncio
async def test_spend_logs_payload_e2e(self):
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(litellm.proxy.proxy_server, "prisma_client"):
response = await litellm.acompletion(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello, world!"}],
mock_response="Hello, world!",
metadata={"user_api_key_end_user_id": "test_user_1"},
)
assert response.choices[0].message.content == "Hello, world!"
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "gpt-4o",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": null}}',
"cache_key": "Cache OFF",
"spend": 0.00022500000000000002,
"total_tokens": 30,
"prompt_tokens": 10,
"completion_tokens": 20,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "",
"model_group": "",
"model_id": "",
"requester_ip_address": None,
"custom_llm_provider": "openai",
"messages": "{}",
"response": "{}",
}
)
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"
def mock_anthropic_response(*args, **kwargs):
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.headers = {"Content-Type": "application/json"}
mock_response.json.return_value = {
"content": [{"text": "Hi! My name is Claude.", "type": "text"}],
"id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
"model": "claude-3-7-sonnet-20250219",
"role": "assistant",
"stop_reason": "end_turn",
"stop_sequence": None,
"type": "message",
"usage": {"input_tokens": 2095, "output_tokens": 503},
}
return mock_response
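# Worked spend check for the mocked usage above (2095 input + 503 output tokens),
# assuming claude-3-7-sonnet's model-map prices of $3 per 1M input tokens and
# $15 per 1M output tokens:
#     2095 * 3e-06 + 503 * 1.5e-05 = 0.006285 + 0.007545 = 0.01383
# which matches the "spend" value asserted in the expected payloads below.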
@pytest.mark.asyncio
async def test_spend_logs_payload_success_log_with_api_base(self):
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
client = AsyncHTTPHandler()
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(
litellm.proxy.proxy_server, "prisma_client"
), patch.object(
client, "post", side_effect=self.mock_anthropic_response
):
response = await litellm.acompletion(
model="claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "Hello, world!"}],
metadata={"user_api_key_end_user_id": "test_user_1"},
client=client,
)
assert response.choices[0].message.content == "Hi! My name is Claude."
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "claude-3-7-sonnet-20250219",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": {"audio_tokens": null, "cached_tokens": 0, "text_tokens": null, "image_tokens": null}, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}',
"cache_key": "Cache OFF",
"spend": 0.01383,
"total_tokens": 2598,
"prompt_tokens": 2095,
"completion_tokens": 503,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "https://api.anthropic.com/v1/messages",
"model_group": "",
"model_id": "",
"requester_ip_address": None,
"custom_llm_provider": "anthropic",
"messages": "{}",
"response": "{}",
}
)
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"
@pytest.mark.asyncio
async def test_spend_logs_payload_success_log_with_router(self):
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
# litellm._turn_on_debug()
client = AsyncHTTPHandler()
router = Router(
model_list=[
{
"model_name": "my-anthropic-model-group",
"litellm_params": {
"model": "claude-3-7-sonnet-20250219",
},
"model_info": {
"id": "my-unique-model-id",
},
}
]
)
with patch.object(
litellm.proxy.proxy_server, "_set_spend_logs_payload"
) as mock_client, patch.object(
litellm.proxy.proxy_server, "prisma_client"
), patch.object(
client, "post", side_effect=self.mock_anthropic_response
):
response = await router.acompletion(
model="my-anthropic-model-group",
messages=[{"role": "user", "content": "Hello, world!"}],
metadata={"user_api_key_end_user_id": "test_user_1"},
client=client,
)
assert response.choices[0].message.content == "Hi! My name is Claude."
await asyncio.sleep(1)
mock_client.assert_called_once()
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
expected_payload = SpendLogsPayload(
**{
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
"call_type": "acompletion",
"api_key": "",
"cache_hit": "None",
"startTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
),
"endTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"completionStartTime": datetime.datetime(
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
),
"model": "claude-3-7-sonnet-20250219",
"user": "",
"team_id": "",
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": {"audio_tokens": null, "cached_tokens": 0, "text_tokens": null, "image_tokens": null}, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}',
"cache_key": "Cache OFF",
"spend": 0.01383,
"total_tokens": 2598,
"prompt_tokens": 2095,
"completion_tokens": 503,
"request_tags": "[]",
"end_user": "test_user_1",
"api_base": "https://api.anthropic.com/v1/messages",
"model_group": "my-anthropic-model-group",
"model_id": "my-unique-model-id",
"requester_ip_address": None,
"custom_llm_provider": "anthropic",
"messages": "{}",
"response": "{}",
}
)
for key, value in expected_payload.items():
if key in [
"request_id",
"startTime",
"endTime",
"completionStartTime",
"endTime",
]:
assert payload[key] is not None
else:
assert (
payload[key] == value
), f"Expected {key} to be {value}, but got {payload[key]}"

View file

@ -477,6 +477,25 @@ def test_supports_function_calling(model, expected_bool):
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@pytest.mark.parametrize(
"model, expected_bool",
[
("gpt-4o-mini-search-preview", True),
("openai/gpt-4o-mini-search-preview", True),
("gpt-4o-search-preview", True),
("openai/gpt-4o-search-preview", True),
("groq/deepseek-r1-distill-llama-70b", False),
("groq/llama-3.3-70b-versatile", False),
("codestral/codestral-latest", False),
],
)
def test_supports_web_search(model, expected_bool):
try:
assert litellm.supports_web_search(model=model) == expected_bool
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_get_max_token_unit_test(): def test_get_max_token_unit_test():
""" """
More complete testing in `test_completion_cost.py` More complete testing in `test_completion_cost.py`

View file

@ -794,7 +794,7 @@ def test_redis_cache_completion():
response3 = completion( response3 = completion(
model="gpt-3.5-turbo", messages=messages, caching=True, temperature=0.5 model="gpt-3.5-turbo", messages=messages, caching=True, temperature=0.5
) )
response4 = completion(model="azure/chatgpt-v-2", messages=messages, caching=True) response4 = completion(model="gpt-4o-mini", messages=messages, caching=True)
print("\nresponse 1", response1) print("\nresponse 1", response1)
print("\nresponse 2", response2) print("\nresponse 2", response2)
@ -1690,20 +1690,12 @@ def test_cache_context_managers():
print("VARS of litellm.cache", vars(litellm.cache)) print("VARS of litellm.cache", vars(litellm.cache))
# test_cache_context_managers()
@pytest.mark.skip(reason="beta test - new redis semantic cache")
def test_redis_semantic_cache_completion(): def test_redis_semantic_cache_completion():
litellm.set_verbose = True litellm.set_verbose = True
import logging import logging
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding /reading from cache
print("testing semantic caching") print("testing semantic caching")
litellm.cache = Cache( litellm.cache = Cache(
type="redis-semantic", type="redis-semantic",
@ -1718,33 +1710,30 @@ def test_redis_semantic_cache_completion():
messages=[ messages=[
{ {
"role": "user", "role": "user",
"content": f"write a one sentence poem about: {random_number}", "content": "write a one sentence poem about summer",
} }
], ],
max_tokens=20, max_tokens=20,
) )
print(f"response1: {response1}") print(f"response1: {response1}")
random_number = random.randint(1, 100000)
response2 = completion( response2 = completion(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=[ messages=[
{ {
"role": "user", "role": "user",
"content": f"write a one sentence poem about: {random_number}", "content": "write a one sentence poem about summertime",
} }
], ],
max_tokens=20, max_tokens=20,
) )
print(f"response2: {response1}") print(f"response2: {response2}")
assert response1.id == response2.id assert response1.id == response2.id
# test_redis_cache_completion() # test_redis_cache_completion()
@pytest.mark.skip(reason="beta test - new redis semantic cache")
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_redis_semantic_cache_acompletion(): async def test_redis_semantic_cache_acompletion():
litellm.set_verbose = True litellm.set_verbose = True
@ -1752,38 +1741,32 @@ async def test_redis_semantic_cache_acompletion():
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding / reading from cache
print("testing semantic caching") print("testing semantic caching")
litellm.cache = Cache( litellm.cache = Cache(
type="redis-semantic", type="redis-semantic",
host=os.environ["REDIS_HOST"], host=os.environ["REDIS_HOST"],
port=os.environ["REDIS_PORT"], port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"], password=os.environ["REDIS_PASSWORD"],
similarity_threshold=0.8, similarity_threshold=0.7,
redis_semantic_cache_use_async=True,
) )
response1 = await litellm.acompletion( response1 = await litellm.acompletion(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=[ messages=[
{ {
"role": "user", "role": "user",
"content": f"write a one sentence poem about: {random_number}", "content": "write a one sentence poem about summer",
} }
], ],
max_tokens=5, max_tokens=5,
) )
print(f"response1: {response1}") print(f"response1: {response1}")
random_number = random.randint(1, 100000)
response2 = await litellm.acompletion( response2 = await litellm.acompletion(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=[ messages=[
{ {
"role": "user", "role": "user",
"content": f"write a one sentence poem about: {random_number}", "content": "write a one sentence poem about summertime",
} }
], ],
max_tokens=5, max_tokens=5,

View file

@ -0,0 +1,175 @@
{
"id": "chatcmpl-2299b6a2-82a3-465a-b47c-04e685a2227f",
"trace_id": null,
"call_type": "acompletion",
"cache_hit": null,
"stream": true,
"status": "success",
"custom_llm_provider": "openai",
"saved_cache_cost": 0.0,
"startTime": "2025-01-24 09:20:46.847371",
"endTime": "2025-01-24 09:20:46.851954",
"completionStartTime": "2025-01-24 09:20:46.851954",
"response_time": 0.007394075393676758,
"model": "gpt-4o",
"metadata": {
"user_api_key_hash": null,
"user_api_key_alias": null,
"user_api_key_team_id": null,
"user_api_key_org_id": null,
"user_api_key_user_id": null,
"user_api_key_team_alias": null,
"user_api_key_user_email": null,
"spend_logs_metadata": null,
"requester_ip_address": null,
"requester_metadata": null,
"user_api_key_end_user_id": null,
"prompt_management_metadata": null,
"applied_guardrails": []
},
"cache_key": null,
"response_cost": 0.00022500000000000002,
"total_tokens": 30,
"prompt_tokens": 10,
"completion_tokens": 20,
"request_tags": [],
"end_user": "",
"api_base": "",
"model_group": "",
"model_id": "",
"requester_ip_address": null,
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"response": {
"id": "chatcmpl-2299b6a2-82a3-465a-b47c-04e685a2227f",
"created": 1742855151,
"model": "gpt-4o",
"object": "chat.completion",
"system_fingerprint": null,
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "hi",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"usage": {
"completion_tokens": 20,
"prompt_tokens": 10,
"total_tokens": 30,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
},
"model_parameters": {},
"hidden_params": {
"model_id": null,
"cache_key": null,
"api_base": "https://api.openai.com",
"response_cost": 0.00022500000000000002,
"additional_headers": {},
"litellm_overhead_time_ms": null,
"batch_models": null,
"litellm_model_name": "gpt-4o"
},
"model_map_information": {
"model_map_key": "gpt-4o",
"model_map_value": {
"key": "gpt-4o",
"max_tokens": 16384,
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"input_cost_per_token": 2.5e-06,
"cache_creation_input_token_cost": null,
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_character": null,
"input_cost_per_token_above_128k_tokens": null,
"input_cost_per_query": null,
"input_cost_per_second": null,
"input_cost_per_audio_token": null,
"input_cost_per_token_batches": 1.25e-06,
"output_cost_per_token_batches": 5e-06,
"output_cost_per_token": 1e-05,
"output_cost_per_audio_token": null,
"output_cost_per_character": null,
"output_cost_per_token_above_128k_tokens": null,
"output_cost_per_character_above_128k_tokens": null,
"output_cost_per_second": null,
"output_cost_per_image": null,
"output_vector_size": null,
"litellm_provider": "openai",
"mode": "chat",
"supports_system_messages": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"supports_assistant_prefill": false,
"supports_prompt_caching": true,
"supports_audio_input": false,
"supports_audio_output": false,
"supports_pdf_input": false,
"supports_embedding_image_input": false,
"supports_native_streaming": null,
"supports_web_search": true,
"search_context_cost_per_query": {
"search_context_size_low": 0.03,
"search_context_size_medium": 0.035,
"search_context_size_high": 0.05
},
"tpm": null,
"rpm": null,
"supported_openai_params": [
"frequency_penalty",
"logit_bias",
"logprobs",
"top_logprobs",
"max_tokens",
"max_completion_tokens",
"modalities",
"prediction",
"n",
"presence_penalty",
"seed",
"stop",
"stream",
"stream_options",
"temperature",
"top_p",
"tools",
"tool_choice",
"function_call",
"functions",
"max_retries",
"extra_headers",
"parallel_tool_calls",
"audio",
"response_format",
"user"
]
}
},
"error_str": null,
"error_information": {
"error_code": "",
"error_class": "",
"llm_provider": "",
"traceback": "",
"error_message": ""
},
"response_cost_failure_debug_info": null,
"guardrail_information": null,
"standard_built_in_tools_params": {
"web_search_options": null,
"file_search": null
}
}

View file

@ -0,0 +1,151 @@
import os
import sys
import traceback
import uuid
import pytest
from dotenv import load_dotenv
from fastapi import Request
from fastapi.routing import APIRoute
load_dotenv()
import io
import os
import time
import json
# this file tests litellm/proxy
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
import asyncio
from typing import Optional
from litellm.types.utils import StandardLoggingPayload, Usage, ModelInfoBase
from litellm.integrations.custom_logger import CustomLogger
class TestCustomLogger(CustomLogger):
def __init__(self):
self.recorded_usage: Optional[Usage] = None
self.standard_logging_payload: Optional[StandardLoggingPayload] = None
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
standard_logging_payload = kwargs.get("standard_logging_object")
self.standard_logging_payload = standard_logging_payload
print(
"standard_logging_payload",
json.dumps(standard_logging_payload, indent=4, default=str),
)
self.recorded_usage = Usage(
prompt_tokens=standard_logging_payload.get("prompt_tokens"),
completion_tokens=standard_logging_payload.get("completion_tokens"),
total_tokens=standard_logging_payload.get("total_tokens"),
)
pass
async def _setup_web_search_test():
"""Helper function to setup common test requirements"""
litellm._turn_on_debug()
test_custom_logger = TestCustomLogger()
litellm.callbacks = [test_custom_logger]
return test_custom_logger
async def _verify_web_search_cost(test_custom_logger, expected_context_size):
"""Helper function to verify web search costs"""
await asyncio.sleep(1)
standard_logging_payload = test_custom_logger.standard_logging_payload
response_cost = standard_logging_payload.get("response_cost")
assert response_cost is not None
# Calculate token cost
model_map_information = standard_logging_payload["model_map_information"]
model_map_value: ModelInfoBase = model_map_information["model_map_value"]
total_token_cost = (
standard_logging_payload["prompt_tokens"]
* model_map_value["input_cost_per_token"]
) + (
standard_logging_payload["completion_tokens"]
* model_map_value["output_cost_per_token"]
)
# Verify total cost
assert (
response_cost
== total_token_cost
+ model_map_value["search_context_cost_per_query"][expected_context_size]
)
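# Worked example of the assertion above, using the gpt-4o model-map values from the
# standard_logging_payload.json fixture in this commit (input 2.5e-06 and output
# 1e-05 per token, medium search context 0.035 per query): a call with 10 prompt
# and 20 completion tokens would cost
#     10 * 2.5e-06 + 20 * 1e-05 + 0.035 = 0.000225 + 0.035 = 0.035225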
@pytest.mark.asyncio
@pytest.mark.parametrize(
"web_search_options,expected_context_size",
[
(None, "search_context_size_medium"),
({"search_context_size": "low"}, "search_context_size_low"),
({"search_context_size": "high"}, "search_context_size_high"),
],
)
async def test_openai_web_search_logging_cost_tracking(
web_search_options, expected_context_size
):
"""Test web search cost tracking with different search context sizes"""
test_custom_logger = await _setup_web_search_test()
request_kwargs = {
"model": "openai/gpt-4o-search-preview",
"messages": [
{"role": "user", "content": "What was a positive news story from today?"}
],
}
if web_search_options is not None:
request_kwargs["web_search_options"] = web_search_options
response = await litellm.acompletion(**request_kwargs)
await _verify_web_search_cost(test_custom_logger, expected_context_size)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"tools_config,expected_context_size,stream",
[
(
[{"type": "web_search_preview", "search_context_size": "low"}],
"search_context_size_low",
True,
),
(
[{"type": "web_search_preview", "search_context_size": "low"}],
"search_context_size_low",
False,
),
([{"type": "web_search_preview"}], "search_context_size_medium", True),
([{"type": "web_search_preview"}], "search_context_size_medium", False),
],
)
async def test_openai_responses_api_web_search_cost_tracking(
tools_config, expected_context_size, stream
):
"""Test web search cost tracking with different search context sizes and streaming options"""
test_custom_logger = await _setup_web_search_test()
response = await litellm.aresponses(
model="openai/gpt-4o",
input=[
{"role": "user", "content": "What was a positive news story from today?"}
],
tools=tools_config,
stream=stream,
)
if stream is True:
async for chunk in response:
print("chunk", chunk)
else:
print("response", response)
await _verify_web_search_cost(test_custom_logger, expected_context_size)

View file

@ -6,6 +6,7 @@ import sys
sys.path.insert(0, os.path.abspath("../..")) sys.path.insert(0, os.path.abspath("../.."))
import asyncio import asyncio
import litellm
import gzip import gzip
import json import json
import logging import logging
@ -48,8 +49,15 @@ def assert_gcs_pubsub_request_matches_expected(
expected_request_body = json.load(f) expected_request_body = json.load(f)
# Replace dynamic values in actual request body # Replace dynamic values in actual request body
time_fields = ["startTime", "endTime", "completionStartTime", "request_id"] dynamic_fields = [
for field in time_fields: "startTime",
"endTime",
"completionStartTime",
"request_id",
"id",
"response_time",
]
for field in dynamic_fields:
if field in actual_request_body: if field in actual_request_body:
actual_request_body[field] = expected_request_body[field] actual_request_body[field] = expected_request_body[field]
@ -59,6 +67,55 @@ def assert_gcs_pubsub_request_matches_expected(
), f"Difference in request bodies: {json.dumps(actual_request_body, indent=2)} != {json.dumps(expected_request_body, indent=2)}" ), f"Difference in request bodies: {json.dumps(actual_request_body, indent=2)} != {json.dumps(expected_request_body, indent=2)}"
def assert_gcs_pubsub_request_matches_expected_standard_logging_payload(
actual_request_body: dict,
expected_file_name: str,
):
"""
Helper function to compare actual GCS PubSub request body with expected JSON file.
Args:
actual_request_body (dict): The actual request body received from the API call
expected_file_name (str): Name of the JSON file containing expected request body
"""
# Get the current directory and read the expected request body
pwd = os.path.dirname(os.path.realpath(__file__))
expected_body_path = os.path.join(pwd, "gcs_pub_sub_body", expected_file_name)
with open(expected_body_path, "r") as f:
expected_request_body = json.load(f)
# Normalize the dynamic response values below, then validate that the expected fields are present
FIELDS_TO_VALIDATE = [
"custom_llm_provider",
"hidden_params",
"messages",
"response",
"model",
"status",
"stream",
]
actual_request_body["response"]["id"] = expected_request_body["response"]["id"]
actual_request_body["response"]["created"] = expected_request_body["response"][
"created"
]
for field in FIELDS_TO_VALIDATE:
assert field in actual_request_body
FIELDS_EXISTENCE_CHECKS = [
"response_cost",
"response_time",
"completion_tokens",
"prompt_tokens",
"total_tokens",
]
for field in FIELDS_EXISTENCE_CHECKS:
assert field in actual_request_body
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_async_gcs_pub_sub(): async def test_async_gcs_pub_sub():
# Create a mock for the async_httpx_client's post method # Create a mock for the async_httpx_client's post method
@ -102,6 +159,61 @@ async def test_async_gcs_pub_sub():
decoded_message = base64.b64decode(encoded_message).decode("utf-8") decoded_message = base64.b64decode(encoded_message).decode("utf-8")
# Parse the JSON string into a dictionary
actual_request = json.loads(decoded_message)
print("##########\n")
print(json.dumps(actual_request, indent=4))
print("##########\n")
# Verify the request body matches expected format
assert_gcs_pubsub_request_matches_expected_standard_logging_payload(
actual_request, "standard_logging_payload.json"
)
@pytest.mark.asyncio
async def test_async_gcs_pub_sub_v1():
# Create a mock for the async_httpx_client's post method
litellm.gcs_pub_sub_use_v1 = True
mock_post = AsyncMock()
mock_post.return_value.status_code = 202
mock_post.return_value.text = "Accepted"
# Initialize the GcsPubSubLogger and set the mock
gcs_pub_sub_logger = GcsPubSubLogger(flush_interval=1)
gcs_pub_sub_logger.async_httpx_client.post = mock_post
mock_construct_request_headers = AsyncMock()
mock_construct_request_headers.return_value = {"Authorization": "Bearer mock_token"}
gcs_pub_sub_logger.construct_request_headers = mock_construct_request_headers
litellm.callbacks = [gcs_pub_sub_logger]
# Make the completion call
response = await litellm.acompletion(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello, world!"}],
mock_response="hi",
)
await asyncio.sleep(3) # Wait for async flush
# Assert httpx post was called
mock_post.assert_called_once()
# Get the actual request body from the mock
actual_url = mock_post.call_args[1]["url"]
print("sent to url", actual_url)
assert (
actual_url
== "https://pubsub.googleapis.com/v1/projects/reliableKeys/topics/litellmDB:publish"
)
actual_request = mock_post.call_args[1]["json"]
# Extract and decode the base64 encoded message
encoded_message = actual_request["messages"][0]["data"]
import base64
decoded_message = base64.b64decode(encoded_message).decode("utf-8")
# Parse the JSON string into a dictionary # Parse the JSON string into a dictionary
actual_request = json.loads(decoded_message) actual_request = json.loads(decoded_message)
print("##########\n") print("##########\n")

View file

@ -21,16 +21,18 @@ sys.path.insert(
import litellm import litellm
import asyncio import asyncio
from typing import Optional from typing import Optional
from litellm.types.utils import StandardLoggingPayload, Usage from litellm.types.utils import StandardLoggingPayload, Usage, ModelInfoBase
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
class TestCustomLogger(CustomLogger): class TestCustomLogger(CustomLogger):
def __init__(self): def __init__(self):
self.recorded_usage: Optional[Usage] = None self.recorded_usage: Optional[Usage] = None
self.standard_logging_payload: Optional[StandardLoggingPayload] = None
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
standard_logging_payload = kwargs.get("standard_logging_object") standard_logging_payload = kwargs.get("standard_logging_object")
self.standard_logging_payload = standard_logging_payload
print( print(
"standard_logging_payload", "standard_logging_payload",
json.dumps(standard_logging_payload, indent=4, default=str), json.dumps(standard_logging_payload, indent=4, default=str),