Merge branch 'main' into litellm_exp_mcp_server
Commit 08a4ba1b7e: 58 changed files with 2991 additions and 627 deletions
Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-24 18:24:20 +00:00)
|
@ -1855,7 +1855,7 @@ jobs:
|
|||
command: |
|
||||
docker run -d \
|
||||
-p 4000:4000 \
|
||||
-e DATABASE_URL=$PROXY_DATABASE_URL \
|
||||
-e DATABASE_URL=$CLEAN_STORE_MODEL_IN_DB_DATABASE_URL \
|
||||
-e STORE_MODEL_IN_DB="True" \
|
||||
-e LITELLM_MASTER_KEY="sk-1234" \
|
||||
-e LITELLM_LICENSE=$LITELLM_LICENSE \
|
||||
|
|
|
@ -4,7 +4,8 @@ python-dotenv
|
|||
tiktoken
|
||||
importlib_metadata
|
||||
cohere
|
||||
redis
|
||||
redis==5.2.1
|
||||
redisvl==0.4.1
|
||||
anthropic
|
||||
orjson==3.9.15
|
||||
pydantic==2.10.2
|
||||
|
|
.gitignore (vendored): 1 line changed
|
@ -1,3 +1,4 @@
|
|||
.python-version
|
||||
.venv
|
||||
.env
|
||||
.newenv
|
||||
|
|
|
@ -37,9 +37,6 @@ RUN pip install dist/*.whl
|
|||
# install dependencies as wheels
|
||||
RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
|
||||
|
||||
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
|
||||
RUN pip install redisvl==0.0.7 --no-deps
|
||||
|
||||
# ensure pyjwt is used, not jwt
|
||||
RUN pip uninstall jwt -y
|
||||
RUN pip uninstall PyJWT -y
|
||||
|
|
|
@ -1,35 +1,5 @@
|
|||
version: "3.11"
|
||||
services:
|
||||
litellm:
|
||||
build:
|
||||
context: .
|
||||
args:
|
||||
target: runtime
|
||||
image: ghcr.io/berriai/litellm:main-stable
|
||||
#########################################
|
||||
## Uncomment these lines to start proxy with a config.yaml file ##
|
||||
# volumes:
|
||||
# - ./config.yaml:/app/config.yaml <<- this is missing in the docker-compose file currently
|
||||
# command:
|
||||
# - "--config=/app/config.yaml"
|
||||
##############################################
|
||||
ports:
|
||||
- "4000:4000" # Map the container port to the host, change the host port if necessary
|
||||
environment:
|
||||
DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
|
||||
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
|
||||
env_file:
|
||||
- .env # Load local .env file
|
||||
depends_on:
|
||||
- db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
|
||||
healthcheck: # Defines the health check configuration for the container
|
||||
test: [ "CMD", "curl", "-f", "http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
|
||||
interval: 30s # Perform health check every 30 seconds
|
||||
timeout: 10s # Health check command times out after 10 seconds
|
||||
retries: 3 # Retry up to 3 times if health check fails
|
||||
start_period: 40s # Wait 40 seconds after container start before beginning health checks
|
||||
|
||||
|
||||
db:
|
||||
image: postgres:16
|
||||
restart: always
|
||||
|
@ -46,25 +16,3 @@ services:
|
|||
interval: 1s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus
|
||||
volumes:
|
||||
- prometheus_data:/prometheus
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
ports:
|
||||
- "9090:9090"
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--storage.tsdb.retention.time=15d'
|
||||
restart: always
|
||||
|
||||
volumes:
|
||||
prometheus_data:
|
||||
driver: local
|
||||
postgres_data:
|
||||
name: litellm_postgres_data # Named volume for Postgres data persistence
|
||||
|
||||
|
||||
# ...rest of your docker-compose config if any
|
||||
|
|
|
@ -59,9 +59,6 @@ COPY --from=builder /wheels/ /wheels/
|
|||
# Install the built wheel using pip; again using a wildcard if it's the only file
|
||||
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
|
||||
|
||||
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
|
||||
RUN pip install redisvl==0.0.7 --no-deps
|
||||
|
||||
# ensure pyjwt is used, not jwt
|
||||
RUN pip uninstall jwt -y
|
||||
RUN pip uninstall PyJWT -y
|
||||
|
|
|
@ -14,7 +14,7 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]
|
|||
|
||||
# Install build dependencies
|
||||
RUN apt-get clean && apt-get update && \
|
||||
apt-get install -y gcc python3-dev && \
|
||||
apt-get install -y gcc g++ python3-dev && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
|
@ -56,10 +56,8 @@ COPY --from=builder /wheels/ /wheels/
|
|||
# Install the built wheel using pip; again using a wildcard if it's the only file
|
||||
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
|
||||
|
||||
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
|
||||
# ensure pyjwt is used, not jwt
|
||||
RUN pip install redisvl==0.0.7 --no-deps --no-cache-dir && \
|
||||
pip uninstall jwt -y && \
|
||||
RUN pip uninstall jwt -y && \
|
||||
pip uninstall PyJWT -y && \
|
||||
pip install PyJWT==2.9.0 --no-cache-dir
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@ Install redis
|
|||
pip install redis
|
||||
```
|
||||
|
||||
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
|
||||
For the hosted version you can setup your own Redis DB here: https://redis.io/try-free/
|
||||
|
||||
```python
|
||||
import litellm
|
||||
|
@ -37,11 +37,11 @@ litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password
|
|||
|
||||
# Make completion calls
|
||||
response1 = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Tell me a joke."}]
|
||||
)
|
||||
response2 = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Tell me a joke."}]
|
||||
)
|
||||
|
||||
|
@ -91,12 +91,12 @@ response2 = completion(
|
|||
|
||||
<TabItem value="redis-sem" label="redis-semantic cache">
|
||||
|
||||
Install redis
|
||||
Install redisvl client
|
||||
```shell
|
||||
pip install redisvl==0.0.7
|
||||
pip install redisvl==0.4.1
|
||||
```
|
||||
|
||||
For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/
|
||||
For the hosted version you can setup your own Redis DB here: https://redis.io/try-free/
|
||||
|
||||
```python
|
||||
import litellm
|
||||
|
@ -114,6 +114,7 @@ litellm.cache = Cache(
|
|||
port=os.environ["REDIS_PORT"],
|
||||
password=os.environ["REDIS_PASSWORD"],
|
||||
similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity
|
||||
ttl=120,
|
||||
redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
|
||||
)
|
||||
response1 = completion(
|
||||
|
@ -471,11 +472,13 @@ def __init__(
|
|||
password: Optional[str] = None,
|
||||
namespace: Optional[str] = None,
|
||||
default_in_redis_ttl: Optional[float] = None,
|
||||
similarity_threshold: Optional[float] = None,
|
||||
redis_semantic_cache_use_async=False,
|
||||
redis_semantic_cache_embedding_model="text-embedding-ada-002",
|
||||
redis_flush_size=None,
|
||||
|
||||
# redis semantic cache params
|
||||
similarity_threshold: Optional[float] = None,
|
||||
redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
|
||||
redis_semantic_cache_index_name: Optional[str] = None,
|
||||
|
||||
# s3 Bucket, boto3 configuration
|
||||
s3_bucket_name: Optional[str] = None,
|
||||
s3_region_name: Optional[str] = None,
|
||||
|
|
|
@ -200,3 +200,92 @@ Expected Response
|
|||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## OpenAI 'file' message type
|
||||
|
||||
This is currently only supported for OpenAI models.
|
||||
|
||||
This will be supported for all providers soon.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import base64
|
||||
from litellm import completion
|
||||
|
||||
with open("draconomicon.pdf", "rb") as f:
|
||||
data = f.read()
|
||||
|
||||
base64_string = base64.b64encode(data).decode("utf-8")
|
||||
|
||||
completion = completion(
|
||||
model="gpt-4o",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"filename": "draconomicon.pdf",
|
||||
"file_data": f"data:application/pdf;base64,{base64_string}",
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What is the first dragon in the book?",
|
||||
}
|
||||
],
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: openai-model
|
||||
litellm_params:
|
||||
model: gpt-4o
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
litellm --config config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "openai-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": [
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"filename": "draconomicon.pdf",
|
||||
"file_data": f"data:application/pdf;base64,{base64_string}",
|
||||
}
|
||||
}
|
||||
]}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
docs/my-website/docs/completion/web_search.md (new file, 308 lines)
|
@ -0,0 +1,308 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Using Web Search
|
||||
|
||||
Use web search with LiteLLM.
|
||||
|
||||
| Feature | Details |
|
||||
|---------|---------|
|
||||
| Supported Endpoints | - `/chat/completions` <br/> - `/responses` |
|
||||
| Supported Providers | `openai` |
|
||||
| LiteLLM Cost Tracking | ✅ Supported |
|
||||
| LiteLLM Version | `v1.63.15-nightly` or higher |
|
||||
|
||||
|
||||
## `/chat/completions` (litellm.completion)
|
||||
|
||||
### Quick Start
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python showLineNumbers
|
||||
from litellm import completion
|
||||
|
||||
response = completion(
|
||||
model="openai/gpt-4o-search-preview",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What was a positive news story from today?",
|
||||
}
|
||||
],
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-4o-search-preview
|
||||
litellm_params:
|
||||
model: openai/gpt-4o-search-preview
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```python showLineNumbers
|
||||
from openai import OpenAI
|
||||
|
||||
# Point to your proxy server
|
||||
client = OpenAI(
|
||||
api_key="sk-1234",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4o-search-preview",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What was a positive news story from today?"
|
||||
}
|
||||
]
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Search context size
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python showLineNumbers
|
||||
from litellm import completion
|
||||
|
||||
# Customize search context size
|
||||
response = completion(
|
||||
model="openai/gpt-4o-search-preview",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What was a positive news story from today?",
|
||||
}
|
||||
],
|
||||
web_search_options={
|
||||
"search_context_size": "low" # Options: "low", "medium" (default), "high"
|
||||
}
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```python showLineNumbers
|
||||
from openai import OpenAI
|
||||
|
||||
# Point to your proxy server
|
||||
client = OpenAI(
|
||||
api_key="sk-1234",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
# Customize search context size
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4o-search-preview",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What was a positive news story from today?"
|
||||
}
|
||||
],
|
||||
web_search_options={
|
||||
"search_context_size": "low" # Options: "low", "medium" (default), "high"
|
||||
}
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## `/responses` (litellm.responses)
|
||||
|
||||
### Quick Start
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python showLineNumbers
|
||||
from litellm import responses
|
||||
|
||||
response = responses(
|
||||
model="openai/gpt-4o",
|
||||
input=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What was a positive news story from today?"
|
||||
}
|
||||
],
|
||||
tools=[{
|
||||
"type": "web_search_preview" # enables web search with default medium context size
|
||||
}]
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-4o
|
||||
litellm_params:
|
||||
model: openai/gpt-4o
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```python showLineNumbers
|
||||
from openai import OpenAI
|
||||
|
||||
# Point to your proxy server
|
||||
client = OpenAI(
|
||||
api_key="sk-1234",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
response = client.responses.create(
|
||||
model="gpt-4o",
|
||||
tools=[{
|
||||
"type": "web_search_preview"
|
||||
}],
|
||||
input="What was a positive news story from today?",
|
||||
)
|
||||
|
||||
print(response.output_text)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Search context size
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python showLineNumbers
|
||||
from litellm import responses
|
||||
|
||||
# Customize search context size
|
||||
response = responses(
|
||||
model="openai/gpt-4o",
|
||||
input=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What was a positive news story from today?"
|
||||
}
|
||||
],
|
||||
tools=[{
|
||||
"type": "web_search_preview",
|
||||
"search_context_size": "low" # Options: "low", "medium" (default), "high"
|
||||
}]
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```python showLineNumbers
|
||||
from openai import OpenAI
|
||||
|
||||
# Point to your proxy server
|
||||
client = OpenAI(
|
||||
api_key="sk-1234",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
# Customize search context size
|
||||
response = client.responses.create(
|
||||
model="gpt-4o",
|
||||
tools=[{
|
||||
"type": "web_search_preview",
|
||||
"search_context_size": "low" # Options: "low", "medium" (default), "high"
|
||||
}],
|
||||
input="What was a positive news story from today?",
|
||||
)
|
||||
|
||||
print(response.output_text)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Checking if a model supports web search
|
||||
|
||||
<Tabs>
|
||||
<TabItem label="SDK" value="sdk">
|
||||
|
||||
Use `litellm.supports_web_search(model="openai/gpt-4o-search-preview")` -> returns `True` if model can perform web searches
|
||||
|
||||
```python showLineNumbers
|
||||
assert litellm.supports_web_search(model="openai/gpt-4o-search-preview") == True
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
<TabItem label="PROXY" value="proxy">
|
||||
|
||||
1. Define OpenAI models in config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-4o-search-preview
|
||||
litellm_params:
|
||||
model: openai/gpt-4o-search-preview
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
model_info:
|
||||
supports_web_search: True
|
||||
```
|
||||
|
||||
2. Run proxy server
|
||||
|
||||
```bash
|
||||
litellm --config config.yaml
|
||||
```
|
||||
|
||||
3. Call `/model_group/info` to check if a model supports web search
|
||||
|
||||
```shell
|
||||
curl -X 'GET' \
|
||||
'http://localhost:4000/model_group/info' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'x-api-key: sk-1234'
|
||||
```
|
||||
|
||||
Expected Response
|
||||
|
||||
```json showLineNumbers
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"model_group": "gpt-4o-search-preview",
|
||||
"providers": ["openai"],
|
||||
"max_tokens": 128000,
|
||||
"supports_web_search": true, # 👈 supports_web_search is true
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
docs/my-website/docs/guides/security_settings.md (new file, 66 lines)
|
@ -0,0 +1,66 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# SSL Security Settings
|
||||
|
||||
If you're in an environment using an older TLS bundle with older encryption settings, follow this guide.
|
||||
|
||||
|
||||
LiteLLM uses HTTPX for network requests, unless otherwise specified.
|
||||
|
||||
1. Disable SSL verification
|
||||
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import litellm
|
||||
litellm.ssl_verify = False
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
ssl_verify: false
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="env_var" label="Environment Variables">
|
||||
|
||||
```bash
|
||||
export SSL_VERIFY="False"
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
2. Lower security settings
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import litellm
|
||||
litellm.ssl_security_level = 1
|
||||
litellm.ssl_certificate = "/path/to/certificate.pem"
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
ssl_security_level: 1
|
||||
ssl_certificate: "/path/to/certificate.pem"
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="env_var" label="Environment Variables">
|
||||
|
||||
```bash
|
||||
export SSL_SECURITY_LEVEL="1"
|
||||
export SSL_CERTIFICATE="/path/to/certificate.pem"
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
|
@ -1,4 +1,7 @@
|
|||
|
||||
import Image from '@theme/IdealImage';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Arize AI
|
||||
|
||||
|
@ -11,6 +14,8 @@ https://github.com/BerriAI/litellm
|
|||
|
||||
:::
|
||||
|
||||
<Image img={require('../../img/arize.png')} />
|
||||
|
||||
|
||||
|
||||
## Pre-Requisites
|
||||
|
@ -24,7 +29,9 @@ You can also use the instrumentor option instead of the callback, which you can
|
|||
```python
|
||||
litellm.callbacks = ["arize"]
|
||||
```
|
||||
|
||||
```python
|
||||
|
||||
import litellm
|
||||
import os
|
||||
|
||||
|
@ -48,7 +55,7 @@ response = litellm.completion(
|
|||
|
||||
### Using with LiteLLM Proxy
|
||||
|
||||
|
||||
1. Setup config.yaml
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-4
|
||||
|
@ -60,13 +67,134 @@ model_list:
|
|||
litellm_settings:
|
||||
callbacks: ["arize"]
|
||||
|
||||
general_settings:
|
||||
master_key: "sk-1234" # can also be set as an environment variable
|
||||
|
||||
environment_variables:
|
||||
ARIZE_SPACE_KEY: "d0*****"
|
||||
ARIZE_API_KEY: "141a****"
|
||||
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint
|
||||
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT
|
||||
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT or Neither (defaults to https://otlp.arize.com/v1 on grpc)
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
litellm --config config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{ "model": "gpt-4", "messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}]}'
|
||||
```
|
||||
|
||||
## Pass Arize Space/Key per-request
|
||||
|
||||
Supported parameters:
|
||||
- `arize_api_key`
|
||||
- `arize_space_key`
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# LLM API Keys
|
||||
os.environ['OPENAI_API_KEY']=""
|
||||
|
||||
# set arize as a callback, litellm will send the data to arize
|
||||
litellm.callbacks = ["arize"]
|
||||
|
||||
# openai call
|
||||
response = litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{"role": "user", "content": "Hi 👋 - i'm openai"}
|
||||
],
|
||||
arize_api_key=os.getenv("ARIZE_SPACE_2_API_KEY"),
|
||||
arize_space_key=os.getenv("ARIZE_SPACE_2_KEY"),
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-4
|
||||
litellm_params:
|
||||
model: openai/fake
|
||||
api_key: fake-key
|
||||
api_base: https://exampleopenaiendpoint-production.up.railway.app/
|
||||
|
||||
litellm_settings:
|
||||
callbacks: ["arize"]
|
||||
|
||||
general_settings:
|
||||
master_key: "sk-1234" # can also be set as an environment variable
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="curl" label="CURL">
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "gpt-4",
|
||||
"messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}],
|
||||
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
|
||||
"arize_space_key": "ARIZE_SPACE_2_KEY"
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="openai_python" label="OpenAI Python">
|
||||
|
||||
```python
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
# request sent to model set on litellm proxy, `litellm --model`
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
],
|
||||
extra_body={
|
||||
"arize_api_key": "ARIZE_SPACE_2_API_KEY",
|
||||
"arize_space_key": "ARIZE_SPACE_2_KEY"
|
||||
}
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Support & Talk to Founders
|
||||
|
||||
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
|
||||
|
|
|
@ -291,14 +291,15 @@ response = completion(
|
|||
)
|
||||
```
|
||||
|
||||
## Azure O1 Models
|
||||
## O-Series Models
|
||||
|
||||
| Model Name | Function Call |
|
||||
|---------------------|----------------------------------------------------|
|
||||
| o1-mini | `response = completion(model="azure/<your deployment name>", messages=messages)` |
|
||||
| o1-preview | `response = completion(model="azure/<your deployment name>", messages=messages)` |
|
||||
Azure OpenAI O-Series models are supported on LiteLLM.
|
||||
|
||||
Set `litellm.enable_preview_features = True` to use Azure O1 Models with streaming support.
|
||||
LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
|
||||
|
||||
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
|
||||
|
||||
**Automatic Routing**
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
@ -306,60 +307,112 @@ Set `litellm.enable_preview_features = True` to use Azure O1 Models with streami
|
|||
```python
|
||||
import litellm
|
||||
|
||||
litellm.enable_preview_features = True # 👈 KEY CHANGE
|
||||
|
||||
response = litellm.completion(
|
||||
model="azure/<your deployment name>",
|
||||
messages=[{"role": "user", "content": "What is the weather like in Boston?"}],
|
||||
stream=True
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="Proxy">
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: o1-mini
|
||||
- model_name: o3-mini
|
||||
litellm_params:
|
||||
model: azure/o1-mini
|
||||
api_base: "os.environ/AZURE_API_BASE"
|
||||
api_key: "os.environ/AZURE_API_KEY"
|
||||
api_version: "os.environ/AZURE_API_VERSION"
|
||||
|
||||
litellm_settings:
|
||||
enable_preview_features: true # 👈 KEY CHANGE
|
||||
model: azure/o3-model
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
**Explicit Routing**
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import litellm
|
||||
|
||||
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: o3-mini
|
||||
litellm_params:
|
||||
model: azure/o_series/my-random-deployment-name
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## Azure Audio Model
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ["AZURE_API_KEY"] = ""
|
||||
os.environ["AZURE_API_BASE"] = ""
|
||||
os.environ["AZURE_API_VERSION"] = ""
|
||||
|
||||
response = completion(
|
||||
model="azure/azure-openai-4o-audio",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I want to try out speech to speech"
|
||||
}
|
||||
],
|
||||
modalities=["text","audio"],
|
||||
audio={"voice": "alloy", "format": "wav"}
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: azure-openai-4o-audio
|
||||
litellm_params:
|
||||
model: azure/azure-openai-4o-audio
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: os.environ/AZURE_API_VERSION
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it
|
||||
3. Test it!
|
||||
|
||||
```python
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(model="o1-mini", messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
],
|
||||
stream=True)
|
||||
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
```bash
|
||||
curl http://localhost:4000/v1/chat/completions \
|
||||
-H "Authorization: Bearer $LITELLM_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "azure-openai-4o-audio",
|
||||
"messages": [{"role": "user", "content": "I want to try out speech to speech"}],
|
||||
"modalities": ["text","audio"],
|
||||
"audio": {"voice": "alloy", "format": "wav"}
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
@ -948,62 +1001,9 @@ Expected Response:
|
|||
{"data":[{"id":"batch_R3V...}
|
||||
```
|
||||
|
||||
## O-Series Models
|
||||
|
||||
Azure OpenAI O-Series models are supported on LiteLLM.
|
||||
|
||||
LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic.
|
||||
|
||||
To set this explicitly, set `model` to `azure/o_series/<your-deployment-name>`.
|
||||
|
||||
**Automatic Routing**
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import litellm
|
||||
|
||||
litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: o3-mini
|
||||
litellm_params:
|
||||
model: azure/o3-model
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
**Explicit Routing**
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import litellm
|
||||
|
||||
litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: o3-mini
|
||||
litellm_params:
|
||||
model: azure/o_series/my-random-deployment-name
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1428,10 +1428,14 @@ response = litellm.embedding(
|
|||
|
||||
|
||||
## Supported AWS Bedrock Models
|
||||
|
||||
LiteLLM supports ALL Bedrock models.
|
||||
|
||||
Here's an example of using a bedrock model with LiteLLM. For a complete list, refer to the [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
|
||||
|
||||
| Model Name | Command |
|
||||
|----------------------------|------------------------------------------------------------------|
|
||||
| Deepseek R1 | `completion(model='bedrock/us.deepseek.r1-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
|
||||
| Anthropic Claude-V3.5 Sonnet | `completion(model='bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
|
||||
| Anthropic Claude-V3 sonnet | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
|
||||
| Anthropic Claude-V3 Haiku | `completion(model='bedrock/anthropic.claude-3-haiku-20240307-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
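For instance, the Deepseek R1 row above corresponds to a call like the one below. This is a minimal sketch, assuming AWS credentials are supplied via environment variables and that the model is available in your region (`us-east-1` here is illustrative):

```python
import os
from litellm import completion

# AWS credentials / region picked up by LiteLLM's Bedrock integration
os.environ["AWS_ACCESS_KEY_ID"] = ""         # your access key
os.environ["AWS_SECRET_ACCESS_KEY"] = ""     # your secret key
os.environ["AWS_REGION_NAME"] = "us-east-1"  # illustrative region

messages = [{"role": "user", "content": "Tell me a joke."}]

# model name taken from the table above
response = completion(model="bedrock/us.deepseek.r1-v1:0", messages=messages)
print(response.choices[0].message.content)
```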
|
||||
|
|
|
@ -202,6 +202,67 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
|||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## Using Ollama FIM on `/v1/completions`
|
||||
|
||||
LiteLLM supports calling Ollama's `/api/generate` endpoint on `/v1/completions` requests.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import litellm
|
||||
litellm._turn_on_debug() # turn on debug to see the request
|
||||
from litellm import completion
|
||||
|
||||
response = completion(
|
||||
model="ollama/llama3.1",
|
||||
prompt="Hello, world!",
|
||||
api_base="http://localhost:11434"
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: "llama3.1"
|
||||
litellm_params:
|
||||
model: "ollama/llama3.1"
|
||||
api_base: "http://localhost:11434"
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml --detailed_debug
|
||||
|
||||
# RUNNING ON http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set)
|
||||
base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL
|
||||
)
|
||||
|
||||
response = client.completions.create(
    model="llama3.1",  # model_name registered on the proxy in config.yaml
    prompt="Hello, world!"
)
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Using ollama `api/chat`
|
||||
In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat`
|
||||
|
||||
|
|
|
@ -228,6 +228,92 @@ response = completion(
|
|||
|
||||
```
|
||||
|
||||
## PDF File Parsing
|
||||
|
||||
OpenAI has a new `file` message type that allows you to pass in a PDF file and have it parsed into a structured output. [Read more](https://platform.openai.com/docs/guides/pdf-files?api-mode=chat&lang=python)
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import base64
|
||||
from litellm import completion
|
||||
|
||||
with open("draconomicon.pdf", "rb") as f:
|
||||
data = f.read()
|
||||
|
||||
base64_string = base64.b64encode(data).decode("utf-8")
|
||||
|
||||
completion = completion(
|
||||
model="gpt-4o",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"filename": "draconomicon.pdf",
|
||||
"file_data": f"data:application/pdf;base64,{base64_string}",
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What is the first dragon in the book?",
|
||||
}
|
||||
],
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: openai-model
|
||||
litellm_params:
|
||||
model: gpt-4o
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
litellm --config config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "openai-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": [
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"filename": "draconomicon.pdf",
|
||||
"file_data": f"data:application/pdf;base64,{base64_string}",
|
||||
}
|
||||
}
|
||||
]}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## OpenAI Fine Tuned Models
|
||||
|
||||
| Model Name | Function Call |
|
||||
|
@ -449,26 +535,6 @@ response = litellm.acompletion(
|
|||
)
|
||||
```
|
||||
|
||||
### Using Helicone Proxy with LiteLLM
|
||||
```python
|
||||
import os
|
||||
import litellm
|
||||
from litellm import completion
|
||||
|
||||
os.environ["OPENAI_API_KEY"] = ""
|
||||
|
||||
# os.environ["OPENAI_API_BASE"] = ""
|
||||
litellm.api_base = "https://oai.hconeai.com/v1"
|
||||
litellm.headers = {
|
||||
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",
|
||||
"Helicone-Cache-Enabled": "true",
|
||||
}
|
||||
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
|
||||
# openai call
|
||||
response = completion("gpt-3.5-turbo", messages)
|
||||
```
|
||||
|
||||
### Using OpenAI Proxy with LiteLLM
|
||||
```python
|
||||
|
|
|
@ -10,9 +10,11 @@ LiteLLM supports all the text / chat / vision models from [OpenRouter](https://o
|
|||
import os
|
||||
from litellm import completion
|
||||
os.environ["OPENROUTER_API_KEY"] = ""
|
||||
os.environ["OPENROUTER_API_BASE"] = "" # [OPTIONAL] defaults to https://openrouter.ai/api/v1
|
||||
|
||||
os.environ["OR_SITE_URL"] = "" # optional
|
||||
os.environ["OR_APP_NAME"] = "" # optional
|
||||
|
||||
os.environ["OR_SITE_URL"] = "" # [OPTIONAL]
|
||||
os.environ["OR_APP_NAME"] = "" # [OPTIONAL]
|
||||
|
||||
response = completion(
|
||||
model="openrouter/google/palm-2-chat-bison",
|
||||
|
|
|
@ -70,6 +70,21 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit
|
|||
response: str,
|
||||
):
|
||||
pass
|
||||
|
||||
async def async_post_call_streaming_iterator_hook(
|
||||
self,
|
||||
user_api_key_dict: UserAPIKeyAuth,
|
||||
response: Any,
|
||||
request_data: dict,
|
||||
) -> AsyncGenerator[ModelResponseStream, None]:
|
||||
"""
|
||||
Passes the entire stream to the guardrail
|
||||
|
||||
This is useful for plugins that need to see the entire stream.
|
||||
"""
|
||||
async for item in response:
|
||||
yield item
|
||||
|
||||
proxy_handler_instance = MyCustomHandler()
|
||||
```
|
||||
|
||||
|
|
|
@ -147,6 +147,7 @@ general_settings:
|
|||
|------|------|-------------|
|
||||
| completion_model | string | The default model to use for completions when `model` is not specified in the request |
|
||||
| disable_spend_logs | boolean | If true, turns off writing each transaction to the database |
|
||||
| disable_spend_updates | boolean | If true, turns off all spend updates to the DB. Including key/user/team spend updates. |
|
||||
| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) |
|
||||
| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached |
|
||||
| disable_reset_budget | boolean | If true, turns off reset budget scheduled task |
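As a rough sketch of how these flags are applied, the booleans in this table sit directly under `general_settings` in the proxy config; the values below are examples, not defaults:

```yaml
general_settings:
  master_key: "sk-1234"          # can also be set as an environment variable
  disable_spend_logs: true       # skip writing per-transaction spend logs to the DB
  disable_spend_updates: true    # skip all key/user/team spend updates to the DB
  disable_reset_budget: false    # keep the scheduled budget reset task running
```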
|
||||
|
|
|
@ -10,10 +10,12 @@ Use this is you want to write code to run a custom guardrail
|
|||
|
||||
### 1. Write a `CustomGuardrail` Class
|
||||
|
||||
A CustomGuardrail has 3 methods to enforce guardrails
|
||||
A CustomGuardrail has 4 methods to enforce guardrails
|
||||
- `async_pre_call_hook` - (Optional) modify input or reject request before making LLM API call
|
||||
- `async_moderation_hook` - (Optional) reject request, runs while making LLM API call (help to lower latency)
|
||||
- `async_post_call_success_hook`- (Optional) apply guardrail on input/output, runs after making LLM API call
|
||||
- `async_post_call_streaming_iterator_hook` - (Optional) pass the entire stream to the guardrail
|
||||
|
||||
|
||||
**[See detailed spec of methods here](#customguardrail-methods)**
|
||||
|
||||
|
@ -128,6 +130,23 @@ class myCustomGuardrail(CustomGuardrail):
|
|||
):
|
||||
raise ValueError("Guardrail failed Coffee Detected")
|
||||
|
||||
async def async_post_call_streaming_iterator_hook(
|
||||
self,
|
||||
user_api_key_dict: UserAPIKeyAuth,
|
||||
response: Any,
|
||||
request_data: dict,
|
||||
) -> AsyncGenerator[ModelResponseStream, None]:
|
||||
"""
|
||||
Passes the entire stream to the guardrail
|
||||
|
||||
This is useful for guardrails that need to see the entire response, such as PII masking.
|
||||
|
||||
See Aim guardrail implementation for an example - https://github.com/BerriAI/litellm/blob/d0e022cfacb8e9ebc5409bb652059b6fd97b45c0/litellm/proxy/guardrails/guardrail_hooks/aim.py#L168
|
||||
|
||||
Triggered by mode: 'post_call'
|
||||
"""
|
||||
async for item in response:
|
||||
yield item
|
||||
|
||||
```
|
||||
|
||||
|
|
|
@ -79,6 +79,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
|
|||
| `response_cost` | `Optional[str]` | Optional response cost |
|
||||
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
|
||||
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
|
||||
| `litellm_model_name` | `Optional[str]` | Model name sent in request |
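For orientation, the fields listed above might appear in a logged payload roughly as follows; this is an illustrative fragment only, with other hidden params omitted:

```json
{
  "response_cost": "0.000042",
  "additional_headers": null,
  "batch_models": null,
  "litellm_model_name": "gpt-4o-2024-08-06"
}
```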
|
||||
|
||||
## StandardLoggingModelInformation
|
||||
|
||||
|
|
|
@ -43,19 +43,19 @@ These headers are useful for clients to understand the current rate limit status
|
|||
| `x-litellm-max-fallbacks` | int | Maximum number of fallback attempts allowed |
|
||||
|
||||
## Cost Tracking Headers
|
||||
| Header | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `x-litellm-response-cost` | float | Cost of the API call |
|
||||
| `x-litellm-key-spend` | float | Total spend for the API key |
|
||||
| Header | Type | Description | Available on Pass-Through Endpoints |
|
||||
|--------|------|-------------|-------------|
|
||||
| `x-litellm-response-cost` | float | Cost of the API call | |
|
||||
| `x-litellm-key-spend` | float | Total spend for the API key | ✅ |
|
||||
|
||||
## LiteLLM Specific Headers
|
||||
| Header | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `x-litellm-call-id` | string | Unique identifier for the API call |
|
||||
| `x-litellm-model-id` | string | Unique identifier for the model used |
|
||||
| `x-litellm-model-api-base` | string | Base URL of the API endpoint |
|
||||
| `x-litellm-version` | string | Version of LiteLLM being used |
|
||||
| `x-litellm-model-group` | string | Model group identifier |
|
||||
| Header | Type | Description | Available on Pass-Through Endpoints |
|
||||
|--------|------|-------------|-------------|
|
||||
| `x-litellm-call-id` | string | Unique identifier for the API call | ✅ |
|
||||
| `x-litellm-model-id` | string | Unique identifier for the model used | |
|
||||
| `x-litellm-model-api-base` | string | Base URL of the API endpoint | ✅ |
|
||||
| `x-litellm-version` | string | Version of LiteLLM being used | |
|
||||
| `x-litellm-model-group` | string | Model group identifier | |
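To inspect these headers yourself, any request to the proxy with `curl -i` prints them alongside the response body. A minimal sketch; the key, model name, and header values shown are placeholders:

```bash
curl -i http://0.0.0.0:4000/chat/completions \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer sk-1234' \
  -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "hi"}]}'

# Example headers in the response (illustrative values):
# x-litellm-call-id: 0f6a8c1e-...
# x-litellm-response-cost: 0.000021
# x-litellm-model-api-base: https://api.openai.com
# x-litellm-version: 1.63.14
```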
|
||||
|
||||
## Response headers from LLM providers
|
||||
|
||||
|
|
docs/my-website/img/arize.png (new binary file, 707 KiB)
|
@ -26,14 +26,6 @@ This release is primarily focused on:
|
|||
- UI - Credential Management, re-use credentials when adding new models
|
||||
- UI - Test Connection to LLM Provider before adding a model
|
||||
|
||||
:::info
|
||||
|
||||
This release will be live on 03/16/2025
|
||||
|
||||
:::
|
||||
|
||||
<!-- <Image img={require('../../img/release_notes/v16311_release.jpg')} /> -->
|
||||
|
||||
## Known Issues
|
||||
- 🚨 Known issue on Azure OpenAI - We don't recommend upgrading if you use Azure OpenAI. This version failed our Azure OpenAI load test
|
||||
|
||||
|
|
docs/my-website/release_notes/v1.63.14/index.md (new file, 130 lines)
|
@ -0,0 +1,130 @@
|
|||
---
|
||||
title: v1.63.14-stable
|
||||
slug: v1.63.14-stable
|
||||
date: 2025-03-22T10:00:00
|
||||
authors:
|
||||
- name: Krrish Dholakia
|
||||
title: CEO, LiteLLM
|
||||
url: https://www.linkedin.com/in/krish-d/
|
||||
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
|
||||
- name: Ishaan Jaffer
|
||||
title: CTO, LiteLLM
|
||||
url: https://www.linkedin.com/in/reffajnaahsi/
|
||||
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
|
||||
|
||||
tags: [credential management, thinking content, responses api, snowflake]
|
||||
hide_table_of_contents: false
|
||||
---
|
||||
|
||||
import Image from '@theme/IdealImage';
|
||||
|
||||
These are the changes since `v1.63.11-stable`.
|
||||
|
||||
This release brings:
|
||||
- LLM Translation Improvements (MCP Support and Bedrock Application Profiles)
|
||||
- Perf improvements for Usage-based Routing
|
||||
- Streaming guardrail support via websockets
|
||||
|
||||
## Docker Run LiteLLM Proxy
|
||||
|
||||
```
|
||||
docker run \
  -e STORE_MODEL_IN_DB=True \
  -p 4000:4000 \
  ghcr.io/berriai/litellm:main-v1.63.14-stable
|
||||
```
|
||||
|
||||
## Demo Instance
|
||||
|
||||
Here's a Demo Instance to test changes:
|
||||
- Instance: https://demo.litellm.ai/
|
||||
- Login Credentials:
|
||||
- Username: admin
|
||||
- Password: sk-1234
|
||||
|
||||
|
||||
|
||||
## New Models / Updated Models
|
||||
|
||||
- Azure gpt-4o - fixed pricing to latest global pricing - [PR](https://github.com/BerriAI/litellm/pull/9361)
|
||||
- O1-Pro - add pricing + model information - [PR](https://github.com/BerriAI/litellm/pull/9397)
|
||||
- Azure AI - mistral 3.1 small pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453)
|
||||
- Azure - gpt-4.5-preview pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453)
|
||||
|
||||
|
||||
|
||||
## LLM Translation
|
||||
|
||||
1. **New LLM Features**
|
||||
|
||||
- Bedrock: Support bedrock application inference profiles [Docs](https://docs.litellm.ai/docs/providers/bedrock#bedrock-application-inference-profile)
|
||||
- Infer aws region from bedrock application profile id - (`arn:aws:bedrock:us-east-1:...`)
|
||||
- Ollama - support calling via `/v1/completions` [Get Started](../../docs/providers/ollama#using-ollama-fim-on-v1completions)
|
||||
- Bedrock - support `us.deepseek.r1-v1:0` model name [Docs](../../docs/providers/bedrock#supported-aws-bedrock-models)
|
||||
- OpenRouter - `OPENROUTER_API_BASE` env var support [Docs](../../docs/providers/openrouter.md)
|
||||
- Azure - add audio model parameter support - [Docs](../../docs/providers/azure#azure-audio-model)
|
||||
- OpenAI - PDF File support [Docs](../../docs/completion/document_understanding#openai-file-message-type)
|
||||
- OpenAI - o1-pro Responses API streaming support [Docs](../../docs/response_api.md#streaming)
|
||||
- [BETA] MCP - Use MCP Tools with LiteLLM SDK [Docs](../../docs/mcp)
|
||||
|
||||
2. **Bug Fixes**
|
||||
|
||||
- Voyage: prompt token on embedding tracking fix - [PR](https://github.com/BerriAI/litellm/commit/56d3e75b330c3c3862dc6e1c51c1210e48f1068e)
|
||||
- Sagemaker - Fix ‘Too little data for declared Content-Length’ error - [PR](https://github.com/BerriAI/litellm/pull/9326)
|
||||
- OpenAI-compatible models - fix issue when calling openai-compatible models w/ custom_llm_provider set - [PR](https://github.com/BerriAI/litellm/pull/9355)
|
||||
- VertexAI - Embedding ‘outputDimensionality’ support - [PR](https://github.com/BerriAI/litellm/commit/437dbe724620675295f298164a076cbd8019d304)
|
||||
- Anthropic - return consistent json response format on streaming/non-streaming - [PR](https://github.com/BerriAI/litellm/pull/9437)
|
||||
|
||||
## Spend Tracking Improvements
|
||||
|
||||
- `litellm_proxy/` - support reading litellm response cost header from proxy, when using client sdk
|
||||
- Reset Budget Job - fix budget reset error on keys/teams/users [PR](https://github.com/BerriAI/litellm/pull/9329)
|
||||
- Streaming - Prevents final chunk w/ usage from being ignored (impacted bedrock streaming + cost tracking) [PR](https://github.com/BerriAI/litellm/pull/9314)
|
||||
|
||||
|
||||
## UI
|
||||
|
||||
1. Users Page
|
||||
- Feature: Control default internal user settings [PR](https://github.com/BerriAI/litellm/pull/9328)
|
||||
2. Icons:
|
||||
- Feature: Replace external "artificialanalysis.ai" icons by local svg [PR](https://github.com/BerriAI/litellm/pull/9374)
|
||||
3. Sign In/Sign Out
|
||||
- Fix: Default login when `default_user_id` user does not exist in DB [PR](https://github.com/BerriAI/litellm/pull/9395)
|
||||
|
||||
|
||||
## Logging Integrations
|
||||
|
||||
- Support post-call guardrails for streaming responses [Get Started](../../docs/proxy/guardrails/custom_guardrail#1-write-a-customguardrail-class)
|
||||
- Arize [Get Started](../../docs/observability/arize_integration)
|
||||
- fix invalid package import [PR](https://github.com/BerriAI/litellm/pull/9338)
|
||||
- migrate to using standardloggingpayload for metadata, ensures spans land successfully [PR](https://github.com/BerriAI/litellm/pull/9338)
|
||||
- fix logging to just log the LLM I/O [PR](https://github.com/BerriAI/litellm/pull/9353)
|
||||
- Dynamic API Key/Space param support [Get Started](../../docs/observability/arize_integration#pass-arize-spacekey-per-request)
|
||||
- StandardLoggingPayload - Log litellm_model_name in payload. Allows knowing what the model sent to API provider was [Get Started](../../docs/proxy/logging_spec#standardlogginghiddenparams)
|
||||
- Prompt Management - Allow building custom prompt management integration [Get Started](../../docs/proxy/custom_prompt_management.md)
|
||||
|
||||
## Performance / Reliability improvements
|
||||
|
||||
- Redis Caching - add 5s default timeout, prevents hanging redis connection from impacting llm calls [PR](https://github.com/BerriAI/litellm/commit/db92956ae33ed4c4e3233d7e1b0c7229817159bf)
|
||||
- Allow disabling all spend updates / writes to DB - patch to allow disabling all spend updates to DB with a flag [PR](https://github.com/BerriAI/litellm/pull/9331)
|
||||
- Azure OpenAI - correctly re-use azure openai client, fixes perf issue from previous Stable release [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413)
|
||||
- Azure OpenAI - uses litellm.ssl_verify on Azure/OpenAI clients [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413)
|
||||
- Usage-based routing - Wildcard model support [Get Started](../../docs/proxy/usage_based_routing#wildcard-model-support)
|
||||
- Usage-based routing - Support batch writing increments to redis - reduces latency to same as ‘simple-shuffle’ [PR](https://github.com/BerriAI/litellm/pull/9357)
|
||||
- Router - show reason for model cooldown on ‘no healthy deployments available error’ [PR](https://github.com/BerriAI/litellm/pull/9438)
|
||||
- Caching - add max value limit to an item in in-memory cache (1MB) - prevents OOM errors on large image url’s being sent through proxy [PR](https://github.com/BerriAI/litellm/pull/9448)
|
||||
|
||||
|
||||
## General Improvements
|
||||
|
||||
- Passthrough Endpoints - support returning api-base on pass-through endpoints Response Headers [Docs](../../docs/proxy/response_headers#litellm-specific-headers)
|
||||
- SSL - support reading ssl security level from env var - Allows user to specify lower security settings [Get Started](../../docs/guides/security_settings)
|
||||
- Credentials - only poll Credentials table when `STORE_MODEL_IN_DB` is True [PR](https://github.com/BerriAI/litellm/pull/9376)
|
||||
- Image URL Handling - new architecture doc on image url handling [Docs](../../docs/proxy/image_handling)
|
||||
- OpenAI - bump to pip install "openai==1.68.2" [PR](https://github.com/BerriAI/litellm/commit/e85e3bc52a9de86ad85c3dbb12d87664ee567a5a)
|
||||
- Gunicorn - security fix - bump gunicorn==23.0.0 [PR](https://github.com/BerriAI/litellm/commit/7e9fc92f5c7fea1e7294171cd3859d55384166eb)
|
||||
|
||||
|
||||
## Complete Git Diff
|
||||
|
||||
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.63.11-stable...v1.63.14.rc)
|
|
@ -243,7 +243,9 @@ const sidebars = {
|
|||
"exception_mapping",
|
||||
"completion/provider_specific_params",
|
||||
"guides/finetuned_models",
|
||||
"guides/security_settings",
|
||||
"completion/audio",
|
||||
"completion/web_search",
|
||||
"completion/document_understanding",
|
||||
"completion/vision",
|
||||
"completion/json_mode",
|
||||
|
|
|
@ -122,6 +122,9 @@ langsmith_batch_size: Optional[int] = None
|
|||
prometheus_initialize_budget_metrics: Optional[bool] = False
|
||||
argilla_batch_size: Optional[int] = None
|
||||
datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
|
||||
gcs_pub_sub_use_v1: Optional[bool] = (
|
||||
False # if you want to use v1 gcs pubsub logged payload
|
||||
)
|
||||
argilla_transformation_object: Optional[Dict[str, Any]] = None
|
||||
_async_input_callback: List[Union[str, Callable, CustomLogger]] = (
|
||||
[]
|
||||
|
@ -756,6 +759,7 @@ from .utils import (
|
|||
create_pretrained_tokenizer,
|
||||
create_tokenizer,
|
||||
supports_function_calling,
|
||||
supports_web_search,
|
||||
supports_response_schema,
|
||||
supports_parallel_function_calling,
|
||||
supports_vision,
|
||||
|
|
|
@ -88,16 +88,16 @@ class Cache:
|
|||
s3_aws_session_token: Optional[str] = None,
|
||||
s3_config: Optional[Any] = None,
|
||||
s3_path: Optional[str] = None,
|
||||
redis_semantic_cache_use_async=False,
|
||||
redis_semantic_cache_embedding_model="text-embedding-ada-002",
|
||||
redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
|
||||
redis_semantic_cache_index_name: Optional[str] = None,
|
||||
redis_flush_size: Optional[int] = None,
|
||||
redis_startup_nodes: Optional[List] = None,
|
||||
disk_cache_dir=None,
|
||||
disk_cache_dir: Optional[str] = None,
|
||||
qdrant_api_base: Optional[str] = None,
|
||||
qdrant_api_key: Optional[str] = None,
|
||||
qdrant_collection_name: Optional[str] = None,
|
||||
qdrant_quantization_config: Optional[str] = None,
|
||||
qdrant_semantic_cache_embedding_model="text-embedding-ada-002",
|
||||
qdrant_semantic_cache_embedding_model: str = "text-embedding-ada-002",
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
|
@ -170,8 +170,8 @@ class Cache:
|
|||
port=port,
|
||||
password=password,
|
||||
similarity_threshold=similarity_threshold,
|
||||
use_async=redis_semantic_cache_use_async,
|
||||
embedding_model=redis_semantic_cache_embedding_model,
|
||||
index_name=redis_semantic_cache_index_name,
|
||||
**kwargs,
|
||||
)
|
||||
elif type == LiteLLMCacheType.QDRANT_SEMANTIC:
|
||||
|
|
|
@ -1,337 +1,437 @@
|
|||
"""
|
||||
Redis Semantic Cache implementation
|
||||
Redis Semantic Cache implementation for LiteLLM
|
||||
|
||||
Has 4 methods:
|
||||
- set_cache
|
||||
- get_cache
|
||||
- async_set_cache
|
||||
- async_get_cache
|
||||
The RedisSemanticCache provides semantic caching functionality using Redis as a backend.
|
||||
This cache stores responses based on the semantic similarity of prompts rather than
|
||||
exact matching, allowing for more flexible caching of LLM responses.
|
||||
|
||||
This implementation uses RedisVL's SemanticCache to find semantically similar prompts
|
||||
and their cached responses.
|
||||
"""
|
||||
|
||||
import ast
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any
|
||||
import os
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import litellm
|
||||
from litellm._logging import print_verbose
|
||||
|
||||
from litellm.litellm_core_utils.prompt_templates.common_utils import get_str_from_messages
|
||||
from .base_cache import BaseCache
|
||||
|
||||
|
||||
class RedisSemanticCache(BaseCache):
|
||||
"""
|
||||
Redis-backed semantic cache for LLM responses.
|
||||
|
||||
This cache uses vector similarity to find semantically similar prompts that have been
|
||||
previously sent to the LLM, allowing for cache hits even when prompts are not identical
|
||||
but carry similar meaning.
|
||||
"""
|
||||
|
||||
DEFAULT_REDIS_INDEX_NAME: str = "litellm_semantic_cache_index"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host=None,
|
||||
port=None,
|
||||
password=None,
|
||||
redis_url=None,
|
||||
similarity_threshold=None,
|
||||
use_async=False,
|
||||
embedding_model="text-embedding-ada-002",
|
||||
host: Optional[str] = None,
|
||||
port: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
redis_url: Optional[str] = None,
|
||||
similarity_threshold: Optional[float] = None,
|
||||
embedding_model: str = "text-embedding-ada-002",
|
||||
index_name: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
from redisvl.index import SearchIndex
|
||||
|
||||
print_verbose(
|
||||
"redis semantic-cache initializing INDEX - litellm_semantic_cache_index"
|
||||
)
|
||||
if similarity_threshold is None:
|
||||
raise Exception("similarity_threshold must be provided, passed None")
|
||||
self.similarity_threshold = similarity_threshold
|
||||
self.embedding_model = embedding_model
|
||||
schema = {
|
||||
"index": {
|
||||
"name": "litellm_semantic_cache_index",
|
||||
"prefix": "litellm",
|
||||
"storage_type": "hash",
|
||||
},
|
||||
"fields": {
|
||||
"text": [{"name": "response"}],
|
||||
"vector": [
|
||||
{
|
||||
"name": "litellm_embedding",
|
||||
"dims": 1536,
|
||||
"distance_metric": "cosine",
|
||||
"algorithm": "flat",
|
||||
"datatype": "float32",
|
||||
}
|
||||
],
|
||||
},
|
||||
}
|
||||
if redis_url is None:
|
||||
# if no url passed, check if host, port and password are passed, if not raise an Exception
|
||||
if host is None or port is None or password is None:
|
||||
# try checking env for host, port and password
|
||||
import os
|
||||
|
||||
host = os.getenv("REDIS_HOST")
|
||||
port = os.getenv("REDIS_PORT")
|
||||
password = os.getenv("REDIS_PASSWORD")
|
||||
if host is None or port is None or password is None:
|
||||
raise Exception("Redis host, port, and password must be provided")
|
||||
|
||||
redis_url = "redis://:" + password + "@" + host + ":" + port
|
||||
print_verbose(f"redis semantic-cache redis_url: {redis_url}")
|
||||
if use_async is False:
|
||||
self.index = SearchIndex.from_dict(schema)
|
||||
self.index.connect(redis_url=redis_url)
|
||||
try:
|
||||
self.index.create(overwrite=False) # don't overwrite existing index
|
||||
except Exception as e:
|
||||
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
|
||||
elif use_async is True:
|
||||
schema["index"]["name"] = "litellm_semantic_cache_index_async"
|
||||
self.index = SearchIndex.from_dict(schema)
|
||||
self.index.connect(redis_url=redis_url, use_async=True)
|
||||
|
||||
#
|
||||
def _get_cache_logic(self, cached_response: Any):
|
||||
"""
|
||||
Common 'get_cache_logic' across sync + async redis client implementations
|
||||
Initialize the Redis Semantic Cache.
|
||||
|
||||
Args:
|
||||
host: Redis host address
|
||||
port: Redis port
|
||||
password: Redis password
|
||||
redis_url: Full Redis URL (alternative to separate host/port/password)
|
||||
similarity_threshold: Threshold for semantic similarity (0.0 to 1.0)
|
||||
where 1.0 requires exact matches and 0.0 accepts any match
|
||||
embedding_model: Model to use for generating embeddings
|
||||
index_name: Name for the Redis index
|
||||
ttl: Default time-to-live for cache entries in seconds
|
||||
**kwargs: Additional arguments passed to the Redis client
|
||||
|
||||
Raises:
|
||||
Exception: If similarity_threshold is not provided or required Redis
|
||||
connection information is missing
|
||||
"""
|
||||
from redisvl.extensions.llmcache import SemanticCache
|
||||
from redisvl.utils.vectorize import CustomTextVectorizer
|
||||
|
||||
if index_name is None:
|
||||
index_name = self.DEFAULT_REDIS_INDEX_NAME
|
||||
|
||||
print_verbose(f"Redis semantic-cache initializing index - {index_name}")
|
||||
|
||||
# Validate similarity threshold
|
||||
if similarity_threshold is None:
|
||||
raise ValueError("similarity_threshold must be provided, passed None")
|
||||
|
||||
# Store configuration
|
||||
self.similarity_threshold = similarity_threshold
|
||||
|
||||
# Convert similarity threshold [0,1] to distance threshold [0,2]
|
||||
# For cosine distance: 0 = most similar, 2 = least similar
|
||||
# While similarity: 1 = most similar, 0 = least similar
|
||||
self.distance_threshold = 1 - similarity_threshold
|
||||
self.embedding_model = embedding_model
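As a quick worked example of the similarity-to-distance conversion above (numbers chosen purely for illustration): a `similarity_threshold` of 0.8 becomes a `distance_threshold` of 0.2, and only cached entries whose cosine distance to the new prompt is at or below that value qualify as hits.

```python
# Illustrative only - mirrors the conversion above with made-up numbers.
similarity_threshold = 0.8
distance_threshold = 1 - similarity_threshold    # 0.2

candidate_distances = [0.05, 0.19, 0.35]         # cosine distances returned by the vector search
hits = [d for d in candidate_distances if d <= distance_threshold]
print(hits)  # [0.05, 0.19] -> only sufficiently similar prompts are served from cache
```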
|
||||
|
||||
# Set up Redis connection
|
||||
if redis_url is None:
|
||||
try:
|
||||
# Attempt to use provided parameters or fallback to environment variables
|
||||
host = host or os.environ['REDIS_HOST']
|
||||
port = port or os.environ['REDIS_PORT']
|
||||
password = password or os.environ['REDIS_PASSWORD']
|
||||
except KeyError as e:
|
||||
# Raise a more informative exception if any of the required keys are missing
|
||||
missing_var = e.args[0]
|
||||
raise ValueError(f"Missing required Redis configuration: {missing_var}. "
|
||||
f"Provide {missing_var} or redis_url.") from e
|
||||
|
||||
redis_url = f"redis://:{password}@{host}:{port}"
|
||||
|
||||
print_verbose(f"Redis semantic-cache redis_url: {redis_url}")
|
||||
|
||||
# Initialize the Redis vectorizer and cache
|
||||
cache_vectorizer = CustomTextVectorizer(self._get_embedding)
|
||||
|
||||
self.llmcache = SemanticCache(
|
||||
name=index_name,
|
||||
redis_url=redis_url,
|
||||
vectorizer=cache_vectorizer,
|
||||
distance_threshold=self.distance_threshold,
|
||||
overwrite=False,
|
||||
)
|
||||
|
||||
def _get_ttl(self, **kwargs) -> Optional[int]:
|
||||
"""
|
||||
Get the TTL (time-to-live) value for cache entries.
|
||||
|
||||
Args:
|
||||
**kwargs: Keyword arguments that may contain a custom TTL
|
||||
|
||||
Returns:
|
||||
Optional[int]: The TTL value in seconds, or None if no TTL should be applied
|
||||
"""
|
||||
ttl = kwargs.get("ttl")
|
||||
if ttl is not None:
|
||||
ttl = int(ttl)
|
||||
return ttl
|
||||
|
||||
def _get_embedding(self, prompt: str) -> List[float]:
|
||||
"""
|
||||
Generate an embedding vector for the given prompt using the configured embedding model.
|
||||
|
||||
Args:
|
||||
prompt: The text to generate an embedding for
|
||||
|
||||
Returns:
|
||||
List[float]: The embedding vector
|
||||
"""
|
||||
# Create an embedding from prompt
|
||||
embedding_response = litellm.embedding(
|
||||
model=self.embedding_model,
|
||||
input=prompt,
|
||||
cache={"no-store": True, "no-cache": True},
|
||||
)
|
||||
embedding = embedding_response["data"][0]["embedding"]
|
||||
return embedding
|
||||
|
||||
def _get_cache_logic(self, cached_response: Any) -> Any:
|
||||
"""
|
||||
Process the cached response to prepare it for use.
|
||||
|
||||
Args:
|
||||
cached_response: The raw cached response
|
||||
|
||||
Returns:
|
||||
The processed cache response, or None if input was None
|
||||
"""
|
||||
if cached_response is None:
|
||||
return cached_response
|
||||
|
||||
# check if cached_response is bytes
|
||||
# Convert bytes to string if needed
|
||||
if isinstance(cached_response, bytes):
|
||||
cached_response = cached_response.decode("utf-8")
|
||||
|
||||
# Convert string representation to Python object
|
||||
try:
|
||||
cached_response = json.loads(
|
||||
cached_response
|
||||
) # Convert string to dictionary
|
||||
except Exception:
|
||||
cached_response = ast.literal_eval(cached_response)
|
||||
cached_response = json.loads(cached_response)
|
||||
except json.JSONDecodeError:
|
||||
try:
|
||||
cached_response = ast.literal_eval(cached_response)
|
||||
except (ValueError, SyntaxError) as e:
|
||||
print_verbose(f"Error parsing cached response: {str(e)}")
|
||||
return None
|
||||
|
||||
return cached_response
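To make the two parsing branches above concrete, here is a small sketch (inputs invented for illustration) of why both `json.loads` and `ast.literal_eval` are attempted:

```python
import ast
import json

# Double-quoted JSON string -> handled by json.loads
parsed = json.loads('{"role": "assistant", "content": "hi"}')

# Python repr-style string (single quotes) -> json.loads raises, ast.literal_eval succeeds
raw = "{'role': 'assistant', 'content': 'hi'}"
try:
    parsed = json.loads(raw)
except json.JSONDecodeError:
    parsed = ast.literal_eval(raw)
print(parsed["content"])  # hi
```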
|
||||
|
||||
def set_cache(self, key, value, **kwargs):
|
||||
import numpy as np
|
||||
|
||||
print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}")
|
||||
|
||||
# get the prompt
|
||||
messages = kwargs["messages"]
|
||||
prompt = "".join(message["content"] for message in messages)
|
||||
|
||||
# create an embedding for prompt
|
||||
embedding_response = litellm.embedding(
|
||||
model=self.embedding_model,
|
||||
input=prompt,
|
||||
cache={"no-store": True, "no-cache": True},
|
||||
)
|
||||
|
||||
# get the embedding
|
||||
embedding = embedding_response["data"][0]["embedding"]
|
||||
|
||||
# make the embedding a numpy array, convert to bytes
|
||||
embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
|
||||
value = str(value)
|
||||
assert isinstance(value, str)
|
||||
|
||||
new_data = [
|
||||
{"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
|
||||
]
|
||||
|
||||
# Add more data
|
||||
self.index.load(new_data)
|
||||
|
||||
return
|
||||
|
||||
def get_cache(self, key, **kwargs):
|
||||
print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}")
|
||||
from redisvl.query import VectorQuery
|
||||
|
||||
# query
|
||||
# get the messages
|
||||
messages = kwargs["messages"]
|
||||
prompt = "".join(message["content"] for message in messages)
|
||||
|
||||
# convert to embedding
|
||||
embedding_response = litellm.embedding(
|
||||
model=self.embedding_model,
|
||||
input=prompt,
|
||||
cache={"no-store": True, "no-cache": True},
|
||||
)
|
||||
|
||||
# get the embedding
|
||||
embedding = embedding_response["data"][0]["embedding"]
|
||||
|
||||
query = VectorQuery(
|
||||
vector=embedding,
|
||||
vector_field_name="litellm_embedding",
|
||||
return_fields=["response", "prompt", "vector_distance"],
|
||||
num_results=1,
|
||||
)
|
||||
|
||||
results = self.index.query(query)
|
||||
if results is None:
|
||||
return None
|
||||
if isinstance(results, list):
|
||||
if len(results) == 0:
|
||||
return None
|
||||
|
||||
vector_distance = results[0]["vector_distance"]
|
||||
vector_distance = float(vector_distance)
|
||||
similarity = 1 - vector_distance
|
||||
cached_prompt = results[0]["prompt"]
|
||||
|
||||
# check similarity, if more than self.similarity_threshold, return results
|
||||
print_verbose(
|
||||
f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
|
||||
)
|
||||
if similarity > self.similarity_threshold:
|
||||
# cache hit !
|
||||
cached_value = results[0]["response"]
|
||||
print_verbose(
|
||||
f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
|
||||
)
|
||||
return self._get_cache_logic(cached_response=cached_value)
|
||||
else:
|
||||
# cache miss !
|
||||
return None
|
||||
|
||||
pass
|
||||
|
||||
async def async_set_cache(self, key, value, **kwargs):
|
||||
import numpy as np
|
||||
|
||||
from litellm.proxy.proxy_server import llm_model_list, llm_router
|
||||
def set_cache(self, key: str, value: Any, **kwargs) -> None:
|
||||
"""
|
||||
Store a value in the semantic cache.
|
||||
|
||||
Args:
|
||||
key: The cache key (not directly used in semantic caching)
|
||||
value: The response value to cache
|
||||
**kwargs: Additional arguments including 'messages' for the prompt
|
||||
and optional 'ttl' for time-to-live
|
||||
"""
|
||||
print_verbose(f"Redis semantic-cache set_cache, kwargs: {kwargs}")
|
||||
|
||||
try:
|
||||
await self.index.acreate(overwrite=False) # don't overwrite existing index
|
||||
# Extract the prompt from messages
|
||||
messages = kwargs.get("messages", [])
|
||||
if not messages:
|
||||
print_verbose("No messages provided for semantic caching")
|
||||
return
|
||||
|
||||
prompt = get_str_from_messages(messages)
|
||||
value_str = str(value)
|
||||
|
||||
# Get TTL and store in Redis semantic cache
|
||||
ttl = self._get_ttl(**kwargs)
|
||||
if ttl is not None:
|
||||
self.llmcache.store(prompt, value_str, ttl=int(ttl))
|
||||
else:
|
||||
self.llmcache.store(prompt, value_str)
|
||||
except Exception as e:
|
||||
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
|
||||
print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}")
|
||||
print_verbose(f"Error setting {value_str} in the Redis semantic cache: {str(e)}")
|
||||
|
||||
# get the prompt
|
||||
messages = kwargs["messages"]
|
||||
prompt = "".join(message["content"] for message in messages)
|
||||
# create an embedding for prompt
|
||||
router_model_names = (
|
||||
[m["model_name"] for m in llm_model_list]
|
||||
if llm_model_list is not None
|
||||
else []
|
||||
)
|
||||
if llm_router is not None and self.embedding_model in router_model_names:
|
||||
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
|
||||
embedding_response = await llm_router.aembedding(
|
||||
model=self.embedding_model,
|
||||
input=prompt,
|
||||
cache={"no-store": True, "no-cache": True},
|
||||
metadata={
|
||||
"user_api_key": user_api_key,
|
||||
"semantic-cache-embedding": True,
|
||||
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
|
||||
},
|
||||
)
|
||||
else:
|
||||
# convert to embedding
|
||||
embedding_response = await litellm.aembedding(
|
||||
model=self.embedding_model,
|
||||
input=prompt,
|
||||
cache={"no-store": True, "no-cache": True},
|
||||
)
|
||||
def get_cache(self, key: str, **kwargs) -> Any:
|
||||
"""
|
||||
Retrieve a semantically similar cached response.
|
||||
|
||||
Args:
|
||||
key: The cache key (not directly used in semantic caching)
|
||||
**kwargs: Additional arguments including 'messages' for the prompt
|
||||
|
||||
Returns:
|
||||
The cached response if a semantically similar prompt is found, else None
|
||||
"""
|
||||
print_verbose(f"Redis semantic-cache get_cache, kwargs: {kwargs}")
|
||||
|
||||
# get the embedding
|
||||
embedding = embedding_response["data"][0]["embedding"]
|
||||
try:
|
||||
# Extract the prompt from messages
|
||||
messages = kwargs.get("messages", [])
|
||||
if not messages:
|
||||
print_verbose("No messages provided for semantic cache lookup")
|
||||
return None
|
||||
|
||||
prompt = get_str_from_messages(messages)
|
||||
# Check the cache for semantically similar prompts
|
||||
results = self.llmcache.check(prompt=prompt)
|
||||
|
||||
# make the embedding a numpy array, convert to bytes
|
||||
embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
|
||||
value = str(value)
|
||||
assert isinstance(value, str)
|
||||
|
||||
new_data = [
|
||||
{"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
|
||||
]
|
||||
|
||||
# Add more data
|
||||
await self.index.aload(new_data)
|
||||
return
|
||||
|
||||
async def async_get_cache(self, key, **kwargs):
|
||||
print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}")
|
||||
from redisvl.query import VectorQuery
|
||||
|
||||
from litellm.proxy.proxy_server import llm_model_list, llm_router
|
||||
|
||||
# query
|
||||
# get the messages
|
||||
messages = kwargs["messages"]
|
||||
prompt = "".join(message["content"] for message in messages)
|
||||
|
||||
router_model_names = (
|
||||
[m["model_name"] for m in llm_model_list]
|
||||
if llm_model_list is not None
|
||||
else []
|
||||
)
|
||||
if llm_router is not None and self.embedding_model in router_model_names:
|
||||
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
|
||||
embedding_response = await llm_router.aembedding(
|
||||
model=self.embedding_model,
|
||||
input=prompt,
|
||||
cache={"no-store": True, "no-cache": True},
|
||||
metadata={
|
||||
"user_api_key": user_api_key,
|
||||
"semantic-cache-embedding": True,
|
||||
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
|
||||
},
|
||||
)
|
||||
else:
|
||||
# convert to embedding
|
||||
embedding_response = await litellm.aembedding(
|
||||
model=self.embedding_model,
|
||||
input=prompt,
|
||||
cache={"no-store": True, "no-cache": True},
|
||||
)
|
||||
|
||||
# get the embedding
|
||||
embedding = embedding_response["data"][0]["embedding"]
|
||||
|
||||
query = VectorQuery(
|
||||
vector=embedding,
|
||||
vector_field_name="litellm_embedding",
|
||||
return_fields=["response", "prompt", "vector_distance"],
|
||||
)
|
||||
results = await self.index.aquery(query)
|
||||
if results is None:
|
||||
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
|
||||
return None
|
||||
if isinstance(results, list):
|
||||
if len(results) == 0:
|
||||
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
|
||||
# Return None if no similar prompts found
|
||||
if not results:
|
||||
return None
|
||||
|
||||
vector_distance = results[0]["vector_distance"]
|
||||
vector_distance = float(vector_distance)
|
||||
similarity = 1 - vector_distance
|
||||
cached_prompt = results[0]["prompt"]
|
||||
# Process the best matching result
|
||||
cache_hit = results[0]
|
||||
vector_distance = float(cache_hit["vector_distance"])
|
||||
|
||||
# Convert vector distance back to similarity score
|
||||
# For cosine distance: 0 = most similar, 2 = least similar
|
||||
# While similarity: 1 = most similar, 0 = least similar
|
||||
similarity = 1 - vector_distance
|
||||
|
||||
cached_prompt = cache_hit["prompt"]
|
||||
cached_response = cache_hit["response"]
|
||||
|
||||
# check similarity, if more than self.similarity_threshold, return results
|
||||
print_verbose(
|
||||
f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
|
||||
)
|
||||
|
||||
# update kwargs["metadata"] with similarity, don't rewrite the original metadata
|
||||
kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity
|
||||
|
||||
if similarity > self.similarity_threshold:
|
||||
# cache hit !
|
||||
cached_value = results[0]["response"]
|
||||
print_verbose(
|
||||
f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
|
||||
f"Cache hit: similarity threshold: {self.similarity_threshold}, "
|
||||
f"actual similarity: {similarity}, "
|
||||
f"current prompt: {prompt}, "
|
||||
f"cached prompt: {cached_prompt}"
|
||||
)
|
||||
return self._get_cache_logic(cached_response=cached_value)
|
||||
else:
|
||||
# cache miss !
|
||||
return None
|
||||
pass
|
||||
|
||||
return self._get_cache_logic(cached_response=cached_response)
|
||||
except Exception as e:
|
||||
print_verbose(f"Error retrieving from Redis semantic cache: {str(e)}")
|
||||
|
||||
async def _get_async_embedding(self, prompt: str, **kwargs) -> List[float]:
|
||||
"""
|
||||
Asynchronously generate an embedding for the given prompt.
|
||||
|
||||
Args:
|
||||
prompt: The text to generate an embedding for
|
||||
**kwargs: Additional arguments that may contain metadata
|
||||
|
||||
Returns:
|
||||
List[float]: The embedding vector
|
||||
"""
|
||||
from litellm.proxy.proxy_server import llm_model_list, llm_router
|
||||
|
||||
async def _index_info(self):
|
||||
return await self.index.ainfo()
|
||||
# Route the embedding request through the proxy if appropriate
|
||||
router_model_names = (
|
||||
[m["model_name"] for m in llm_model_list]
|
||||
if llm_model_list is not None
|
||||
else []
|
||||
)
|
||||
|
||||
try:
|
||||
if llm_router is not None and self.embedding_model in router_model_names:
|
||||
# Use the router for embedding generation
|
||||
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
|
||||
embedding_response = await llm_router.aembedding(
|
||||
model=self.embedding_model,
|
||||
input=prompt,
|
||||
cache={"no-store": True, "no-cache": True},
|
||||
metadata={
|
||||
"user_api_key": user_api_key,
|
||||
"semantic-cache-embedding": True,
|
||||
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
|
||||
},
|
||||
)
|
||||
else:
|
||||
# Generate embedding directly
|
||||
embedding_response = await litellm.aembedding(
|
||||
model=self.embedding_model,
|
||||
input=prompt,
|
||||
cache={"no-store": True, "no-cache": True},
|
||||
)
|
||||
|
||||
async def async_set_cache_pipeline(self, cache_list, **kwargs):
|
||||
tasks = []
|
||||
for val in cache_list:
|
||||
tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
|
||||
await asyncio.gather(*tasks)
|
||||
# Extract and return the embedding vector
|
||||
return embedding_response["data"][0]["embedding"]
|
||||
except Exception as e:
|
||||
print_verbose(f"Error generating async embedding: {str(e)}")
|
||||
raise ValueError(f"Failed to generate embedding: {str(e)}") from e
|
||||
|
||||
async def async_set_cache(self, key: str, value: Any, **kwargs) -> None:
|
||||
"""
|
||||
Asynchronously store a value in the semantic cache.
|
||||
|
||||
Args:
|
||||
key: The cache key (not directly used in semantic caching)
|
||||
value: The response value to cache
|
||||
**kwargs: Additional arguments including 'messages' for the prompt
|
||||
and optional 'ttl' for time-to-live
|
||||
"""
|
||||
print_verbose(f"Async Redis semantic-cache set_cache, kwargs: {kwargs}")
|
||||
|
||||
try:
|
||||
# Extract the prompt from messages
|
||||
messages = kwargs.get("messages", [])
|
||||
if not messages:
|
||||
print_verbose("No messages provided for semantic caching")
|
||||
return
|
||||
|
||||
prompt = get_str_from_messages(messages)
|
||||
value_str = str(value)
|
||||
|
||||
# Generate embedding for the value (response) to cache
|
||||
prompt_embedding = await self._get_async_embedding(prompt, **kwargs)
|
||||
|
||||
# Get TTL and store in Redis semantic cache
|
||||
ttl = self._get_ttl(**kwargs)
|
||||
if ttl is not None:
|
||||
await self.llmcache.astore(
|
||||
prompt,
|
||||
value_str,
|
||||
vector=prompt_embedding, # Pass through custom embedding
|
||||
ttl=ttl
|
||||
)
|
||||
else:
|
||||
await self.llmcache.astore(
|
||||
prompt,
|
||||
value_str,
|
||||
vector=prompt_embedding # Pass through custom embedding
|
||||
)
|
||||
except Exception as e:
|
||||
print_verbose(f"Error in async_set_cache: {str(e)}")
|
||||
|
||||
async def async_get_cache(self, key: str, **kwargs) -> Any:
|
||||
"""
|
||||
Asynchronously retrieve a semantically similar cached response.
|
||||
|
||||
Args:
|
||||
key: The cache key (not directly used in semantic caching)
|
||||
**kwargs: Additional arguments including 'messages' for the prompt
|
||||
|
||||
Returns:
|
||||
The cached response if a semantically similar prompt is found, else None
|
||||
"""
|
||||
print_verbose(f"Async Redis semantic-cache get_cache, kwargs: {kwargs}")
|
||||
|
||||
try:
|
||||
# Extract the prompt from messages
|
||||
messages = kwargs.get("messages", [])
|
||||
if not messages:
|
||||
print_verbose("No messages provided for semantic cache lookup")
|
||||
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
|
||||
return None
|
||||
|
||||
prompt = get_str_from_messages(messages)
|
||||
|
||||
# Generate embedding for the prompt
|
||||
prompt_embedding = await self._get_async_embedding(prompt, **kwargs)
|
||||
|
||||
# Check the cache for semantically similar prompts
|
||||
results = await self.llmcache.acheck(
|
||||
prompt=prompt,
|
||||
vector=prompt_embedding
|
||||
)
|
||||
|
||||
# handle results / cache hit
|
||||
if not results:
|
||||
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 # TODO why here but not above??
|
||||
return None
|
||||
|
||||
cache_hit = results[0]
|
||||
vector_distance = float(cache_hit["vector_distance"])
|
||||
|
||||
# Convert vector distance back to similarity
|
||||
# For cosine distance: 0 = most similar, 2 = least similar
|
||||
# While similarity: 1 = most similar, 0 = least similar
|
||||
similarity = 1 - vector_distance
|
||||
|
||||
cached_prompt = cache_hit["prompt"]
|
||||
cached_response = cache_hit["response"]
|
||||
|
||||
# update kwargs["metadata"] with similarity, don't rewrite the original metadata
|
||||
kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity
|
||||
|
||||
print_verbose(
|
||||
f"Cache hit: similarity threshold: {self.similarity_threshold}, "
|
||||
f"actual similarity: {similarity}, "
|
||||
f"current prompt: {prompt}, "
|
||||
f"cached prompt: {cached_prompt}"
|
||||
)
|
||||
|
||||
return self._get_cache_logic(cached_response=cached_response)
|
||||
except Exception as e:
|
||||
print_verbose(f"Error in async_get_cache: {str(e)}")
|
||||
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
|
||||
|
||||
async def _index_info(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get information about the Redis index.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: Information about the Redis index
|
||||
"""
|
||||
aindex = await self.llmcache._get_async_index()
|
||||
return await aindex.info()
|
||||
|
||||
async def async_set_cache_pipeline(self, cache_list: List[Tuple[str, Any]], **kwargs) -> None:
|
||||
"""
|
||||
Asynchronously store multiple values in the semantic cache.
|
||||
|
||||
Args:
|
||||
cache_list: List of (key, value) tuples to cache
|
||||
**kwargs: Additional arguments
|
||||
"""
|
||||
try:
|
||||
tasks = []
|
||||
for val in cache_list:
|
||||
tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
|
||||
await asyncio.gather(*tasks)
|
||||
except Exception as e:
|
||||
print_verbose(f"Error in async_set_cache_pipeline: {str(e)}")
|
||||
|
|
|
@ -9,6 +9,9 @@ from pydantic import BaseModel
|
|||
import litellm
|
||||
import litellm._logging
|
||||
from litellm import verbose_logger
|
||||
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
|
||||
StandardBuiltInToolCostTracking,
|
||||
)
|
||||
from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
|
||||
from litellm.llms.anthropic.cost_calculation import (
|
||||
cost_per_token as anthropic_cost_per_token,
|
||||
|
@ -57,6 +60,7 @@ from litellm.types.utils import (
|
|||
LlmProvidersSet,
|
||||
ModelInfo,
|
||||
PassthroughCallTypes,
|
||||
StandardBuiltInToolsParams,
|
||||
Usage,
|
||||
)
|
||||
from litellm.utils import (
|
||||
|
@ -524,6 +528,7 @@ def completion_cost( # noqa: PLR0915
optional_params: Optional[dict] = None,
custom_pricing: Optional[bool] = None,
base_model: Optional[str] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
) -> float:
"""
Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, any litellm supported llm.

@ -802,6 +807,12 @@ def completion_cost( # noqa: PLR0915
rerank_billed_units=rerank_billed_units,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
_final_cost += StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=completion_response,
standard_built_in_tools_params=standard_built_in_tools_params,
custom_llm_provider=custom_llm_provider,
)

return _final_cost
except Exception as e:
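As a rough illustration of the addition above: the built-in tool surcharge is simply added on top of the token costs. The token prices below are invented; the $2.50 per 1k file-search calls figure comes from the cost-tracking helper introduced later in this diff.

```python
# Illustrative arithmetic only - token prices below are made up.
prompt_tokens_cost_usd_dollar = 0.0010
completion_tokens_cost_usd_dollar = 0.0020

# Built-in tool surcharge, e.g. one file_search call billed at $2.50 / 1k calls
built_in_tool_cost = 2.5 / 1000          # 0.0025

final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + built_in_tool_cost
print(round(final_cost, 4))  # 0.0055
```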
@ -861,6 +872,7 @@ def response_cost_calculator(
|
|||
base_model: Optional[str] = None,
|
||||
custom_pricing: Optional[bool] = None,
|
||||
prompt: str = "",
|
||||
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
|
||||
) -> float:
|
||||
"""
|
||||
Returns
|
||||
|
@ -890,6 +902,7 @@ def response_cost_calculator(
|
|||
custom_pricing=custom_pricing,
|
||||
base_model=base_model,
|
||||
prompt=prompt,
|
||||
standard_built_in_tools_params=standard_built_in_tools_params,
|
||||
)
|
||||
return response_cost
|
||||
except Exception as e:
|
||||
|
|
|
@ -10,13 +10,16 @@ import asyncio
|
|||
import json
|
||||
import os
|
||||
import traceback
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
||||
|
||||
from litellm.types.utils import StandardLoggingPayload
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.proxy._types import SpendLogsPayload
|
||||
else:
|
||||
SpendLogsPayload = Any
|
||||
|
||||
import litellm
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.integrations.custom_batch_logger import CustomBatchLogger
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
|
@ -61,7 +64,7 @@ class GcsPubSubLogger(CustomBatchLogger):
|
|||
self.flush_lock = asyncio.Lock()
|
||||
super().__init__(**kwargs, flush_lock=self.flush_lock)
|
||||
asyncio.create_task(self.periodic_flush())
|
||||
self.log_queue: List[SpendLogsPayload] = []
|
||||
self.log_queue: List[Union[SpendLogsPayload, StandardLoggingPayload]] = []
|
||||
|
||||
async def construct_request_headers(self) -> Dict[str, str]:
|
||||
"""Construct authorization headers using Vertex AI auth"""
|
||||
|
@ -115,13 +118,20 @@ class GcsPubSubLogger(CustomBatchLogger):
|
|||
verbose_logger.debug(
|
||||
"PubSub: Logging - Enters logging function for model %s", kwargs
|
||||
)
|
||||
spend_logs_payload = get_logging_payload(
|
||||
kwargs=kwargs,
|
||||
response_obj=response_obj,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
self.log_queue.append(spend_logs_payload)
|
||||
standard_logging_payload = kwargs.get("standard_logging_object", None)
|
||||
|
||||
# Backwards compatibility with old logging payload
|
||||
if litellm.gcs_pub_sub_use_v1 is True:
|
||||
spend_logs_payload = get_logging_payload(
|
||||
kwargs=kwargs,
|
||||
response_obj=response_obj,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
self.log_queue.append(spend_logs_payload)
|
||||
else:
|
||||
# New logging payload, StandardLoggingPayload
|
||||
self.log_queue.append(standard_logging_payload)
|
||||
|
||||
if len(self.log_queue) >= self.batch_size:
|
||||
await self.async_send_batch()
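A small sketch of the toggle this branch introduces; the flag name is taken from the diff, everything else is illustrative:

```python
# Illustrative only - opt back into the legacy SpendLogsPayload format for GCS Pub/Sub logging.
import litellm

litellm.gcs_pub_sub_use_v1 = True   # when False (the new default path), StandardLoggingPayload is emitted instead
```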
|
||||
|
@ -155,7 +165,7 @@ class GcsPubSubLogger(CustomBatchLogger):
|
|||
self.log_queue.clear()
|
||||
|
||||
async def publish_message(
|
||||
self, message: SpendLogsPayload
|
||||
self, message: Union[SpendLogsPayload, StandardLoggingPayload]
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Publish message to Google Cloud Pub/Sub using REST API
|
||||
|
|
|
@ -35,6 +35,9 @@ from litellm.integrations.custom_logger import CustomLogger
|
|||
from litellm.integrations.mlflow import MlflowLogger
|
||||
from litellm.integrations.pagerduty.pagerduty import PagerDutyAlerting
|
||||
from litellm.litellm_core_utils.get_litellm_params import get_litellm_params
|
||||
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
|
||||
StandardBuiltInToolCostTracking,
|
||||
)
|
||||
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
|
||||
from litellm.litellm_core_utils.redact_messages import (
|
||||
redact_message_input_output_from_custom_logger,
|
||||
|
@ -60,6 +63,7 @@ from litellm.types.utils import (
|
|||
ModelResponse,
|
||||
ModelResponseStream,
|
||||
RawRequestTypedDict,
|
||||
StandardBuiltInToolsParams,
|
||||
StandardCallbackDynamicParams,
|
||||
StandardLoggingAdditionalHeaders,
|
||||
StandardLoggingHiddenParams,
|
||||
|
@ -264,7 +268,9 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
self.standard_callback_dynamic_params: StandardCallbackDynamicParams = (
|
||||
self.initialize_standard_callback_dynamic_params(kwargs)
|
||||
)
|
||||
|
||||
self.standard_built_in_tools_params: StandardBuiltInToolsParams = (
|
||||
self.initialize_standard_built_in_tools_params(kwargs)
|
||||
)
|
||||
## TIME TO FIRST TOKEN LOGGING ##
|
||||
self.completion_start_time: Optional[datetime.datetime] = None
|
||||
self._llm_caching_handler: Optional[LLMCachingHandler] = None
|
||||
|
@ -369,6 +375,23 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
"""
|
||||
return _initialize_standard_callback_dynamic_params(kwargs)
|
||||
|
||||
def initialize_standard_built_in_tools_params(
|
||||
self, kwargs: Optional[Dict] = None
|
||||
) -> StandardBuiltInToolsParams:
|
||||
"""
|
||||
Initialize the standard built-in tools params from the kwargs
|
||||
|
||||
checks if web_search_options in kwargs or tools and sets the corresponding attribute in StandardBuiltInToolsParams
|
||||
"""
|
||||
return StandardBuiltInToolsParams(
|
||||
web_search_options=StandardBuiltInToolCostTracking._get_web_search_options(
|
||||
kwargs or {}
|
||||
),
|
||||
file_search=StandardBuiltInToolCostTracking._get_file_search_tool_call(
|
||||
kwargs or {}
|
||||
),
|
||||
)
|
||||
|
||||
def update_environment_variables(
|
||||
self,
|
||||
litellm_params: Dict,
|
||||
|
@ -495,6 +518,16 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
}
|
||||
return data
|
||||
|
||||
def _get_masked_api_base(self, api_base: str) -> str:
|
||||
if "key=" in api_base:
|
||||
# Find the position of "key=" in the string
|
||||
key_index = api_base.find("key=") + 4
|
||||
# Mask the last 5 characters after "key="
|
||||
masked_api_base = api_base[:key_index] + "*" * 5 + api_base[-4:]
|
||||
else:
|
||||
masked_api_base = api_base
|
||||
return str(masked_api_base)
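To see what the helper above produces, a quick sketch with an invented URL:

```python
# Illustrative only - reproduces the masking logic above on a made-up api_base.
api_base = "https://generativelanguage.googleapis.com/v1beta/models?key=sk-1234abcd"

key_index = api_base.find("key=") + 4
masked_api_base = api_base[:key_index] + "*" * 5 + api_base[-4:]
print(masked_api_base)
# https://generativelanguage.googleapis.com/v1beta/models?key=*****abcd
```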
|
||||
|
||||
def _pre_call(self, input, api_key, model=None, additional_args={}):
|
||||
"""
|
||||
Common helper function across the sync + async pre-call function
|
||||
|
@ -508,6 +541,9 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
model
|
||||
): # if model name was changes pre-call, overwrite the initial model call name with the new one
|
||||
self.model_call_details["model"] = model
|
||||
self.model_call_details["litellm_params"]["api_base"] = (
|
||||
self._get_masked_api_base(additional_args.get("api_base", ""))
|
||||
)
|
||||
|
||||
def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915
|
||||
|
||||
|
@ -691,15 +727,6 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
headers = {}
|
||||
data = additional_args.get("complete_input_dict", {})
|
||||
api_base = str(additional_args.get("api_base", ""))
|
||||
if "key=" in api_base:
|
||||
# Find the position of "key=" in the string
|
||||
key_index = api_base.find("key=") + 4
|
||||
# Mask the last 5 characters after "key="
|
||||
masked_api_base = api_base[:key_index] + "*" * 5 + api_base[-4:]
|
||||
else:
|
||||
masked_api_base = api_base
|
||||
self.model_call_details["litellm_params"]["api_base"] = masked_api_base
|
||||
|
||||
curl_command = self._get_request_curl_command(
|
||||
api_base=api_base,
|
||||
headers=headers,
|
||||
|
@ -714,11 +741,12 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
def _get_request_curl_command(
|
||||
self, api_base: str, headers: Optional[dict], additional_args: dict, data: dict
|
||||
) -> str:
|
||||
masked_api_base = self._get_masked_api_base(api_base)
|
||||
if headers is None:
|
||||
headers = {}
|
||||
curl_command = "\n\nPOST Request Sent from LiteLLM:\n"
|
||||
curl_command += "curl -X POST \\\n"
|
||||
curl_command += f"{api_base} \\\n"
|
||||
curl_command += f"{masked_api_base} \\\n"
|
||||
masked_headers = self._get_masked_headers(headers)
|
||||
formatted_headers = " ".join(
|
||||
[f"-H '{k}: {v}'" for k, v in masked_headers.items()]
|
||||
|
@ -903,6 +931,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
"optional_params": self.optional_params,
|
||||
"custom_pricing": custom_pricing,
|
||||
"prompt": prompt,
|
||||
"standard_built_in_tools_params": self.standard_built_in_tools_params,
|
||||
}
|
||||
except Exception as e: # error creating kwargs for cost calculation
|
||||
debug_info = StandardLoggingModelCostFailureDebugInformation(
|
||||
|
@ -1067,6 +1096,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
end_time=end_time,
|
||||
logging_obj=self,
|
||||
status="success",
|
||||
standard_built_in_tools_params=self.standard_built_in_tools_params,
|
||||
)
|
||||
)
|
||||
elif isinstance(result, dict): # pass-through endpoints
|
||||
|
@ -1079,6 +1109,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
end_time=end_time,
|
||||
logging_obj=self,
|
||||
status="success",
|
||||
standard_built_in_tools_params=self.standard_built_in_tools_params,
|
||||
)
|
||||
)
|
||||
elif standard_logging_object is not None:
|
||||
|
@ -1102,6 +1133,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
prompt="",
|
||||
completion=getattr(result, "content", ""),
|
||||
total_time=float_diff,
|
||||
standard_built_in_tools_params=self.standard_built_in_tools_params,
|
||||
)
|
||||
|
||||
return start_time, end_time, result
|
||||
|
@ -1155,6 +1187,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
end_time=end_time,
|
||||
logging_obj=self,
|
||||
status="success",
|
||||
standard_built_in_tools_params=self.standard_built_in_tools_params,
|
||||
)
|
||||
)
|
||||
callbacks = self.get_combined_callback_list(
|
||||
|
@ -1695,6 +1728,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
end_time=end_time,
|
||||
logging_obj=self,
|
||||
status="success",
|
||||
standard_built_in_tools_params=self.standard_built_in_tools_params,
|
||||
)
|
||||
)
|
||||
callbacks = self.get_combined_callback_list(
|
||||
|
@ -1911,6 +1945,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
status="failure",
|
||||
error_str=str(exception),
|
||||
original_exception=exception,
|
||||
standard_built_in_tools_params=self.standard_built_in_tools_params,
|
||||
)
|
||||
)
|
||||
return start_time, end_time
|
||||
|
@ -3367,6 +3402,7 @@ def get_standard_logging_object_payload(
|
|||
status: StandardLoggingPayloadStatus,
|
||||
error_str: Optional[str] = None,
|
||||
original_exception: Optional[Exception] = None,
|
||||
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
|
||||
) -> Optional[StandardLoggingPayload]:
|
||||
try:
|
||||
kwargs = kwargs or {}
|
||||
|
@ -3542,6 +3578,7 @@ def get_standard_logging_object_payload(
|
|||
guardrail_information=metadata.get(
|
||||
"standard_logging_guardrail_information", None
|
||||
),
|
||||
standard_built_in_tools_params=standard_built_in_tools_params,
|
||||
)
|
||||
|
||||
emit_standard_logging_payload(payload)
|
||||
|
|
|
@ -0,0 +1,199 @@
|
|||
"""
|
||||
Helper utilities for tracking the cost of built-in tools.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import litellm
|
||||
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
|
||||
from litellm.types.utils import (
|
||||
ModelInfo,
|
||||
ModelResponse,
|
||||
SearchContextCostPerQuery,
|
||||
StandardBuiltInToolsParams,
|
||||
)
|
||||
|
||||
|
||||
class StandardBuiltInToolCostTracking:
|
||||
"""
|
||||
Helper class for tracking the cost of built-in tools
|
||||
|
||||
Example: Web Search
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def get_cost_for_built_in_tools(
|
||||
model: str,
|
||||
response_object: Any,
|
||||
custom_llm_provider: Optional[str] = None,
|
||||
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
|
||||
) -> float:
|
||||
"""
|
||||
Get the cost of using built-in tools.
|
||||
|
||||
Supported tools:
|
||||
- Web Search
|
||||
|
||||
"""
|
||||
if standard_built_in_tools_params is not None:
|
||||
if (
|
||||
standard_built_in_tools_params.get("web_search_options", None)
|
||||
is not None
|
||||
):
|
||||
model_info = StandardBuiltInToolCostTracking._safe_get_model_info(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
|
||||
return StandardBuiltInToolCostTracking.get_cost_for_web_search(
|
||||
web_search_options=standard_built_in_tools_params.get(
|
||||
"web_search_options", None
|
||||
),
|
||||
model_info=model_info,
|
||||
)
|
||||
|
||||
if standard_built_in_tools_params.get("file_search", None) is not None:
|
||||
return StandardBuiltInToolCostTracking.get_cost_for_file_search(
|
||||
file_search=standard_built_in_tools_params.get("file_search", None),
|
||||
)
|
||||
|
||||
if isinstance(response_object, ModelResponse):
|
||||
if StandardBuiltInToolCostTracking.chat_completion_response_includes_annotations(
|
||||
response_object
|
||||
):
|
||||
model_info = StandardBuiltInToolCostTracking._safe_get_model_info(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
return StandardBuiltInToolCostTracking.get_default_cost_for_web_search(
|
||||
model_info
|
||||
)
|
||||
return 0.0
|
||||
|
||||
@staticmethod
|
||||
def _safe_get_model_info(
|
||||
model: str, custom_llm_provider: Optional[str] = None
|
||||
) -> Optional[ModelInfo]:
|
||||
try:
|
||||
return litellm.get_model_info(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def get_cost_for_web_search(
|
||||
web_search_options: Optional[WebSearchOptions] = None,
|
||||
model_info: Optional[ModelInfo] = None,
|
||||
) -> float:
|
||||
"""
|
||||
If request includes `web_search_options`, calculate the cost of the web search.
|
||||
"""
|
||||
if web_search_options is None:
|
||||
return 0.0
|
||||
if model_info is None:
|
||||
return 0.0
|
||||
|
||||
search_context_pricing: SearchContextCostPerQuery = (
|
||||
model_info.get("search_context_cost_per_query", {}) or {}
|
||||
)
|
||||
if web_search_options.get("search_context_size", None) == "low":
|
||||
return search_context_pricing.get("search_context_size_low", 0.0)
|
||||
elif web_search_options.get("search_context_size", None) == "medium":
|
||||
return search_context_pricing.get("search_context_size_medium", 0.0)
|
||||
elif web_search_options.get("search_context_size", None) == "high":
|
||||
return search_context_pricing.get("search_context_size_high", 0.0)
|
||||
return StandardBuiltInToolCostTracking.get_default_cost_for_web_search(
|
||||
model_info
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def get_default_cost_for_web_search(
|
||||
model_info: Optional[ModelInfo] = None,
|
||||
) -> float:
|
||||
"""
|
||||
If no web search options are provided, use the `search_context_size_medium` pricing.
|
||||
|
||||
https://platform.openai.com/docs/pricing#web-search
|
||||
"""
|
||||
if model_info is None:
|
||||
return 0.0
|
||||
search_context_pricing: SearchContextCostPerQuery = (
|
||||
model_info.get("search_context_cost_per_query", {}) or {}
|
||||
) or {}
|
||||
return search_context_pricing.get("search_context_size_medium", 0.0)
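A sketch of the pricing lookup these two helpers perform; the per-query prices below are placeholders, not real OpenAI prices, and the dictionary shape mirrors `search_context_cost_per_query` from this diff:

```python
# Illustrative only - shows how search_context_cost_per_query is consulted.
model_info = {
    "search_context_cost_per_query": {
        "search_context_size_low": 0.025,
        "search_context_size_medium": 0.0275,
        "search_context_size_high": 0.030,
    }
}

web_search_options = {"search_context_size": "high"}
pricing = model_info.get("search_context_cost_per_query", {}) or {}
size = web_search_options.get("search_context_size", "medium")
cost = pricing.get(f"search_context_size_{size}", 0.0)
print(cost)  # 0.03
```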
|
||||
|
||||
@staticmethod
|
||||
def get_cost_for_file_search(
|
||||
file_search: Optional[FileSearchTool] = None,
|
||||
) -> float:
|
||||
""" "
|
||||
Charged at $2.50/1k calls
|
||||
|
||||
Doc: https://platform.openai.com/docs/pricing#built-in-tools
|
||||
"""
|
||||
if file_search is None:
|
||||
return 0.0
|
||||
return 2.5 / 1000
|
||||
|
||||
@staticmethod
|
||||
def chat_completion_response_includes_annotations(
|
||||
response_object: ModelResponse,
|
||||
) -> bool:
|
||||
for _choice in response_object.choices:
|
||||
message = getattr(_choice, "message", None)
|
||||
if (
|
||||
message is not None
|
||||
and hasattr(message, "annotations")
|
||||
and message.annotations is not None
|
||||
and len(message.annotations) > 0
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _get_web_search_options(kwargs: Dict) -> Optional[WebSearchOptions]:
|
||||
if "web_search_options" in kwargs:
|
||||
return WebSearchOptions(**kwargs.get("web_search_options", {}))
|
||||
|
||||
tools = StandardBuiltInToolCostTracking._get_tools_from_kwargs(
|
||||
kwargs, "web_search_preview"
|
||||
)
|
||||
if tools:
|
||||
# Look for web search tool in the tools array
|
||||
for tool in tools:
|
||||
if isinstance(tool, dict):
|
||||
if StandardBuiltInToolCostTracking._is_web_search_tool_call(tool):
|
||||
return WebSearchOptions(**tool)
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _get_tools_from_kwargs(kwargs: Dict, tool_type: str) -> Optional[List[Dict]]:
|
||||
if "tools" in kwargs:
|
||||
tools = kwargs.get("tools", [])
|
||||
return tools
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _get_file_search_tool_call(kwargs: Dict) -> Optional[FileSearchTool]:
|
||||
tools = StandardBuiltInToolCostTracking._get_tools_from_kwargs(
|
||||
kwargs, "file_search"
|
||||
)
|
||||
if tools:
|
||||
for tool in tools:
|
||||
if isinstance(tool, dict):
|
||||
if StandardBuiltInToolCostTracking._is_file_search_tool_call(tool):
|
||||
return FileSearchTool(**tool)
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _is_web_search_tool_call(tool: Dict) -> bool:
|
||||
if tool.get("type", None) == "web_search_preview":
|
||||
return True
|
||||
if "search_context_size" in tool:
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _is_file_search_tool_call(tool: Dict) -> bool:
|
||||
if tool.get("type", None) == "file_search":
|
||||
return True
|
||||
return False
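A hedged sketch of the two request shapes these detection helpers look for; the payloads and the vector store id are illustrative, and the import path is the one used elsewhere in this diff:

```python
# Illustrative payloads only.
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
    StandardBuiltInToolCostTracking,
)

# 1) Explicit web_search_options kwarg on the request
kwargs_web = {"web_search_options": {"search_context_size": "low"}}

# 2) A tools array carrying built-in tool entries (ids are placeholders)
kwargs_tools = {
    "tools": [
        {"type": "web_search_preview", "search_context_size": "medium"},
        {"type": "file_search", "vector_store_ids": ["vs_123"]},
    ]
}

assert StandardBuiltInToolCostTracking._get_web_search_options(kwargs_web) is not None
assert StandardBuiltInToolCostTracking._get_file_search_tool_call(kwargs_tools) is not None
```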
|
|
@ -138,13 +138,22 @@ class ModelParamHelper:
|
|||
TranscriptionCreateParamsNonStreaming,
|
||||
TranscriptionCreateParamsStreaming,
|
||||
)
|
||||
non_streaming_kwargs = set(getattr(TranscriptionCreateParamsNonStreaming, "__annotations__", {}).keys())
|
||||
streaming_kwargs = set(getattr(TranscriptionCreateParamsStreaming, "__annotations__", {}).keys())
|
||||
|
||||
non_streaming_kwargs = set(
|
||||
getattr(
|
||||
TranscriptionCreateParamsNonStreaming, "__annotations__", {}
|
||||
).keys()
|
||||
)
|
||||
streaming_kwargs = set(
|
||||
getattr(
|
||||
TranscriptionCreateParamsStreaming, "__annotations__", {}
|
||||
).keys()
|
||||
)
|
||||
|
||||
all_transcription_kwargs = non_streaming_kwargs.union(streaming_kwargs)
|
||||
return all_transcription_kwargs
|
||||
except Exception as e:
|
||||
verbose_logger.warning("Error getting transcription kwargs %s", str(e))
|
||||
verbose_logger.debug("Error getting transcription kwargs %s", str(e))
|
||||
return set()
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -5304,6 +5304,17 @@
|
|||
"mode": "embedding",
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
|
||||
},
|
||||
"text-embedding-large-exp-03-07": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 8192,
|
||||
"output_vector_size": 3072,
|
||||
"input_cost_per_character": 0.000000025,
|
||||
"input_cost_per_token": 0.0000001,
|
||||
"output_cost_per_token": 0,
|
||||
"litellm_provider": "vertex_ai-embedding-models",
|
||||
"mode": "embedding",
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
|
||||
},
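For orientation, the per-token figure in the entry above works out to roughly ten cents per million input tokens:

```python
# Simple arithmetic on the values in the entry above.
tokens = 1_000_000
input_cost_per_token = 0.0000001
print(round(tokens * input_cost_per_token, 6))  # 0.1 -> $0.10 per 1M input tokens
```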
|
||||
"textembedding-gecko": {
|
||||
"max_tokens": 3072,
|
||||
"max_input_tokens": 3072,
|
||||
|
|
|
@ -5,7 +5,10 @@ model_list:
api_key: os.environ/AZURE_API_KEY
api_base: http://0.0.0.0:8090
rpm: 3

- model_name: "gpt-4o-mini-openai"
litellm_params:
model: gpt-4o-mini
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
num_retries: 0
|
|
@ -542,13 +542,10 @@ async def vertex_proxy_route(
user_api_key_dict,
stream=is_streaming_request, # type: ignore
)
except Exception as e:
except ProxyException as e:
if headers_passed_through:
raise Exception(
f"No credentials found on proxy for this request. Headers were passed through directly but request failed with error: {str(e)}"
)
else:
raise e
e.message = f"No credentials found on proxy for project_name={vertex_project} + location={vertex_location}, check `/model/info` for allowed project + region combinations with `use_in_pass_through: true`. Headers were passed through directly but request failed with error: {e.message}"
raise e

return received_value
|
|
@ -1788,9 +1788,6 @@ class ProxyConfig:
reset_color_code,
cache_password,
)
if cache_type == "redis-semantic":
# by default this should always be async
cache_params.update({"redis_semantic_cache_use_async": True})

# users can pass os.environ/ variables on the proxy - we should read them from the env
for key, value in cache_params.items():
@ -6181,18 +6178,18 @@ async def model_info_v1( # noqa: PLR0915
|
|||
)
|
||||
|
||||
if len(all_models_str) > 0:
|
||||
model_names = all_models_str
|
||||
llm_model_list = llm_router.get_model_list()
|
||||
_relevant_models = []
|
||||
for model in all_models_str:
|
||||
router_models = llm_router.get_model_list(model_name=model)
|
||||
if router_models is not None:
|
||||
_relevant_models.extend(router_models)
|
||||
if llm_model_list is not None:
|
||||
_relevant_models = [
|
||||
m for m in llm_model_list if m["model_name"] in model_names
|
||||
]
|
||||
all_models = copy.deepcopy(_relevant_models) # type: ignore
|
||||
else:
|
||||
all_models = []
|
||||
|
||||
for model in all_models:
|
||||
model = _get_proxy_model_info(model=model)
|
||||
for in_place_model in all_models:
|
||||
in_place_model = _get_proxy_model_info(model=in_place_model)
|
||||
|
||||
verbose_proxy_logger.debug("all_models: %s", all_models)
|
||||
return {"data": all_models}
|
||||
|
|
|
@ -4924,6 +4924,11 @@ class Router:
and model_info["supports_function_calling"] is True # type: ignore
):
model_group_info.supports_function_calling = True
if (
model_info.get("supports_web_search", None) is not None
and model_info["supports_web_search"] is True # type: ignore
):
model_group_info.supports_web_search = True
if (
model_info.get("supported_openai_params", None) is not None
and model_info["supported_openai_params"] is not None

@ -5286,10 +5291,11 @@ class Router:

if len(returned_models) == 0: # check if wildcard route
potential_wildcard_models = self.pattern_router.route(model_name)
if potential_wildcard_models is not None:
returned_models.extend(
[DeploymentTypedDict(**m) for m in potential_wildcard_models] # type: ignore
)
if model_name is not None and potential_wildcard_models is not None:
for m in potential_wildcard_models:
deployment_typed_dict = DeploymentTypedDict(**m) # type: ignore
deployment_typed_dict["model_name"] = model_name
returned_models.append(deployment_typed_dict)

if model_name is None:
returned_models += self.model_list
|
@ -382,6 +382,53 @@ class ChatCompletionThinkingBlock(TypedDict, total=False):
|
|||
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
|
||||
|
||||
|
||||
class WebSearchOptionsUserLocationApproximate(TypedDict, total=False):
|
||||
city: str
|
||||
"""Free text input for the city of the user, e.g. `San Francisco`."""
|
||||
|
||||
country: str
|
||||
"""
|
||||
The two-letter [ISO country code](https://en.wikipedia.org/wiki/ISO_3166-1) of
|
||||
the user, e.g. `US`.
|
||||
"""
|
||||
|
||||
region: str
|
||||
"""Free text input for the region of the user, e.g. `California`."""
|
||||
|
||||
timezone: str
|
||||
"""
|
||||
The [IANA timezone](https://timeapi.io/documentation/iana-timezones) of the
|
||||
user, e.g. `America/Los_Angeles`.
|
||||
"""
|
||||
|
||||
|
||||
class WebSearchOptionsUserLocation(TypedDict, total=False):
|
||||
approximate: Required[WebSearchOptionsUserLocationApproximate]
|
||||
"""Approximate location parameters for the search."""
|
||||
|
||||
type: Required[Literal["approximate"]]
|
||||
"""The type of location approximation. Always `approximate`."""
|
||||
|
||||
|
||||
class WebSearchOptions(TypedDict, total=False):
|
||||
search_context_size: Literal["low", "medium", "high"]
|
||||
"""
|
||||
High level guidance for the amount of context window space to use for the
|
||||
search. One of `low`, `medium`, or `high`. `medium` is the default.
|
||||
"""
|
||||
|
||||
user_location: Optional[WebSearchOptionsUserLocation]
|
||||
"""Approximate location parameters for the search."""
|
||||
|
||||
|
||||
class FileSearchTool(TypedDict, total=False):
|
||||
type: Literal["file_search"]
|
||||
"""The type of tool being defined: `file_search`"""
|
||||
|
||||
vector_store_ids: Optional[List[str]]
|
||||
"""The IDs of the vector stores to search."""
|
||||
|
||||
|
||||
class ChatCompletionAnnotationURLCitation(TypedDict, total=False):
|
||||
end_index: int
|
||||
"""The index of the last character of the URL citation in the message."""
|
||||
|
|
|
@ -559,6 +559,7 @@ class ModelGroupInfo(BaseModel):
rpm: Optional[int] = None
supports_parallel_function_calling: bool = Field(default=False)
supports_vision: bool = Field(default=False)
supports_web_search: bool = Field(default=False)
supports_function_calling: bool = Field(default=False)
supported_openai_params: Optional[List[str]] = Field(default=[])
configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None
|
|
@ -32,7 +32,9 @@ from .llms.openai import (
|
|||
ChatCompletionThinkingBlock,
|
||||
ChatCompletionToolCallChunk,
|
||||
ChatCompletionUsageBlock,
|
||||
FileSearchTool,
|
||||
OpenAIChatCompletionChunk,
|
||||
WebSearchOptions,
|
||||
)
|
||||
from .rerank import RerankResponse
|
||||
|
||||
|
@ -97,6 +99,13 @@ class ProviderSpecificModelInfo(TypedDict, total=False):
|
|||
supports_pdf_input: Optional[bool]
|
||||
supports_native_streaming: Optional[bool]
|
||||
supports_parallel_function_calling: Optional[bool]
|
||||
supports_web_search: Optional[bool]
|
||||
|
||||
|
||||
class SearchContextCostPerQuery(TypedDict, total=False):
|
||||
search_context_size_low: float
|
||||
search_context_size_medium: float
|
||||
search_context_size_high: float
|
||||
|
||||
|
||||
class ModelInfoBase(ProviderSpecificModelInfo, total=False):
|
||||
|
@ -135,6 +144,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
|
|||
output_cost_per_video_per_second: Optional[float] # only for vertex ai models
|
||||
output_cost_per_audio_per_second: Optional[float] # only for vertex ai models
|
||||
output_cost_per_second: Optional[float] # for OpenAI Speech models
|
||||
search_context_cost_per_query: Optional[
|
||||
SearchContextCostPerQuery
|
||||
] # Cost for using web search tool
|
||||
|
||||
litellm_provider: Required[str]
|
||||
mode: Required[
|
||||
|
@ -586,6 +598,11 @@ class Message(OpenAIObject):
|
|||
# OpenAI compatible APIs like mistral API will raise an error if audio is passed in
|
||||
del self.audio
|
||||
|
||||
if annotations is None:
|
||||
# ensure default response matches OpenAI spec
|
||||
# Some OpenAI compatible APIs raise an error if annotations are passed in
|
||||
del self.annotations
|
||||
|
||||
if reasoning_content is None:
|
||||
# ensure default response matches OpenAI spec
|
||||
del self.reasoning_content
|
||||
|
@ -1612,6 +1629,19 @@ class StandardLoggingUserAPIKeyMetadata(TypedDict):
|
|||
user_api_key_end_user_id: Optional[str]
|
||||
|
||||
|
||||
class StandardBuiltInToolsParams(TypedDict, total=False):
|
||||
"""
|
||||
Standard built-in OpenAI tools parameters
|
||||
|
||||
This is used to calculate the cost of built-in tools, insert any standard built-in tools parameters here
|
||||
|
||||
OpenAI charges users based on the `web_search_options` parameter
|
||||
"""
|
||||
|
||||
web_search_options: Optional[WebSearchOptions]
|
||||
file_search: Optional[FileSearchTool]
|
||||
|
||||
|
||||
class StandardLoggingPromptManagementMetadata(TypedDict):
|
||||
prompt_id: str
|
||||
prompt_variables: Optional[dict]
|
||||
|
@ -1729,6 +1759,7 @@ class StandardLoggingPayload(TypedDict):
|
|||
model_parameters: dict
|
||||
hidden_params: StandardLoggingHiddenParams
|
||||
guardrail_information: Optional[StandardLoggingGuardrailInformation]
|
||||
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams]
|
||||
|
||||
|
||||
from typing import AsyncIterator, Iterator
|
||||
|
|
|
@ -1975,7 +1975,7 @@ def supports_system_messages(model: str, custom_llm_provider: Optional[str]) ->
|
|||
)
|
||||
|
||||
|
||||
def supports_web_search(model: str, custom_llm_provider: Optional[str]) -> bool:
|
||||
def supports_web_search(model: str, custom_llm_provider: Optional[str] = None) -> bool:
|
||||
"""
|
||||
Check if the given model supports web search and return a boolean value.
|
||||
|
||||
|
@ -4544,6 +4544,10 @@ def _get_model_info_helper( # noqa: PLR0915
|
|||
supports_native_streaming=_model_info.get(
|
||||
"supports_native_streaming", None
|
||||
),
|
||||
supports_web_search=_model_info.get("supports_web_search", False),
|
||||
search_context_cost_per_query=_model_info.get(
|
||||
"search_context_cost_per_query", None
|
||||
),
|
||||
tpm=_model_info.get("tpm", None),
|
||||
rpm=_model_info.get("rpm", None),
|
||||
)
|
||||
|
@ -4612,6 +4616,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
|
|||
supports_audio_input: Optional[bool]
|
||||
supports_audio_output: Optional[bool]
|
||||
supports_pdf_input: Optional[bool]
|
||||
supports_web_search: Optional[bool]
|
||||
Raises:
|
||||
Exception: If the model is not mapped yet.
|
||||
|
||||
|
|
|
@ -5304,6 +5304,17 @@
|
|||
"mode": "embedding",
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
|
||||
},
|
||||
"text-embedding-large-exp-03-07": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 8192,
|
||||
"output_vector_size": 3072,
|
||||
"input_cost_per_character": 0.000000025,
|
||||
"input_cost_per_token": 0.0000001,
|
||||
"output_cost_per_token": 0,
|
||||
"litellm_provider": "vertex_ai-embedding-models",
|
||||
"mode": "embedding",
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
|
||||
},
|
||||
"textembedding-gecko": {
|
||||
"max_tokens": 3072,
|
||||
"max_input_tokens": 3072,
|
||||
|
|
78  poetry.lock  generated
|
@ -810,15 +810,15 @@ test = ["pytest (>=6)"]
|
|||
|
||||
[[package]]
|
||||
name = "fastapi"
|
||||
version = "0.115.11"
|
||||
version = "0.115.12"
|
||||
description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
markers = "extra == \"proxy\""
|
||||
files = [
|
||||
{file = "fastapi-0.115.11-py3-none-any.whl", hash = "sha256:32e1541b7b74602e4ef4a0260ecaf3aadf9d4f19590bba3e1bf2ac4666aa2c64"},
|
||||
{file = "fastapi-0.115.11.tar.gz", hash = "sha256:cc81f03f688678b92600a65a5e618b93592c65005db37157147204d8924bf94f"},
|
||||
{file = "fastapi-0.115.12-py3-none-any.whl", hash = "sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d"},
|
||||
{file = "fastapi-0.115.12.tar.gz", hash = "sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -1445,14 +1445,14 @@ type = ["pytest-mypy"]
|
|||
|
||||
[[package]]
|
||||
name = "iniconfig"
|
||||
version = "2.0.0"
|
||||
version = "2.1.0"
|
||||
description = "brain-dead simple config-ini parsing"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
python-versions = ">=3.8"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
|
||||
{file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
|
||||
{file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"},
|
||||
{file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -2137,14 +2137,14 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
|
|||
|
||||
[[package]]
|
||||
name = "openai"
|
||||
version = "1.66.3"
|
||||
version = "1.68.2"
|
||||
description = "The official Python library for the openai API"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "openai-1.66.3-py3-none-any.whl", hash = "sha256:a427c920f727711877ab17c11b95f1230b27767ba7a01e5b66102945141ceca9"},
|
||||
{file = "openai-1.66.3.tar.gz", hash = "sha256:8dde3aebe2d081258d4159c4cb27bdc13b5bb3f7ea2201d9bd940b9a89faf0c9"},
|
||||
{file = "openai-1.68.2-py3-none-any.whl", hash = "sha256:24484cb5c9a33b58576fdc5acf0e5f92603024a4e39d0b99793dfa1eb14c2b36"},
|
||||
{file = "openai-1.68.2.tar.gz", hash = "sha256:b720f0a95a1dbe1429c0d9bb62096a0d98057bcda82516f6e8af10284bdd5b19"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -2160,6 +2160,7 @@ typing-extensions = ">=4.11,<5"
|
|||
[package.extras]
|
||||
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
|
||||
realtime = ["websockets (>=13,<15)"]
|
||||
voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]
|
||||
|
||||
[[package]]
|
||||
name = "orjson"
|
||||
|
@ -2477,24 +2478,24 @@ testing = ["google-api-core (>=1.31.5)"]
|
|||
|
||||
[[package]]
|
||||
name = "protobuf"
|
||||
version = "5.29.3"
|
||||
version = "5.29.4"
|
||||
description = ""
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
markers = "extra == \"extra-proxy\""
|
||||
files = [
|
||||
{file = "protobuf-5.29.3-cp310-abi3-win32.whl", hash = "sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888"},
|
||||
{file = "protobuf-5.29.3-cp310-abi3-win_amd64.whl", hash = "sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a"},
|
||||
{file = "protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a8434404bbf139aa9e1300dbf989667a83d42ddda9153d8ab76e0d5dcaca484e"},
|
||||
{file = "protobuf-5.29.3-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:daaf63f70f25e8689c072cfad4334ca0ac1d1e05a92fc15c54eb9cf23c3efd84"},
|
||||
{file = "protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:c027e08a08be10b67c06bf2370b99c811c466398c357e615ca88c91c07f0910f"},
|
||||
{file = "protobuf-5.29.3-cp38-cp38-win32.whl", hash = "sha256:84a57163a0ccef3f96e4b6a20516cedcf5bb3a95a657131c5c3ac62200d23252"},
|
||||
{file = "protobuf-5.29.3-cp38-cp38-win_amd64.whl", hash = "sha256:b89c115d877892a512f79a8114564fb435943b59067615894c3b13cd3e1fa107"},
|
||||
{file = "protobuf-5.29.3-cp39-cp39-win32.whl", hash = "sha256:0eb32bfa5219fc8d4111803e9a690658aa2e6366384fd0851064b963b6d1f2a7"},
|
||||
{file = "protobuf-5.29.3-cp39-cp39-win_amd64.whl", hash = "sha256:6ce8cc3389a20693bfde6c6562e03474c40851b44975c9b2bf6df7d8c4f864da"},
|
||||
{file = "protobuf-5.29.3-py3-none-any.whl", hash = "sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f"},
|
||||
{file = "protobuf-5.29.3.tar.gz", hash = "sha256:5da0f41edaf117bde316404bad1a486cb4ededf8e4a54891296f648e8e076620"},
|
||||
{file = "protobuf-5.29.4-cp310-abi3-win32.whl", hash = "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7"},
|
||||
{file = "protobuf-5.29.4-cp310-abi3-win_amd64.whl", hash = "sha256:bcefcdf3976233f8a502d265eb65ea740c989bacc6c30a58290ed0e519eb4b8d"},
|
||||
{file = "protobuf-5.29.4-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:307ecba1d852ec237e9ba668e087326a67564ef83e45a0189a772ede9e854dd0"},
|
||||
{file = "protobuf-5.29.4-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:aec4962f9ea93c431d5714ed1be1c93f13e1a8618e70035ba2b0564d9e633f2e"},
|
||||
{file = "protobuf-5.29.4-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:d7d3f7d1d5a66ed4942d4fefb12ac4b14a29028b209d4bfb25c68ae172059922"},
|
||||
{file = "protobuf-5.29.4-cp38-cp38-win32.whl", hash = "sha256:1832f0515b62d12d8e6ffc078d7e9eb06969aa6dc13c13e1036e39d73bebc2de"},
|
||||
{file = "protobuf-5.29.4-cp38-cp38-win_amd64.whl", hash = "sha256:476cb7b14914c780605a8cf62e38c2a85f8caff2e28a6a0bad827ec7d6c85d68"},
|
||||
{file = "protobuf-5.29.4-cp39-cp39-win32.whl", hash = "sha256:fd32223020cb25a2cc100366f1dedc904e2d71d9322403224cdde5fdced0dabe"},
|
||||
{file = "protobuf-5.29.4-cp39-cp39-win_amd64.whl", hash = "sha256:678974e1e3a9b975b8bc2447fca458db5f93a2fb6b0c8db46b6675b5b5346812"},
|
||||
{file = "protobuf-5.29.4-py3-none-any.whl", hash = "sha256:3fde11b505e1597f71b875ef2fc52062b6a9740e5f7c8997ce878b6009145862"},
|
||||
{file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -2809,6 +2810,25 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
|
|||
[package.extras]
|
||||
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-asyncio"
|
||||
version = "0.21.2"
|
||||
description = "Pytest support for asyncio"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "pytest_asyncio-0.21.2-py3-none-any.whl", hash = "sha256:ab664c88bb7998f711d8039cacd4884da6430886ae8bbd4eded552ed2004f16b"},
|
||||
{file = "pytest_asyncio-0.21.2.tar.gz", hash = "sha256:d67738fc232b94b326b9d060750beb16e0074210b98dd8b58a5239fa2a154f45"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pytest = ">=7.0.0"
|
||||
|
||||
[package.extras]
|
||||
docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"]
|
||||
testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-mock"
|
||||
version = "3.14.0"
|
||||
|
@ -3279,15 +3299,15 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "rq"
|
||||
version = "2.1.0"
|
||||
version = "2.2.0"
|
||||
description = "RQ is a simple, lightweight, library for creating background jobs, and processing them."
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
markers = "extra == \"proxy\""
|
||||
files = [
|
||||
{file = "rq-2.1.0-py3-none-any.whl", hash = "sha256:3c6892c6ca848e5fb47c1875399a66f13656bf0e123bf725d9aa9a12718e2fdf"},
|
||||
{file = "rq-2.1.0.tar.gz", hash = "sha256:764585b6cab69ef1412f4aee523347e5aa7ece3ca175c118b1d92223dd8c2826"},
|
||||
{file = "rq-2.2.0-py3-none-any.whl", hash = "sha256:dacbfe1ccb79a45c8cd95dec7951620679fa0195570b63da3f9347622d33accc"},
|
||||
{file = "rq-2.2.0.tar.gz", hash = "sha256:b636760f1e4c183022031c142faa0483e687885824e9732ba2953f994104e203"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -3606,15 +3626,15 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "tzdata"
|
||||
version = "2025.1"
|
||||
version = "2025.2"
|
||||
description = "Provider of IANA time zone data"
|
||||
optional = true
|
||||
python-versions = ">=2"
|
||||
groups = ["main"]
|
||||
markers = "extra == \"proxy\" and platform_system == \"Windows\""
|
||||
files = [
|
||||
{file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"},
|
||||
{file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"},
|
||||
{file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"},
|
||||
{file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -3985,4 +4005,4 @@ proxy = ["PyJWT", "apscheduler", "backoff", "boto3", "cryptography", "fastapi",
|
|||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.8.1,<4.0, !=3.9.7"
|
||||
content-hash = "f7c21b3d659e4a15cd46bb42fb905ad039028f4f6b82507fd1278ac05c412569"
|
||||
content-hash = "9c863b11189227a035a9130c8872de44fe7c5e1e32b47569a56af86e3f6570c5"
|
||||
|
|
|
@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.63.14"
version = "1.64.0"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -98,13 +98,14 @@ black = "^23.12.0"
mypy = "^1.0"
pytest = "^7.4.3"
pytest-mock = "^3.12.0"
pytest-asyncio = "^0.21.1"

[build-system]
requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"

[tool.commitizen]
version = "1.63.14"
version = "1.64.0"
version_files = [
    "pyproject.toml:^version"
]
@ -9,8 +9,8 @@ uvicorn==0.29.0 # server dep
gunicorn==23.0.0 # server dep
uvloop==0.21.0 # uvicorn dep, gives us much better performance under load
boto3==1.34.34 # aws bedrock/sagemaker calls
redis==5.0.0 # caching
numpy==2.1.1 # semantic caching
redis==5.2.1 # redis caching
redisvl==0.4.1 # semantic caching
prisma==0.11.0 # for db
mangum==0.17.0 # for aws lambda functions
pynacl==1.5.0 # for encrypting keys
|
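These pins back the redis-semantic cache path exercised later in this diff. A short sketch of how that cache is wired up, mirroring the tests below (the environment variable names and the similarity_threshold come from those tests; treat them as placeholders for your own setup):

import os
import litellm

# route completions through the redisvl-backed semantic cache
litellm.cache = litellm.Cache(
    type="redis-semantic",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
    similarity_threshold=0.8,  # prompts at least this similar are treated as cache hits
)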
|
|
@ -1,13 +1,8 @@
import asyncio
import json
import os
import sys
import time
from unittest.mock import MagicMock, patch

import httpx
import pytest
import respx
from fastapi.testclient import TestClient

sys.path.insert(
@ -18,9 +13,18 @@ from unittest.mock import AsyncMock
from litellm.caching.redis_cache import RedisCache


@pytest.fixture
def redis_no_ping():
    """Patch RedisCache initialization to prevent async ping tasks from being created"""
    with patch('asyncio.get_running_loop') as mock_get_loop:
        # Either raise an exception or return a mock that will handle the task creation
        mock_get_loop.side_effect = RuntimeError("No running event loop")
        yield


@pytest.mark.parametrize("namespace", [None, "test"])
@pytest.mark.asyncio
async def test_redis_cache_async_increment(namespace, monkeypatch):
async def test_redis_cache_async_increment(namespace, monkeypatch, redis_no_ping):
    monkeypatch.setenv("REDIS_HOST", "https://my-test-host")
    redis_cache = RedisCache(namespace=namespace)
    # Create an AsyncMock for the Redis client
@ -47,10 +51,46 @@ async def test_redis_cache_async_increment(namespace, monkeypatch):


@pytest.mark.asyncio
async def test_redis_client_init_with_socket_timeout(monkeypatch):
async def test_redis_client_init_with_socket_timeout(monkeypatch, redis_no_ping):
    monkeypatch.setenv("REDIS_HOST", "my-fake-host")
    redis_cache = RedisCache(socket_timeout=1.0)
    assert redis_cache.redis_kwargs["socket_timeout"] == 1.0
    client = redis_cache.init_async_client()
    assert client is not None
    assert client.connection_pool.connection_kwargs["socket_timeout"] == 1.0


@pytest.mark.asyncio
async def test_redis_cache_async_batch_get_cache(monkeypatch, redis_no_ping):
    monkeypatch.setenv("REDIS_HOST", "https://my-test-host")
    redis_cache = RedisCache()

    # Create an AsyncMock for the Redis client
    mock_redis_instance = AsyncMock()

    # Make sure the mock can be used as an async context manager
    mock_redis_instance.__aenter__.return_value = mock_redis_instance
    mock_redis_instance.__aexit__.return_value = None

    # Setup the return value for mget
    mock_redis_instance.mget.return_value = [
        b'{"key1": "value1"}',
        None,
        b'{"key3": "value3"}'
    ]

    test_keys = ["key1", "key2", "key3"]

    with patch.object(
        redis_cache, "init_async_client", return_value=mock_redis_instance
    ):
        # Call async_batch_get_cache
        result = await redis_cache.async_batch_get_cache(key_list=test_keys)

        # Verify mget was called with the correct keys
        mock_redis_instance.mget.assert_called_once()

        # Check that results were properly decoded
        assert result["key1"] == {"key1": "value1"}
        assert result["key2"] is None
        assert result["key3"] == {"key3": "value3"}
tests/litellm/caching/test_redis_semantic_cache.py (new file, 130 lines)
|
@ -0,0 +1,130 @@
|
|||
import os
|
||||
import sys
|
||||
from unittest.mock import MagicMock, patch, AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../../..")
|
||||
) # Adds the parent directory to the system path
|
||||
|
||||
|
||||
# Tests for RedisSemanticCache
|
||||
def test_redis_semantic_cache_initialization(monkeypatch):
|
||||
# Mock the redisvl import
|
||||
semantic_cache_mock = MagicMock()
|
||||
with patch.dict("sys.modules", {
|
||||
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
|
||||
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=MagicMock())
|
||||
}):
|
||||
from litellm.caching.redis_semantic_cache import RedisSemanticCache
|
||||
|
||||
# Set environment variables
|
||||
monkeypatch.setenv("REDIS_HOST", "localhost")
|
||||
monkeypatch.setenv("REDIS_PORT", "6379")
|
||||
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
|
||||
|
||||
# Initialize the cache with a similarity threshold
|
||||
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
|
||||
|
||||
# Verify the semantic cache was initialized with correct parameters
|
||||
assert redis_semantic_cache.similarity_threshold == 0.8
|
||||
|
||||
# Use pytest.approx for floating point comparison to handle precision issues
|
||||
assert redis_semantic_cache.distance_threshold == pytest.approx(0.2, abs=1e-10)
|
||||
assert redis_semantic_cache.embedding_model == "text-embedding-ada-002"
|
||||
|
||||
# Test initialization with missing similarity_threshold
|
||||
with pytest.raises(ValueError, match="similarity_threshold must be provided"):
|
||||
RedisSemanticCache()
|
||||
|
||||
|
||||
def test_redis_semantic_cache_get_cache(monkeypatch):
|
||||
# Mock the redisvl import and embedding function
|
||||
semantic_cache_mock = MagicMock()
|
||||
custom_vectorizer_mock = MagicMock()
|
||||
|
||||
with patch.dict("sys.modules", {
|
||||
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
|
||||
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=custom_vectorizer_mock)
|
||||
}):
|
||||
from litellm.caching.redis_semantic_cache import RedisSemanticCache
|
||||
|
||||
# Set environment variables
|
||||
monkeypatch.setenv("REDIS_HOST", "localhost")
|
||||
monkeypatch.setenv("REDIS_PORT", "6379")
|
||||
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
|
||||
|
||||
# Initialize cache
|
||||
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
|
||||
|
||||
# Mock the llmcache.check method to return a result
|
||||
mock_result = [
|
||||
{
|
||||
"prompt": "What is the capital of France?",
|
||||
"response": '{"content": "Paris is the capital of France."}',
|
||||
"vector_distance": 0.1 # Distance of 0.1 means similarity of 0.9
|
||||
}
|
||||
]
|
||||
redis_semantic_cache.llmcache.check = MagicMock(return_value=mock_result)
|
||||
|
||||
# Mock the embedding function
|
||||
with patch("litellm.embedding", return_value={"data": [{"embedding": [0.1, 0.2, 0.3]}]}):
|
||||
# Test get_cache with a message
|
||||
result = redis_semantic_cache.get_cache(
|
||||
key="test_key",
|
||||
messages=[{"content": "What is the capital of France?"}]
|
||||
)
|
||||
|
||||
# Verify result is properly parsed
|
||||
assert result == {"content": "Paris is the capital of France."}
|
||||
|
||||
# Verify llmcache.check was called
|
||||
redis_semantic_cache.llmcache.check.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_redis_semantic_cache_async_get_cache(monkeypatch):
|
||||
# Mock the redisvl import
|
||||
semantic_cache_mock = MagicMock()
|
||||
custom_vectorizer_mock = MagicMock()
|
||||
|
||||
with patch.dict("sys.modules", {
|
||||
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
|
||||
"redisvl.utils.vectorize": MagicMock(CustomTextVectorizer=custom_vectorizer_mock)
|
||||
}):
|
||||
from litellm.caching.redis_semantic_cache import RedisSemanticCache
|
||||
|
||||
# Set environment variables
|
||||
monkeypatch.setenv("REDIS_HOST", "localhost")
|
||||
monkeypatch.setenv("REDIS_PORT", "6379")
|
||||
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
|
||||
|
||||
# Initialize cache
|
||||
redis_semantic_cache = RedisSemanticCache(similarity_threshold=0.8)
|
||||
|
||||
# Mock the async methods
|
||||
mock_result = [
|
||||
{
|
||||
"prompt": "What is the capital of France?",
|
||||
"response": '{"content": "Paris is the capital of France."}',
|
||||
"vector_distance": 0.1 # Distance of 0.1 means similarity of 0.9
|
||||
}
|
||||
]
|
||||
|
||||
redis_semantic_cache.llmcache.acheck = AsyncMock(return_value=mock_result)
|
||||
redis_semantic_cache._get_async_embedding = AsyncMock(return_value=[0.1, 0.2, 0.3])
|
||||
|
||||
# Test async_get_cache with a message
|
||||
result = await redis_semantic_cache.async_get_cache(
|
||||
key="test_key",
|
||||
messages=[{"content": "What is the capital of France?"}],
|
||||
metadata={}
|
||||
)
|
||||
|
||||
# Verify result is properly parsed
|
||||
assert result == {"content": "Paris is the capital of France."}
|
||||
|
||||
# Verify methods were called
|
||||
redis_semantic_cache._get_async_embedding.assert_called_once()
|
||||
redis_semantic_cache.llmcache.acheck.assert_called_once()
|
|
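The conversion these semantic-cache tests rely on is simply distance = 1 - similarity, so a similarity_threshold of 0.8 becomes a distance_threshold of 0.2, and a returned vector_distance of 0.1 corresponds to a similarity of 0.9. A standalone illustration (not litellm code, just the arithmetic the assertions above assume):

def similarity_to_distance(similarity_threshold: float) -> float:
    # redisvl filters on vector distance, while callers configure similarity
    return 1.0 - similarity_threshold

# compare with a tolerance; 1.0 - 0.8 is not exactly 0.2 in floating point,
# which is why the tests above use pytest.approx
assert abs(similarity_to_distance(0.8) - 0.2) < 1e-9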
@ -0,0 +1,113 @@
|
|||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
import litellm
|
||||
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
|
||||
StandardBuiltInToolCostTracking,
|
||||
)
|
||||
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
|
||||
from litellm.types.utils import ModelInfo, ModelResponse, StandardBuiltInToolsParams
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../../..")
|
||||
) # Adds the parent directory to the system path
|
||||
|
||||
|
||||
# Test basic web search cost calculations
|
||||
def test_web_search_cost_low():
|
||||
web_search_options = WebSearchOptions(search_context_size="low")
|
||||
model_info = litellm.get_model_info("gpt-4o-search-preview")
|
||||
|
||||
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
|
||||
web_search_options=web_search_options, model_info=model_info
|
||||
)
|
||||
|
||||
assert (
|
||||
cost == model_info["search_context_cost_per_query"]["search_context_size_low"]
|
||||
)
|
||||
|
||||
|
||||
def test_web_search_cost_medium():
|
||||
web_search_options = WebSearchOptions(search_context_size="medium")
|
||||
model_info = litellm.get_model_info("gpt-4o-search-preview")
|
||||
|
||||
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
|
||||
web_search_options=web_search_options, model_info=model_info
|
||||
)
|
||||
|
||||
assert (
|
||||
cost
|
||||
== model_info["search_context_cost_per_query"]["search_context_size_medium"]
|
||||
)
|
||||
|
||||
|
||||
def test_web_search_cost_high():
|
||||
web_search_options = WebSearchOptions(search_context_size="high")
|
||||
model_info = litellm.get_model_info("gpt-4o-search-preview")
|
||||
|
||||
cost = StandardBuiltInToolCostTracking.get_cost_for_web_search(
|
||||
web_search_options=web_search_options, model_info=model_info
|
||||
)
|
||||
|
||||
assert (
|
||||
cost == model_info["search_context_cost_per_query"]["search_context_size_high"]
|
||||
)
|
||||
|
||||
|
||||
# Test file search cost calculation
|
||||
def test_file_search_cost():
|
||||
file_search = FileSearchTool(type="file_search")
|
||||
cost = StandardBuiltInToolCostTracking.get_cost_for_file_search(
|
||||
file_search=file_search
|
||||
)
|
||||
assert cost == 0.0025 # $2.50/1000 calls = 0.0025 per call
|
||||
|
||||
|
||||
# Test edge cases
|
||||
def test_none_inputs():
|
||||
# Test with None inputs
|
||||
assert (
|
||||
StandardBuiltInToolCostTracking.get_cost_for_web_search(
|
||||
web_search_options=None, model_info=None
|
||||
)
|
||||
== 0.0
|
||||
)
|
||||
assert (
|
||||
StandardBuiltInToolCostTracking.get_cost_for_file_search(file_search=None)
|
||||
== 0.0
|
||||
)
|
||||
|
||||
|
||||
# Test the main get_cost_for_built_in_tools method
|
||||
def test_get_cost_for_built_in_tools_web_search():
|
||||
model = "gpt-4"
|
||||
standard_built_in_tools_params = StandardBuiltInToolsParams(
|
||||
web_search_options=WebSearchOptions(search_context_size="medium")
|
||||
)
|
||||
|
||||
cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
|
||||
model=model,
|
||||
response_object=None,
|
||||
standard_built_in_tools_params=standard_built_in_tools_params,
|
||||
)
|
||||
|
||||
assert isinstance(cost, float)
|
||||
|
||||
|
||||
def test_get_cost_for_built_in_tools_file_search():
|
||||
model = "gpt-4"
|
||||
standard_built_in_tools_params = StandardBuiltInToolsParams(
|
||||
file_search=FileSearchTool(type="file_search")
|
||||
)
|
||||
|
||||
cost = StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
|
||||
model=model,
|
||||
response_object=None,
|
||||
standard_built_in_tools_params=standard_built_in_tools_params,
|
||||
)
|
||||
|
||||
assert cost == 0.0025
|
tests/litellm/litellm_core_utils/test_litellm_logging.py (new file, 34 lines)
|
@ -0,0 +1,34 @@
|
|||
import json
|
||||
import os
|
||||
import sys
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../../..")
|
||||
) # Adds the parent directory to the system path
|
||||
|
||||
import time
|
||||
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging as LitellmLogging
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def logging_obj():
|
||||
return LitellmLogging(
|
||||
model="bedrock/claude-3-5-sonnet-20240620-v1:0",
|
||||
messages=[{"role": "user", "content": "Hey"}],
|
||||
stream=True,
|
||||
call_type="completion",
|
||||
start_time=time.time(),
|
||||
litellm_call_id="12345",
|
||||
function_id="1245",
|
||||
)
|
||||
|
||||
|
||||
def test_get_masked_api_base(logging_obj):
|
||||
api_base = "https://api.openai.com/v1"
|
||||
masked_api_base = logging_obj._get_masked_api_base(api_base)
|
||||
assert masked_api_base == "https://api.openai.com/v1"
|
||||
assert type(masked_api_base) == str
|
|
@ -1,3 +1,4 @@
|
|||
import asyncio
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
|
@ -11,7 +12,13 @@ sys.path.insert(
|
|||
0, os.path.abspath("../../../..")
|
||||
) # Adds the parent directory to the system path
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import litellm
|
||||
from litellm.proxy._types import SpendLogsPayload
|
||||
from litellm.proxy.hooks.proxy_track_cost_callback import _ProxyDBLogger
|
||||
from litellm.proxy.proxy_server import app, prisma_client
|
||||
from litellm.router import Router
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -400,3 +407,270 @@ async def test_ui_view_spend_logs_unauthorized(client):
|
|||
headers={"Authorization": "Bearer invalid-token"},
|
||||
)
|
||||
assert response.status_code == 401 or response.status_code == 403
|
||||
|
||||
|
||||
class TestSpendLogsPayload:
|
||||
@pytest.mark.asyncio
|
||||
async def test_spend_logs_payload_e2e(self):
|
||||
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
|
||||
# litellm._turn_on_debug()
|
||||
|
||||
with patch.object(
|
||||
litellm.proxy.proxy_server, "_set_spend_logs_payload"
|
||||
) as mock_client, patch.object(litellm.proxy.proxy_server, "prisma_client"):
|
||||
response = await litellm.acompletion(
|
||||
model="gpt-4o",
|
||||
messages=[{"role": "user", "content": "Hello, world!"}],
|
||||
mock_response="Hello, world!",
|
||||
metadata={"user_api_key_end_user_id": "test_user_1"},
|
||||
)
|
||||
|
||||
assert response.choices[0].message.content == "Hello, world!"
|
||||
|
||||
await asyncio.sleep(1)
|
||||
|
||||
mock_client.assert_called_once()
|
||||
|
||||
kwargs = mock_client.call_args.kwargs
|
||||
payload: SpendLogsPayload = kwargs["payload"]
|
||||
expected_payload = SpendLogsPayload(
|
||||
**{
|
||||
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
|
||||
"call_type": "acompletion",
|
||||
"api_key": "",
|
||||
"cache_hit": "None",
|
||||
"startTime": datetime.datetime(
|
||||
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
|
||||
),
|
||||
"endTime": datetime.datetime(
|
||||
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
|
||||
),
|
||||
"completionStartTime": datetime.datetime(
|
||||
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
|
||||
),
|
||||
"model": "gpt-4o",
|
||||
"user": "",
|
||||
"team_id": "",
|
||||
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": null}}',
|
||||
"cache_key": "Cache OFF",
|
||||
"spend": 0.00022500000000000002,
|
||||
"total_tokens": 30,
|
||||
"prompt_tokens": 10,
|
||||
"completion_tokens": 20,
|
||||
"request_tags": "[]",
|
||||
"end_user": "test_user_1",
|
||||
"api_base": "",
|
||||
"model_group": "",
|
||||
"model_id": "",
|
||||
"requester_ip_address": None,
|
||||
"custom_llm_provider": "openai",
|
||||
"messages": "{}",
|
||||
"response": "{}",
|
||||
}
|
||||
)
|
||||
|
||||
for key, value in expected_payload.items():
|
||||
if key in [
|
||||
"request_id",
|
||||
"startTime",
|
||||
"endTime",
|
||||
"completionStartTime",
|
||||
"endTime",
|
||||
]:
|
||||
assert payload[key] is not None
|
||||
else:
|
||||
assert (
|
||||
payload[key] == value
|
||||
), f"Expected {key} to be {value}, but got {payload[key]}"
|
||||
|
||||
def mock_anthropic_response(*args, **kwargs):
|
||||
mock_response = MagicMock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.headers = {"Content-Type": "application/json"}
|
||||
mock_response.json.return_value = {
|
||||
"content": [{"text": "Hi! My name is Claude.", "type": "text"}],
|
||||
"id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
|
||||
"model": "claude-3-7-sonnet-20250219",
|
||||
"role": "assistant",
|
||||
"stop_reason": "end_turn",
|
||||
"stop_sequence": None,
|
||||
"type": "message",
|
||||
"usage": {"input_tokens": 2095, "output_tokens": 503},
|
||||
}
|
||||
return mock_response
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_spend_logs_payload_success_log_with_api_base(self):
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
|
||||
|
||||
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
|
||||
# litellm._turn_on_debug()
|
||||
|
||||
client = AsyncHTTPHandler()
|
||||
|
||||
with patch.object(
|
||||
litellm.proxy.proxy_server, "_set_spend_logs_payload"
|
||||
) as mock_client, patch.object(
|
||||
litellm.proxy.proxy_server, "prisma_client"
|
||||
), patch.object(
|
||||
client, "post", side_effect=self.mock_anthropic_response
|
||||
):
|
||||
response = await litellm.acompletion(
|
||||
model="claude-3-7-sonnet-20250219",
|
||||
messages=[{"role": "user", "content": "Hello, world!"}],
|
||||
metadata={"user_api_key_end_user_id": "test_user_1"},
|
||||
client=client,
|
||||
)
|
||||
|
||||
assert response.choices[0].message.content == "Hi! My name is Claude."
|
||||
|
||||
await asyncio.sleep(1)
|
||||
|
||||
mock_client.assert_called_once()
|
||||
|
||||
kwargs = mock_client.call_args.kwargs
|
||||
payload: SpendLogsPayload = kwargs["payload"]
|
||||
expected_payload = SpendLogsPayload(
|
||||
**{
|
||||
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
|
||||
"call_type": "acompletion",
|
||||
"api_key": "",
|
||||
"cache_hit": "None",
|
||||
"startTime": datetime.datetime(
|
||||
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
|
||||
),
|
||||
"endTime": datetime.datetime(
|
||||
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
|
||||
),
|
||||
"completionStartTime": datetime.datetime(
|
||||
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
|
||||
),
|
||||
"model": "claude-3-7-sonnet-20250219",
|
||||
"user": "",
|
||||
"team_id": "",
|
||||
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": {"audio_tokens": null, "cached_tokens": 0, "text_tokens": null, "image_tokens": null}, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}',
|
||||
"cache_key": "Cache OFF",
|
||||
"spend": 0.01383,
|
||||
"total_tokens": 2598,
|
||||
"prompt_tokens": 2095,
|
||||
"completion_tokens": 503,
|
||||
"request_tags": "[]",
|
||||
"end_user": "test_user_1",
|
||||
"api_base": "https://api.anthropic.com/v1/messages",
|
||||
"model_group": "",
|
||||
"model_id": "",
|
||||
"requester_ip_address": None,
|
||||
"custom_llm_provider": "anthropic",
|
||||
"messages": "{}",
|
||||
"response": "{}",
|
||||
}
|
||||
)
|
||||
|
||||
for key, value in expected_payload.items():
|
||||
if key in [
|
||||
"request_id",
|
||||
"startTime",
|
||||
"endTime",
|
||||
"completionStartTime",
|
||||
"endTime",
|
||||
]:
|
||||
assert payload[key] is not None
|
||||
else:
|
||||
assert (
|
||||
payload[key] == value
|
||||
), f"Expected {key} to be {value}, but got {payload[key]}"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_spend_logs_payload_success_log_with_router(self):
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
|
||||
|
||||
litellm.callbacks = [_ProxyDBLogger(message_logging=False)]
|
||||
# litellm._turn_on_debug()
|
||||
|
||||
client = AsyncHTTPHandler()
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "my-anthropic-model-group",
|
||||
"litellm_params": {
|
||||
"model": "claude-3-7-sonnet-20250219",
|
||||
},
|
||||
"model_info": {
|
||||
"id": "my-unique-model-id",
|
||||
},
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
with patch.object(
|
||||
litellm.proxy.proxy_server, "_set_spend_logs_payload"
|
||||
) as mock_client, patch.object(
|
||||
litellm.proxy.proxy_server, "prisma_client"
|
||||
), patch.object(
|
||||
client, "post", side_effect=self.mock_anthropic_response
|
||||
):
|
||||
response = await router.acompletion(
|
||||
model="my-anthropic-model-group",
|
||||
messages=[{"role": "user", "content": "Hello, world!"}],
|
||||
metadata={"user_api_key_end_user_id": "test_user_1"},
|
||||
client=client,
|
||||
)
|
||||
|
||||
assert response.choices[0].message.content == "Hi! My name is Claude."
|
||||
|
||||
await asyncio.sleep(1)
|
||||
|
||||
mock_client.assert_called_once()
|
||||
|
||||
kwargs = mock_client.call_args.kwargs
|
||||
payload: SpendLogsPayload = kwargs["payload"]
|
||||
expected_payload = SpendLogsPayload(
|
||||
**{
|
||||
"request_id": "chatcmpl-34df56d5-4807-45c1-bb99-61e52586b802",
|
||||
"call_type": "acompletion",
|
||||
"api_key": "",
|
||||
"cache_hit": "None",
|
||||
"startTime": datetime.datetime(
|
||||
2025, 3, 24, 22, 2, 42, 975883, tzinfo=datetime.timezone.utc
|
||||
),
|
||||
"endTime": datetime.datetime(
|
||||
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
|
||||
),
|
||||
"completionStartTime": datetime.datetime(
|
||||
2025, 3, 24, 22, 2, 42, 989132, tzinfo=datetime.timezone.utc
|
||||
),
|
||||
"model": "claude-3-7-sonnet-20250219",
|
||||
"user": "",
|
||||
"team_id": "",
|
||||
"metadata": '{"applied_guardrails": [], "batch_models": null, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": {"audio_tokens": null, "cached_tokens": 0, "text_tokens": null, "image_tokens": null}, "cache_creation_input_tokens": 0, "cache_read_input_tokens": 0}}',
|
||||
"cache_key": "Cache OFF",
|
||||
"spend": 0.01383,
|
||||
"total_tokens": 2598,
|
||||
"prompt_tokens": 2095,
|
||||
"completion_tokens": 503,
|
||||
"request_tags": "[]",
|
||||
"end_user": "test_user_1",
|
||||
"api_base": "https://api.anthropic.com/v1/messages",
|
||||
"model_group": "my-anthropic-model-group",
|
||||
"model_id": "my-unique-model-id",
|
||||
"requester_ip_address": None,
|
||||
"custom_llm_provider": "anthropic",
|
||||
"messages": "{}",
|
||||
"response": "{}",
|
||||
}
|
||||
)
|
||||
|
||||
for key, value in expected_payload.items():
|
||||
if key in [
|
||||
"request_id",
|
||||
"startTime",
|
||||
"endTime",
|
||||
"completionStartTime",
|
||||
"endTime",
|
||||
]:
|
||||
assert payload[key] is not None
|
||||
else:
|
||||
assert (
|
||||
payload[key] == value
|
||||
), f"Expected {key} to be {value}, but got {payload[key]}"
|
||||
|
|
|
@ -477,6 +477,25 @@ def test_supports_function_calling(model, expected_bool):
        pytest.fail(f"Error occurred: {e}")


@pytest.mark.parametrize(
    "model, expected_bool",
    [
        ("gpt-4o-mini-search-preview", True),
        ("openai/gpt-4o-mini-search-preview", True),
        ("gpt-4o-search-preview", True),
        ("openai/gpt-4o-search-preview", True),
        ("groq/deepseek-r1-distill-llama-70b", False),
        ("groq/llama-3.3-70b-versatile", False),
        ("codestral/codestral-latest", False),
    ],
)
def test_supports_web_search(model, expected_bool):
    try:
        assert litellm.supports_web_search(model=model) == expected_bool
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


def test_get_max_token_unit_test():
    """
    More complete testing in `test_completion_cost.py`
|
@ -794,7 +794,7 @@ def test_redis_cache_completion():
    response3 = completion(
        model="gpt-3.5-turbo", messages=messages, caching=True, temperature=0.5
    )
    response4 = completion(model="azure/chatgpt-v-2", messages=messages, caching=True)
    response4 = completion(model="gpt-4o-mini", messages=messages, caching=True)

    print("\nresponse 1", response1)
    print("\nresponse 2", response2)
@ -1690,20 +1690,12 @@ def test_cache_context_managers():
|
|||
print("VARS of litellm.cache", vars(litellm.cache))
|
||||
|
||||
|
||||
# test_cache_context_managers()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="beta test - new redis semantic cache")
|
||||
def test_redis_semantic_cache_completion():
|
||||
litellm.set_verbose = True
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
random_number = random.randint(
|
||||
1, 100000
|
||||
) # add a random number to ensure it's always adding /reading from cache
|
||||
|
||||
print("testing semantic caching")
|
||||
litellm.cache = Cache(
|
||||
type="redis-semantic",
|
||||
|
@ -1718,33 +1710,30 @@ def test_redis_semantic_cache_completion():
|
|||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"write a one sentence poem about: {random_number}",
|
||||
"content": "write a one sentence poem about summer",
|
||||
}
|
||||
],
|
||||
max_tokens=20,
|
||||
)
|
||||
print(f"response1: {response1}")
|
||||
|
||||
random_number = random.randint(1, 100000)
|
||||
|
||||
response2 = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"write a one sentence poem about: {random_number}",
|
||||
"content": "write a one sentence poem about summertime",
|
||||
}
|
||||
],
|
||||
max_tokens=20,
|
||||
)
|
||||
print(f"response2: {response1}")
|
||||
print(f"response2: {response2}")
|
||||
assert response1.id == response2.id
|
||||
|
||||
|
||||
# test_redis_cache_completion()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="beta test - new redis semantic cache")
|
||||
@pytest.mark.asyncio
|
||||
async def test_redis_semantic_cache_acompletion():
|
||||
litellm.set_verbose = True
|
||||
|
@ -1752,38 +1741,32 @@ async def test_redis_semantic_cache_acompletion():
|
|||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
random_number = random.randint(
|
||||
1, 100000
|
||||
) # add a random number to ensure it's always adding / reading from cache
|
||||
|
||||
print("testing semantic caching")
|
||||
litellm.cache = Cache(
|
||||
type="redis-semantic",
|
||||
host=os.environ["REDIS_HOST"],
|
||||
port=os.environ["REDIS_PORT"],
|
||||
password=os.environ["REDIS_PASSWORD"],
|
||||
similarity_threshold=0.8,
|
||||
redis_semantic_cache_use_async=True,
|
||||
similarity_threshold=0.7,
|
||||
)
|
||||
response1 = await litellm.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"write a one sentence poem about: {random_number}",
|
||||
"content": "write a one sentence poem about summer",
|
||||
}
|
||||
],
|
||||
max_tokens=5,
|
||||
)
|
||||
print(f"response1: {response1}")
|
||||
|
||||
random_number = random.randint(1, 100000)
|
||||
response2 = await litellm.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"write a one sentence poem about: {random_number}",
|
||||
"content": "write a one sentence poem about summertime",
|
||||
}
|
||||
],
|
||||
max_tokens=5,
|
||||
|
|
|
@ -0,0 +1,175 @@
|
|||
{
|
||||
"id": "chatcmpl-2299b6a2-82a3-465a-b47c-04e685a2227f",
|
||||
"trace_id": null,
|
||||
"call_type": "acompletion",
|
||||
"cache_hit": null,
|
||||
"stream": true,
|
||||
"status": "success",
|
||||
"custom_llm_provider": "openai",
|
||||
"saved_cache_cost": 0.0,
|
||||
"startTime": "2025-01-24 09:20:46.847371",
|
||||
"endTime": "2025-01-24 09:20:46.851954",
|
||||
"completionStartTime": "2025-01-24 09:20:46.851954",
|
||||
"response_time": 0.007394075393676758,
|
||||
"model": "gpt-4o",
|
||||
"metadata": {
|
||||
"user_api_key_hash": null,
|
||||
"user_api_key_alias": null,
|
||||
"user_api_key_team_id": null,
|
||||
"user_api_key_org_id": null,
|
||||
"user_api_key_user_id": null,
|
||||
"user_api_key_team_alias": null,
|
||||
"user_api_key_user_email": null,
|
||||
"spend_logs_metadata": null,
|
||||
"requester_ip_address": null,
|
||||
"requester_metadata": null,
|
||||
"user_api_key_end_user_id": null,
|
||||
"prompt_management_metadata": null,
|
||||
"applied_guardrails": []
|
||||
},
|
||||
"cache_key": null,
|
||||
"response_cost": 0.00022500000000000002,
|
||||
"total_tokens": 30,
|
||||
"prompt_tokens": 10,
|
||||
"completion_tokens": 20,
|
||||
"request_tags": [],
|
||||
"end_user": "",
|
||||
"api_base": "",
|
||||
"model_group": "",
|
||||
"model_id": "",
|
||||
"requester_ip_address": null,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello, world!"
|
||||
}
|
||||
],
|
||||
"response": {
|
||||
"id": "chatcmpl-2299b6a2-82a3-465a-b47c-04e685a2227f",
|
||||
"created": 1742855151,
|
||||
"model": "gpt-4o",
|
||||
"object": "chat.completion",
|
||||
"system_fingerprint": null,
|
||||
"choices": [
|
||||
{
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"message": {
|
||||
"content": "hi",
|
||||
"role": "assistant",
|
||||
"tool_calls": null,
|
||||
"function_call": null
|
||||
}
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"completion_tokens": 20,
|
||||
"prompt_tokens": 10,
|
||||
"total_tokens": 30,
|
||||
"completion_tokens_details": null,
|
||||
"prompt_tokens_details": null
|
||||
}
|
||||
},
|
||||
"model_parameters": {},
|
||||
"hidden_params": {
|
||||
"model_id": null,
|
||||
"cache_key": null,
|
||||
"api_base": "https://api.openai.com",
|
||||
"response_cost": 0.00022500000000000002,
|
||||
"additional_headers": {},
|
||||
"litellm_overhead_time_ms": null,
|
||||
"batch_models": null,
|
||||
"litellm_model_name": "gpt-4o"
|
||||
},
|
||||
"model_map_information": {
|
||||
"model_map_key": "gpt-4o",
|
||||
"model_map_value": {
|
||||
"key": "gpt-4o",
|
||||
"max_tokens": 16384,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 16384,
|
||||
"input_cost_per_token": 2.5e-06,
|
||||
"cache_creation_input_token_cost": null,
|
||||
"cache_read_input_token_cost": 1.25e-06,
|
||||
"input_cost_per_character": null,
|
||||
"input_cost_per_token_above_128k_tokens": null,
|
||||
"input_cost_per_query": null,
|
||||
"input_cost_per_second": null,
|
||||
"input_cost_per_audio_token": null,
|
||||
"input_cost_per_token_batches": 1.25e-06,
|
||||
"output_cost_per_token_batches": 5e-06,
|
||||
"output_cost_per_token": 1e-05,
|
||||
"output_cost_per_audio_token": null,
|
||||
"output_cost_per_character": null,
|
||||
"output_cost_per_token_above_128k_tokens": null,
|
||||
"output_cost_per_character_above_128k_tokens": null,
|
||||
"output_cost_per_second": null,
|
||||
"output_cost_per_image": null,
|
||||
"output_vector_size": null,
|
||||
"litellm_provider": "openai",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true,
|
||||
"supports_assistant_prefill": false,
|
||||
"supports_prompt_caching": true,
|
||||
"supports_audio_input": false,
|
||||
"supports_audio_output": false,
|
||||
"supports_pdf_input": false,
|
||||
"supports_embedding_image_input": false,
|
||||
"supports_native_streaming": null,
|
||||
"supports_web_search": true,
|
||||
"search_context_cost_per_query": {
|
||||
"search_context_size_low": 0.03,
|
||||
"search_context_size_medium": 0.035,
|
||||
"search_context_size_high": 0.05
|
||||
},
|
||||
"tpm": null,
|
||||
"rpm": null,
|
||||
"supported_openai_params": [
|
||||
"frequency_penalty",
|
||||
"logit_bias",
|
||||
"logprobs",
|
||||
"top_logprobs",
|
||||
"max_tokens",
|
||||
"max_completion_tokens",
|
||||
"modalities",
|
||||
"prediction",
|
||||
"n",
|
||||
"presence_penalty",
|
||||
"seed",
|
||||
"stop",
|
||||
"stream",
|
||||
"stream_options",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
"function_call",
|
||||
"functions",
|
||||
"max_retries",
|
||||
"extra_headers",
|
||||
"parallel_tool_calls",
|
||||
"audio",
|
||||
"response_format",
|
||||
"user"
|
||||
]
|
||||
}
|
||||
},
|
||||
"error_str": null,
|
||||
"error_information": {
|
||||
"error_code": "",
|
||||
"error_class": "",
|
||||
"llm_provider": "",
|
||||
"traceback": "",
|
||||
"error_message": ""
|
||||
},
|
||||
"response_cost_failure_debug_info": null,
|
||||
"guardrail_information": null,
|
||||
"standard_built_in_tools_params": {
|
||||
"web_search_options": null,
|
||||
"file_search": null
|
||||
}
|
||||
}
|
|
@ -0,0 +1,151 @@
|
|||
import os
|
||||
import sys
|
||||
import traceback
|
||||
import uuid
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import Request
|
||||
from fastapi.routing import APIRoute
|
||||
|
||||
load_dotenv()
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
|
||||
# this file is to test litellm/proxy
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
import asyncio
|
||||
from typing import Optional
|
||||
from litellm.types.utils import StandardLoggingPayload, Usage, ModelInfoBase
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
|
||||
class TestCustomLogger(CustomLogger):
|
||||
def __init__(self):
|
||||
self.recorded_usage: Optional[Usage] = None
|
||||
self.standard_logging_payload: Optional[StandardLoggingPayload] = None
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
standard_logging_payload = kwargs.get("standard_logging_object")
|
||||
self.standard_logging_payload = standard_logging_payload
|
||||
print(
|
||||
"standard_logging_payload",
|
||||
json.dumps(standard_logging_payload, indent=4, default=str),
|
||||
)
|
||||
|
||||
self.recorded_usage = Usage(
|
||||
prompt_tokens=standard_logging_payload.get("prompt_tokens"),
|
||||
completion_tokens=standard_logging_payload.get("completion_tokens"),
|
||||
total_tokens=standard_logging_payload.get("total_tokens"),
|
||||
)
|
||||
pass
|
||||
|
||||
|
||||
async def _setup_web_search_test():
|
||||
"""Helper function to setup common test requirements"""
|
||||
litellm._turn_on_debug()
|
||||
test_custom_logger = TestCustomLogger()
|
||||
litellm.callbacks = [test_custom_logger]
|
||||
return test_custom_logger
|
||||
|
||||
|
||||
async def _verify_web_search_cost(test_custom_logger, expected_context_size):
|
||||
"""Helper function to verify web search costs"""
|
||||
await asyncio.sleep(1)
|
||||
|
||||
standard_logging_payload = test_custom_logger.standard_logging_payload
|
||||
response_cost = standard_logging_payload.get("response_cost")
|
||||
assert response_cost is not None
|
||||
|
||||
# Calculate token cost
|
||||
model_map_information = standard_logging_payload["model_map_information"]
|
||||
model_map_value: ModelInfoBase = model_map_information["model_map_value"]
|
||||
total_token_cost = (
|
||||
standard_logging_payload["prompt_tokens"]
|
||||
* model_map_value["input_cost_per_token"]
|
||||
) + (
|
||||
standard_logging_payload["completion_tokens"]
|
||||
* model_map_value["output_cost_per_token"]
|
||||
)
|
||||
|
||||
# Verify total cost
|
||||
assert (
|
||||
response_cost
|
||||
== total_token_cost
|
||||
+ model_map_value["search_context_cost_per_query"][expected_context_size]
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"web_search_options,expected_context_size",
|
||||
[
|
||||
(None, "search_context_size_medium"),
|
||||
({"search_context_size": "low"}, "search_context_size_low"),
|
||||
({"search_context_size": "high"}, "search_context_size_high"),
|
||||
],
|
||||
)
|
||||
async def test_openai_web_search_logging_cost_tracking(
|
||||
web_search_options, expected_context_size
|
||||
):
|
||||
"""Test web search cost tracking with different search context sizes"""
|
||||
test_custom_logger = await _setup_web_search_test()
|
||||
|
||||
request_kwargs = {
|
||||
"model": "openai/gpt-4o-search-preview",
|
||||
"messages": [
|
||||
{"role": "user", "content": "What was a positive news story from today?"}
|
||||
],
|
||||
}
|
||||
if web_search_options is not None:
|
||||
request_kwargs["web_search_options"] = web_search_options
|
||||
|
||||
response = await litellm.acompletion(**request_kwargs)
|
||||
|
||||
await _verify_web_search_cost(test_custom_logger, expected_context_size)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"tools_config,expected_context_size,stream",
|
||||
[
|
||||
(
|
||||
[{"type": "web_search_preview", "search_context_size": "low"}],
|
||||
"search_context_size_low",
|
||||
True,
|
||||
),
|
||||
(
|
||||
[{"type": "web_search_preview", "search_context_size": "low"}],
|
||||
"search_context_size_low",
|
||||
False,
|
||||
),
|
||||
([{"type": "web_search_preview"}], "search_context_size_medium", True),
|
||||
([{"type": "web_search_preview"}], "search_context_size_medium", False),
|
||||
],
|
||||
)
|
||||
async def test_openai_responses_api_web_search_cost_tracking(
|
||||
tools_config, expected_context_size, stream
|
||||
):
|
||||
"""Test web search cost tracking with different search context sizes and streaming options"""
|
||||
test_custom_logger = await _setup_web_search_test()
|
||||
|
||||
response = await litellm.aresponses(
|
||||
model="openai/gpt-4o",
|
||||
input=[
|
||||
{"role": "user", "content": "What was a positive news story from today?"}
|
||||
],
|
||||
tools=tools_config,
|
||||
stream=stream,
|
||||
)
|
||||
if stream is True:
|
||||
async for chunk in response:
|
||||
print("chunk", chunk)
|
||||
else:
|
||||
print("response", response)
|
||||
|
||||
await _verify_web_search_cost(test_custom_logger, expected_context_size)
|
|
@ -6,6 +6,7 @@ import sys
|
|||
sys.path.insert(0, os.path.abspath("../.."))
|
||||
|
||||
import asyncio
|
||||
import litellm
|
||||
import gzip
|
||||
import json
|
||||
import logging
|
||||
|
@ -48,8 +49,15 @@ def assert_gcs_pubsub_request_matches_expected(
|
|||
expected_request_body = json.load(f)
|
||||
|
||||
# Replace dynamic values in actual request body
|
||||
time_fields = ["startTime", "endTime", "completionStartTime", "request_id"]
|
||||
for field in time_fields:
|
||||
dynamic_fields = [
|
||||
"startTime",
|
||||
"endTime",
|
||||
"completionStartTime",
|
||||
"request_id",
|
||||
"id",
|
||||
"response_time",
|
||||
]
|
||||
for field in dynamic_fields:
|
||||
if field in actual_request_body:
|
||||
actual_request_body[field] = expected_request_body[field]
|
||||
|
||||
|
@ -59,6 +67,55 @@ def assert_gcs_pubsub_request_matches_expected(
|
|||
), f"Difference in request bodies: {json.dumps(actual_request_body, indent=2)} != {json.dumps(expected_request_body, indent=2)}"
|
||||
|
||||
|
||||
def assert_gcs_pubsub_request_matches_expected_standard_logging_payload(
|
||||
actual_request_body: dict,
|
||||
expected_file_name: str,
|
||||
):
|
||||
"""
|
||||
Helper function to compare actual GCS PubSub request body with expected JSON file.
|
||||
|
||||
Args:
|
||||
actual_request_body (dict): The actual request body received from the API call
|
||||
expected_file_name (str): Name of the JSON file containing expected request body
|
||||
"""
|
||||
# Get the current directory and read the expected request body
|
||||
pwd = os.path.dirname(os.path.realpath(__file__))
|
||||
expected_body_path = os.path.join(pwd, "gcs_pub_sub_body", expected_file_name)
|
||||
|
||||
with open(expected_body_path, "r") as f:
|
||||
expected_request_body = json.load(f)
|
||||
|
||||
# Replace dynamic values in actual request body
|
||||
FIELDS_TO_VALIDATE = [
|
||||
"custom_llm_provider",
|
||||
"hidden_params",
|
||||
"messages",
|
||||
"response",
|
||||
"model",
|
||||
"status",
|
||||
"stream",
|
||||
]
|
||||
|
||||
actual_request_body["response"]["id"] = expected_request_body["response"]["id"]
|
||||
actual_request_body["response"]["created"] = expected_request_body["response"][
|
||||
"created"
|
||||
]
|
||||
|
||||
for field in FIELDS_TO_VALIDATE:
|
||||
assert field in actual_request_body
|
||||
|
||||
FIELDS_EXISTENCE_CHECKS = [
|
||||
"response_cost",
|
||||
"response_time",
|
||||
"completion_tokens",
|
||||
"prompt_tokens",
|
||||
"total_tokens",
|
||||
]
|
||||
|
||||
for field in FIELDS_EXISTENCE_CHECKS:
|
||||
assert field in actual_request_body
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_gcs_pub_sub():
|
||||
# Create a mock for the async_httpx_client's post method
|
||||
|
@ -102,6 +159,61 @@ async def test_async_gcs_pub_sub():
|
|||
|
||||
decoded_message = base64.b64decode(encoded_message).decode("utf-8")
|
||||
|
||||
# Parse the JSON string into a dictionary
|
||||
actual_request = json.loads(decoded_message)
|
||||
print("##########\n")
|
||||
print(json.dumps(actual_request, indent=4))
|
||||
print("##########\n")
|
||||
# Verify the request body matches expected format
|
||||
assert_gcs_pubsub_request_matches_expected_standard_logging_payload(
|
||||
actual_request, "standard_logging_payload.json"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_gcs_pub_sub_v1():
|
||||
# Create a mock for the async_httpx_client's post method
|
||||
litellm.gcs_pub_sub_use_v1 = True
|
||||
mock_post = AsyncMock()
|
||||
mock_post.return_value.status_code = 202
|
||||
mock_post.return_value.text = "Accepted"
|
||||
|
||||
# Initialize the GcsPubSubLogger and set the mock
|
||||
gcs_pub_sub_logger = GcsPubSubLogger(flush_interval=1)
|
||||
gcs_pub_sub_logger.async_httpx_client.post = mock_post
|
||||
|
||||
mock_construct_request_headers = AsyncMock()
|
||||
mock_construct_request_headers.return_value = {"Authorization": "Bearer mock_token"}
|
||||
gcs_pub_sub_logger.construct_request_headers = mock_construct_request_headers
|
||||
litellm.callbacks = [gcs_pub_sub_logger]
|
||||
|
||||
# Make the completion call
|
||||
response = await litellm.acompletion(
|
||||
model="gpt-4o",
|
||||
messages=[{"role": "user", "content": "Hello, world!"}],
|
||||
mock_response="hi",
|
||||
)
|
||||
|
||||
await asyncio.sleep(3) # Wait for async flush
|
||||
|
||||
# Assert httpx post was called
|
||||
mock_post.assert_called_once()
|
||||
|
||||
# Get the actual request body from the mock
|
||||
actual_url = mock_post.call_args[1]["url"]
|
||||
print("sent to url", actual_url)
|
||||
assert (
|
||||
actual_url
|
||||
== "https://pubsub.googleapis.com/v1/projects/reliableKeys/topics/litellmDB:publish"
|
||||
)
|
||||
actual_request = mock_post.call_args[1]["json"]
|
||||
|
||||
# Extract and decode the base64 encoded message
|
||||
encoded_message = actual_request["messages"][0]["data"]
|
||||
import base64
|
||||
|
||||
decoded_message = base64.b64decode(encoded_message).decode("utf-8")
|
||||
|
||||
# Parse the JSON string into a dictionary
|
||||
actual_request = json.loads(decoded_message)
|
||||
print("##########\n")
|
||||
|
|
|
@ -21,16 +21,18 @@ sys.path.insert(
|
|||
import litellm
|
||||
import asyncio
|
||||
from typing import Optional
|
||||
from litellm.types.utils import StandardLoggingPayload, Usage
|
||||
from litellm.types.utils import StandardLoggingPayload, Usage, ModelInfoBase
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
|
||||
class TestCustomLogger(CustomLogger):
|
||||
def __init__(self):
|
||||
self.recorded_usage: Optional[Usage] = None
|
||||
self.standard_logging_payload: Optional[StandardLoggingPayload] = None
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
standard_logging_payload = kwargs.get("standard_logging_object")
|
||||
self.standard_logging_payload = standard_logging_payload
|
||||
print(
|
||||
"standard_logging_payload",
|
||||
json.dumps(standard_logging_payload, indent=4, default=str),
|
||||
|
|