Merge branch 'main' into fix

This commit is contained in:
மனோஜ்குமார் பழனிச்சாமி 2025-04-23 08:19:42 +05:30 committed by GitHub
commit 75021b927b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
398 changed files with 20447 additions and 5096 deletions

View file

@ -20,6 +20,8 @@ REPLICATE_API_TOKEN = ""
ANTHROPIC_API_KEY = ""
# Infisical
INFISICAL_TOKEN = ""
# INFINITY
INFINITY_API_KEY = ""
# Development Configs
LITELLM_MASTER_KEY = "sk-1234"

3
.gitignore vendored
View file

@ -73,6 +73,7 @@ tests/local_testing/log.txt
.codegpt
litellm/proxy/_new_new_secret_config.yaml
litellm/proxy/custom_guardrail.py
.mypy_cache/*
litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/model_hub.html
@ -85,3 +86,5 @@ litellm/proxy/db/migrations/0_init/migration.sql
litellm/proxy/db/migrations/*
litellm/proxy/migrations/*config.yaml
litellm/proxy/migrations/*
config.yaml
tests/litellm/litellm_core_utils/llm_cost_calc/log.txt

View file

@ -0,0 +1,83 @@
# 🖇️ AgentOps - LLM Observability Platform
:::tip
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[AgentOps](https://docs.agentops.ai) is an observability platform that enables tracing and monitoring of LLM calls, providing detailed insights into your AI operations.
## Using AgentOps with LiteLLM
LiteLLM provides `success_callbacks` and `failure_callbacks`, allowing you to easily integrate AgentOps for comprehensive tracing and monitoring of your LLM operations.
### Integration
Use just a few lines of code to instantly trace your responses **across all providers** with AgentOps:
Get your AgentOps API Keys from https://app.agentops.ai/
```python
import litellm
# Configure LiteLLM to use AgentOps
litellm.success_callback = ["agentops"]
# Make your LLM calls as usual
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello, how are you?"}],
)
```
Complete Code:
```python
import os
from litellm import completion
# Set env variables
os.environ["OPENAI_API_KEY"] = "your-openai-key"
os.environ["AGENTOPS_API_KEY"] = "your-agentops-api-key"
# Configure LiteLLM to use AgentOps
litellm.success_callback = ["agentops"]
# OpenAI call
response = completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hi 👋 - I'm OpenAI"}],
)
print(response)
```
### Configuration Options
The AgentOps integration can be configured through environment variables:
- `AGENTOPS_API_KEY` (str, optional): Your AgentOps API key
- `AGENTOPS_ENVIRONMENT` (str, optional): Deployment environment (defaults to "production")
- `AGENTOPS_SERVICE_NAME` (str, optional): Service name for tracing (defaults to "agentops")
### Advanced Usage
You can configure additional settings through environment variables:
```python
import os
# Configure AgentOps settings
os.environ["AGENTOPS_API_KEY"] = "your-agentops-api-key"
os.environ["AGENTOPS_ENVIRONMENT"] = "staging"
os.environ["AGENTOPS_SERVICE_NAME"] = "my-service"
# Enable AgentOps tracing
litellm.success_callback = ["agentops"]
```
### Support
For issues or questions, please refer to:
- [AgentOps Documentation](https://docs.agentops.ai)
- [LiteLLM Documentation](https://docs.litellm.ai)

View file

@ -4,7 +4,7 @@ Pass-through endpoints for Cohere - call provider-specific endpoint, in native f
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Cost Tracking | ✅ | Supported for `/v1/chat`, and `/v2/chat` |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | |

View file

@ -0,0 +1,217 @@
# Mistral
Pass-through endpoints for Mistral - call provider-specific endpoint, in native format (no translation).
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ❌ | Not supported |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | |
Just replace `https://api.mistral.ai/v1` with `LITELLM_PROXY_BASE_URL/mistral` 🚀
#### **Example Usage**
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "image_url",
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
}
}'
```
Supports **ALL** Mistral Endpoints (including streaming).
## Quick Start
Let's call the Mistral [`/chat/completions` endpoint](https://docs.mistral.ai/api/#tag/chat/operation/chat_completion_v1_chat_completions_post)
1. Add MISTRAL_API_KEY to your environment
```bash
export MISTRAL_API_KEY="sk-1234"
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the Mistral `/ocr` endpoint
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "image_url",
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
}
}'
```
## Examples
Anything after `http://0.0.0.0:4000/mistral` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://api.mistral.ai/v1` | `http://0.0.0.0:4000/mistral` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `bearer $MISTRAL_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: OCR endpoint**
#### LiteLLM Proxy Call
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_API_KEY' \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "image_url",
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
}
}'
```
#### Direct Mistral API Call
```bash
curl https://api.mistral.ai/v1/ocr \
-H "Content-Type: application/json" \
-H "Authorization: Bearer ${MISTRAL_API_KEY}" \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "document_url",
"document_url": "https://arxiv.org/pdf/2201.04234"
},
"include_image_base64": true
}'
```
### **Example 2: Chat API**
#### LiteLLM Proxy Call
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "mistral-large-latest",
}'
```
#### Direct Mistral API Call
```bash
curl -L -X POST 'https://api.mistral.ai/v1/chat/completions' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "mistral-large-latest",
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Mistral API key, but still letting them use Mistral endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export MISTRAL_API_BASE=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
--data '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "qwen2.5-7b-instruct",
}'
```

View file

@ -13,6 +13,15 @@ Pass-through endpoints for Vertex AI - call provider-specific endpoint, in nativ
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | |
## Supported Endpoints
LiteLLM supports 2 vertex ai passthrough routes:
1. `/vertex_ai` → routes to `https://{vertex_location}-aiplatform.googleapis.com/`
2. `/vertex_ai/discovery` → routes to [`https://discoveryengine.googleapis.com`](https://discoveryengine.googleapis.com/)
## How to use
Just replace `https://REGION-aiplatform.googleapis.com` with `LITELLM_PROXY_BASE_URL/vertex_ai`
LiteLLM supports 3 flows for calling Vertex AI endpoints via pass-through:

View file

@ -0,0 +1,185 @@
# VLLM
Pass-through endpoints for VLLM - call provider-specific endpoint, in native format (no translation).
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ❌ | Not supported |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | |
Just replace `https://my-vllm-server.com` with `LITELLM_PROXY_BASE_URL/vllm` 🚀
#### **Example Usage**
```bash
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
```
Supports **ALL** VLLM Endpoints (including streaming).
## Quick Start
Let's call the VLLM [`/metrics` endpoint](https://vllm.readthedocs.io/en/latest/api_reference/api_reference.html)
1. Add HOSTED VLLM API BASE to your environment
```bash
export HOSTED_VLLM_API_BASE="https://my-vllm-server.com"
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the VLLM `/metrics` endpoint
```bash
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
```
## Examples
Anything after `http://0.0.0.0:4000/vllm` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://my-vllm-server.com` | `http://0.0.0.0:4000/vllm` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `bearer $VLLM_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: Metrics endpoint**
#### LiteLLM Proxy Call
```bash
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
```
#### Direct VLLM API Call
```bash
curl -L -X GET 'https://my-vllm-server.com/metrics' \
-H 'Content-Type: application/json' \
```
### **Example 2: Chat API**
#### LiteLLM Proxy Call
```bash
curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "qwen2.5-7b-instruct",
}'
```
#### Direct VLLM API Call
```bash
curl -L -X POST 'https://my-vllm-server.com/chat/completions' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "qwen2.5-7b-instruct",
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Cohere API key, but still letting them use Cohere endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export HOSTED_VLLM_API_BASE=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
--data '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "qwen2.5-7b-instruct",
}'
```

View file

@ -1002,8 +1002,125 @@ Expected Response:
```
## **Azure Responses API**
| Property | Details |
|-------|-------|
| Description | Azure OpenAI Responses API |
| `custom_llm_provider` on LiteLLM | `azure/` |
| Supported Operations | `/v1/responses`|
| Azure OpenAI Responses API | [Azure OpenAI Responses API ↗](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/responses?tabs=python-secure) |
| Cost Tracking, Logging Support | ✅ LiteLLM will log, track cost for Responses API Requests |
| Supported OpenAI Params | ✅ All OpenAI params are supported, [See here](https://github.com/BerriAI/litellm/blob/0717369ae6969882d149933da48eeb8ab0e691bd/litellm/llms/openai/responses/transformation.py#L23) |
## Usage
## Create a model response
<Tabs>
<TabItem value="litellm-sdk" label="LiteLLM SDK">
#### Non-streaming
```python showLineNumbers title="Azure Responses API"
import litellm
# Non-streaming response
response = litellm.responses(
model="azure/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100,
api_key=os.getenv("AZURE_RESPONSES_OPENAI_API_KEY"),
api_base="https://litellm8397336933.openai.azure.com/",
api_version="2023-03-15-preview",
)
print(response)
```
#### Streaming
```python showLineNumbers title="Azure Responses API"
import litellm
# Streaming response
response = litellm.responses(
model="azure/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True,
api_key=os.getenv("AZURE_RESPONSES_OPENAI_API_KEY"),
api_base="https://litellm8397336933.openai.azure.com/",
api_version="2023-03-15-preview",
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="Azure Responses API"
model_list:
- model_name: o1-pro
litellm_params:
model: azure/o1-pro
api_key: os.environ/AZURE_RESPONSES_OPENAI_API_KEY
api_base: https://litellm8397336933.openai.azure.com/
api_version: 2023-03-15-preview
```
Start your LiteLLM proxy:
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
Then use the OpenAI SDK pointed to your proxy:
#### Non-streaming
```python showLineNumbers
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="o1-pro",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
</Tabs>

View file

@ -39,14 +39,164 @@ response = completion(
- temperature
- top_p
- max_tokens
- max_completion_tokens
- stream
- tools
- tool_choice
- functions
- response_format
- n
- stop
- logprobs
- frequency_penalty
- modalities
- reasoning_content
**Anthropic Params**
- thinking (used to set max budget tokens across anthropic/gemini models)
[**See Updated List**](https://github.com/BerriAI/litellm/blob/main/litellm/llms/gemini/chat/transformation.py#L70)
## Usage - Thinking / `reasoning_content`
LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362)
**Mapping**
| reasoning_effort | thinking |
| ---------------- | -------- |
| "low" | "budget_tokens": 1024 |
| "medium" | "budget_tokens": 2048 |
| "high" | "budget_tokens": 4096 |
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
resp = completion(
model="gemini/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
reasoning_effort="low",
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
- model_name: gemini-2.5-flash
litellm_params:
model: gemini/gemini-2.5-flash-preview-04-17
api_key: os.environ/GEMINI_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "gemini-2.5-flash",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"reasoning_effort": "low"
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='claude-3-7-sonnet-20250219',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
),
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
### Pass `thinking` to Gemini models
You can also pass the `thinking` parameter to Gemini models.
This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget).
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = litellm.completion(
model="gemini/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "gemini/gemini-2.5-flash-preview-04-17",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
[**See Updated List**](https://github.com/BerriAI/litellm/blob/1c747f3ad372399c5b95cc5696b06a5fbe53186b/litellm/llms/vertex_httpx.py#L122)
## Passing Gemini Specific Params
### Response schema

View file

@ -3,18 +3,17 @@ import TabItem from '@theme/TabItem';
# Infinity
| Property | Details |
|-------|-------|
| Description | Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip|
| Provider Route on LiteLLM | `infinity/` |
| Supported Operations | `/rerank` |
| Link to Provider Doc | [Infinity ↗](https://github.com/michaelfeil/infinity) |
| Property | Details |
| ------------------------- | ---------------------------------------------------------------------------------------------------------- |
| Description | Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip |
| Provider Route on LiteLLM | `infinity/` |
| Supported Operations | `/rerank`, `/embeddings` |
| Link to Provider Doc | [Infinity ↗](https://github.com/michaelfeil/infinity) |
## **Usage - LiteLLM Python SDK**
```python
from litellm import rerank
from litellm import rerank, embedding
import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
@ -39,8 +38,8 @@ model_list:
- model_name: custom-infinity-rerank
litellm_params:
model: infinity/rerank
api_key: os.environ/INFINITY_API_KEY
api_base: https://localhost:8080
api_key: os.environ/INFINITY_API_KEY
```
Start litellm
@ -51,7 +50,9 @@ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
Test request
## Test request:
### Rerank
```bash
curl http://0.0.0.0:4000/rerank \
@ -70,15 +71,14 @@ curl http://0.0.0.0:4000/rerank \
}'
```
#### Supported Cohere Rerank API Params
## Supported Cohere Rerank API Params
| Param | Type | Description |
|-------|-------|-------|
| `query` | `str` | The query to rerank the documents against |
| `documents` | `list[str]` | The documents to rerank |
| `top_n` | `int` | The number of documents to return |
| `return_documents` | `bool` | Whether to return the documents in the response |
| Param | Type | Description |
| ------------------ | ----------- | ----------------------------------------------- |
| `query` | `str` | The query to rerank the documents against |
| `documents` | `list[str]` | The documents to rerank |
| `top_n` | `int` | The number of documents to return |
| `return_documents` | `bool` | Whether to return the documents in the response |
### Usage - Return Documents
@ -138,6 +138,7 @@ response = rerank(
raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
@ -161,7 +162,7 @@ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
3. Test it!
```bash
curl http://0.0.0.0:4000/rerank \
@ -179,6 +180,121 @@ curl http://0.0.0.0:4000/rerank \
"raw_scores": True # 👈 PROVIDER-SPECIFIC PARAM
}'
```
</TabItem>
</Tabs>
## Embeddings
LiteLLM provides an OpenAI api compatible `/embeddings` endpoint for embedding calls.
**Setup**
Add this to your litellm proxy config.yaml
```yaml
model_list:
- model_name: custom-infinity-embedding
litellm_params:
model: infinity/provider/custom-embedding-v1
api_base: http://localhost:8080
api_key: os.environ/INFINITY_API_KEY
```
### Test request:
```bash
curl http://0.0.0.0:4000/embeddings \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-embedding",
"input": ["hello"]
}'
```
#### Supported Embedding API Params
| Param | Type | Description |
| ----------------- | ----------- | ----------------------------------------------------------- |
| `model` | `str` | The embedding model to use |
| `input` | `list[str]` | The text inputs to generate embeddings for |
| `encoding_format` | `str` | The format to return embeddings in (e.g. "float", "base64") |
| `modality` | `str` | The type of input (e.g. "text", "image", "audio") |
### Usage - Basic Examples
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import embedding
import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
response = embedding(
model="infinity/bge-small",
input=["good morning from litellm"]
)
print(response.data[0]['embedding'])
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/embeddings \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-embedding",
"input": ["hello"]
}'
```
</TabItem>
</Tabs>
### Usage - OpenAI Client
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from openai import OpenAI
client = OpenAI(
api_key="<LITELLM_MASTER_KEY>",
base_url="<LITELLM_URL>"
)
response = client.embeddings.create(
model="bge-small",
input=["The food was delicious and the waiter..."],
encoding_format="float"
)
print(response.data[0].embedding)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/embeddings \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "bge-small",
"input": ["The food was delicious and the waiter..."],
"encoding_format": "float"
}'
```
</TabItem>
</Tabs>

View file

@ -163,6 +163,12 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4.1 | `response = completion(model="gpt-4.1", messages=messages)` |
| gpt-4.1-mini | `response = completion(model="gpt-4.1-mini", messages=messages)` |
| gpt-4.1-nano | `response = completion(model="gpt-4.1-nano", messages=messages)` |
| o4-mini | `response = completion(model="o4-mini", messages=messages)` |
| o3-mini | `response = completion(model="o3-mini", messages=messages)` |
| o3 | `response = completion(model="o3", messages=messages)` |
| o1-mini | `response = completion(model="o1-mini", messages=messages)` |
| o1-preview | `response = completion(model="o1-preview", messages=messages)` |
| gpt-4o-mini | `response = completion(model="gpt-4o-mini", messages=messages)` |

View file

@ -542,6 +542,154 @@ print(resp)
```
### **Thinking / `reasoning_content`**
LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362)
**Mapping**
| reasoning_effort | thinking |
| ---------------- | -------- |
| "low" | "budget_tokens": 1024 |
| "medium" | "budget_tokens": 2048 |
| "high" | "budget_tokens": 4096 |
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
# !gcloud auth application-default login - run this to add vertex credentials to your env
resp = completion(
model="vertex_ai/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
reasoning_effort="low",
vertex_project="project-id",
vertex_location="us-central1"
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
- model_name: gemini-2.5-flash
litellm_params:
model: vertex_ai/gemini-2.5-flash-preview-04-17
vertex_credentials: {"project_id": "project-id", "location": "us-central1", "project_key": "project-key"}
vertex_project: "project-id"
vertex_location: "us-central1"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "gemini-2.5-flash",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"reasoning_effort": "low"
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='claude-3-7-sonnet-20250219',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
),
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
#### Pass `thinking` to Gemini models
You can also pass the `thinking` parameter to Gemini models.
This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget).
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
# !gcloud auth application-default login - run this to add vertex credentials to your env
response = litellm.completion(
model="vertex_ai/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
vertex_project="project-id",
vertex_location="us-central1"
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "vertex_ai/gemini-2.5-flash-preview-04-17",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
### **Context Caching**
Use Vertex AI context caching is supported by calling provider api directly. (Unified Endpoint support comin soon.).

View file

@ -161,6 +161,120 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)
<Tabs>
<TabItem value="files_message" label="(Unified) Files Message">
Use this to send a video url to VLLM + Gemini in the same format, using OpenAI's `files` message type.
There are two ways to send a video url to VLLM:
1. Pass the video url directly
```
{"type": "file", "file": {"file_id": video_url}},
```
2. Pass the video data as base64
```
{"type": "file", "file": {"file_data": f"data:video/mp4;base64,{video_data_base64}"}}
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Summarize the following video"
},
{
"type": "file",
"file": {
"file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
}
}
]
}
]
# call vllm
os.environ["HOSTED_VLLM_API_BASE"] = "https://hosted-vllm-api.co"
os.environ["HOSTED_VLLM_API_KEY"] = "" # [optional], if your VLLM server requires an API key
response = completion(
model="hosted_vllm/qwen", # pass the vllm model name
messages=messages,
)
# call gemini
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
response = completion(
model="gemini/gemini-1.5-flash", # pass the gemini model name
messages=messages,
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: my-model
litellm_params:
model: hosted_vllm/qwen # add hosted_vllm/ prefix to route as OpenAI provider
api_base: https://hosted-vllm-api.co # add api base for OpenAI compatible provider
- model_name: my-gemini-model
litellm_params:
model: gemini/gemini-1.5-flash # add gemini/ prefix to route as Google AI Studio provider
api_key: os.environ/GEMINI_API_KEY
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl -X POST http://0.0.0.0:4000/chat/completions \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "my-model",
"messages": [
{"role": "user", "content":
[
{"type": "text", "text": "Summarize the following video"},
{"type": "file", "file": {"file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
]
}
]
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="video_url" label="(VLLM-specific) Video Message">
Use this to send a video url to VLLM in it's native message format (`video_url`).
There are two ways to send a video url to VLLM:
1. Pass the video url directly
@ -249,6 +363,10 @@ curl -X POST http://0.0.0.0:4000/chat/completions \
</Tabs>
</TabItem>
</Tabs>
## (Deprecated) for `vllm pip package`
### Using - `litellm.completion`

View file

@ -299,6 +299,9 @@ router_settings:
|------|-------------|
| ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token for requesting ID in GitHub Actions
| ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions
| AGENTOPS_ENVIRONMENT | Environment for AgentOps logging integration
| AGENTOPS_API_KEY | API Key for AgentOps logging integration
| AGENTOPS_SERVICE_NAME | Service Name for AgentOps logging integration
| AISPEND_ACCOUNT_ID | Account ID for AI Spend
| AISPEND_API_KEY | API Key for AI Spend
| ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access
@ -323,6 +326,9 @@ router_settings:
| AZURE_AUTHORITY_HOST | Azure authority host URL
| AZURE_CLIENT_ID | Client ID for Azure services
| AZURE_CLIENT_SECRET | Client secret for Azure services
| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
| AZURE_USERNAME | Username for Azure services, use in conjunction with AZURE_PASSWORD for azure ad token with basic username/password workflow
| AZURE_PASSWORD | Password for Azure services, use in conjunction with AZURE_USERNAME for azure ad token with basic username/password workflow
| AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token
| AZURE_KEY_VAULT_URI | URI for Azure Key Vault
| AZURE_STORAGE_ACCOUNT_KEY | The Azure Storage Account Key to use for Authentication to Azure Blob Storage logging
@ -331,7 +337,6 @@ router_settings:
| AZURE_STORAGE_TENANT_ID | The Application Tenant ID to use for Authentication to Azure Blob Storage logging
| AZURE_STORAGE_CLIENT_ID | The Application Client ID to use for Authentication to Azure Blob Storage logging
| AZURE_STORAGE_CLIENT_SECRET | The Application Client Secret to use for Authentication to Azure Blob Storage logging
| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
| BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service
| BRAINTRUST_API_KEY | API key for Braintrust integration
| CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI
@ -433,6 +438,7 @@ router_settings:
| LITERAL_BATCH_SIZE | Batch size for Literal operations
| LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI
| LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests
| LITELLM_MODIFY_PARAMS | Parameters to modify in LiteLLM requests
| LITELLM_EMAIL | Email associated with LiteLLM account
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM
@ -446,6 +452,8 @@ router_settings:
| LITELLM_TOKEN | Access token for LiteLLM integration
| LITELLM_PRINT_STANDARD_LOGGING_PAYLOAD | If true, prints the standard logging payload to the console - useful for debugging
| LOGFIRE_TOKEN | Token for Logfire logging service
| MISTRAL_API_BASE | Base URL for Mistral API
| MISTRAL_API_KEY | API key for Mistral API
| MICROSOFT_CLIENT_ID | Client ID for Microsoft services
| MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services
| MICROSOFT_TENANT | Tenant ID for Microsoft Azure

View file

@ -862,7 +862,7 @@ Add the following to your env
```shell
OTEL_EXPORTER="otlp_http"
OTEL_ENDPOINT="http:/0.0.0.0:4317"
OTEL_ENDPOINT="http://0.0.0.0:4317"
OTEL_HEADERS="x-honeycomb-team=<your-api-key>" # Optional
```
@ -2501,4 +2501,4 @@ litellm_settings:
:::info
`thresholds` are not required by default, but you can tune the values to your needs.
Default values is `4` for all categories
::: -->
::: -->

View file

@ -0,0 +1,108 @@
# Model Discovery
Use this to give users an accurate list of models available behind provider endpoint, when calling `/v1/models` for wildcard models.
## Supported Models
- Fireworks AI
- OpenAI
- Gemini
- LiteLLM Proxy
- Topaz
- Anthropic
- XAI
- VLLM
- Vertex AI
### Usage
**1. Setup config.yaml**
```yaml
model_list:
- model_name: xai/*
litellm_params:
model: xai/*
api_key: os.environ/XAI_API_KEY
litellm_settings:
check_provider_endpoint: true # 👈 Enable checking provider endpoint for wildcard models
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
**3. Call `/v1/models`**
```bash
curl -X GET "http://localhost:4000/v1/models" -H "Authorization: Bearer $LITELLM_KEY"
```
Expected response
```json
{
"data": [
{
"id": "xai/grok-2-1212",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-2-vision-1212",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-fast-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-mini-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-mini-fast-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-vision-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-2-image-1212",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
}
],
"object": "list"
}
```

View file

@ -16,6 +16,8 @@ Supported Providers:
- Vertex AI (Anthropic) (`vertexai/`)
- OpenRouter (`openrouter/`)
- XAI (`xai/`)
- Google AI Studio (`google/`)
- Vertex AI (`vertex_ai/`)
LiteLLM will standardize the `reasoning_content` in the response and `thinking_blocks` in the assistant message.
@ -23,7 +25,7 @@ LiteLLM will standardize the `reasoning_content` in the response and `thinking_b
"message": {
...
"reasoning_content": "The capital of France is Paris.",
"thinking_blocks": [
"thinking_blocks": [ # only returned for Anthropic models
{
"type": "thinking",
"thinking": "The capital of France is Paris.",

View file

@ -14,22 +14,22 @@ LiteLLM provides a BETA endpoint in the spec of [OpenAI's `/responses` API](http
| Fallbacks | ✅ | Works between supported models |
| Loadbalancing | ✅ | Works between supported models |
| Supported LiteLLM Versions | 1.63.8+ | |
| Supported LLM providers | `openai` | |
| Supported LLM providers | **All LiteLLM supported providers** | `openai`, `anthropic`, `bedrock`, `vertex_ai`, `gemini`, `azure`, `azure_ai` etc. |
## Usage
## Create a model response
### LiteLLM Python SDK
<Tabs>
<TabItem value="litellm-sdk" label="LiteLLM SDK">
<TabItem value="openai" label="OpenAI">
#### Non-streaming
```python
```python showLineNumbers title="OpenAI Non-streaming Response"
import litellm
# Non-streaming response
response = litellm.responses(
model="o1-pro",
model="openai/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
@ -38,12 +38,12 @@ print(response)
```
#### Streaming
```python
```python showLineNumbers title="OpenAI Streaming Response"
import litellm
# Streaming response
response = litellm.responses(
model="o1-pro",
model="openai/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
@ -53,58 +53,169 @@ for event in response:
```
</TabItem>
<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
First, add this to your litellm proxy config.yaml:
```yaml
model_list:
- model_name: o1-pro
litellm_params:
model: openai/o1-pro
api_key: os.environ/OPENAI_API_KEY
```
Start your LiteLLM proxy:
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
Then use the OpenAI SDK pointed to your proxy:
<TabItem value="anthropic" label="Anthropic">
#### Non-streaming
```python
from openai import OpenAI
```python showLineNumbers title="Anthropic Non-streaming Response"
import litellm
import os
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Set API key
os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key"
# Non-streaming response
response = client.responses.create(
model="o1-pro",
input="Tell me a three sentence bedtime story about a unicorn."
response = litellm.responses(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python
from openai import OpenAI
```python showLineNumbers title="Anthropic Streaming Response"
import litellm
import os
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Set API key
os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key"
# Streaming response
response = client.responses.create(
model="o1-pro",
response = litellm.responses(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="vertex" label="Vertex AI">
#### Non-streaming
```python showLineNumbers title="Vertex AI Non-streaming Response"
import litellm
import os
# Set credentials - Vertex AI uses application default credentials
# Run 'gcloud auth application-default login' to authenticate
os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
# Non-streaming response
response = litellm.responses(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python showLineNumbers title="Vertex AI Streaming Response"
import litellm
import os
# Set credentials - Vertex AI uses application default credentials
# Run 'gcloud auth application-default login' to authenticate
os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
# Streaming response
response = litellm.responses(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="bedrock" label="AWS Bedrock">
#### Non-streaming
```python showLineNumbers title="AWS Bedrock Non-streaming Response"
import litellm
import os
# Set AWS credentials
os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id"
os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key"
os.environ["AWS_REGION_NAME"] = "us-west-2" # or your AWS region
# Non-streaming response
response = litellm.responses(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python showLineNumbers title="AWS Bedrock Streaming Response"
import litellm
import os
# Set AWS credentials
os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id"
os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key"
os.environ["AWS_REGION_NAME"] = "us-west-2" # or your AWS region
# Streaming response
response = litellm.responses(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="gemini" label="Google AI Studio">
#### Non-streaming
```python showLineNumbers title="Google AI Studio Non-streaming Response"
import litellm
import os
# Set API key for Google AI Studio
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
# Non-streaming response
response = litellm.responses(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python showLineNumbers title="Google AI Studio Streaming Response"
import litellm
import os
# Set API key for Google AI Studio
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
# Streaming response
response = litellm.responses(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
@ -115,3 +226,408 @@ for event in response:
</TabItem>
</Tabs>
### LiteLLM Proxy with OpenAI SDK
First, set up and start your LiteLLM proxy server.
```bash title="Start LiteLLM Proxy Server"
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
<Tabs>
<TabItem value="openai" label="OpenAI">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="OpenAI Proxy Configuration"
model_list:
- model_name: openai/o1-pro
litellm_params:
model: openai/o1-pro
api_key: os.environ/OPENAI_API_KEY
```
#### Non-streaming
```python showLineNumbers title="OpenAI Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="openai/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="OpenAI Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="openai/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="anthropic" label="Anthropic">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="Anthropic Proxy Configuration"
model_list:
- model_name: anthropic/claude-3-5-sonnet-20240620
litellm_params:
model: anthropic/claude-3-5-sonnet-20240620
api_key: os.environ/ANTHROPIC_API_KEY
```
#### Non-streaming
```python showLineNumbers title="Anthropic Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="Anthropic Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="vertex" label="Vertex AI">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="Vertex AI Proxy Configuration"
model_list:
- model_name: vertex_ai/gemini-1.5-pro
litellm_params:
model: vertex_ai/gemini-1.5-pro
vertex_project: your-gcp-project-id
vertex_location: us-central1
```
#### Non-streaming
```python showLineNumbers title="Vertex AI Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="Vertex AI Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="bedrock" label="AWS Bedrock">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="AWS Bedrock Proxy Configuration"
model_list:
- model_name: bedrock/anthropic.claude-3-sonnet-20240229-v1:0
litellm_params:
model: bedrock/anthropic.claude-3-sonnet-20240229-v1:0
aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
aws_region_name: us-west-2
```
#### Non-streaming
```python showLineNumbers title="AWS Bedrock Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="AWS Bedrock Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="gemini" label="Google AI Studio">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="Google AI Studio Proxy Configuration"
model_list:
- model_name: gemini/gemini-1.5-flash
litellm_params:
model: gemini/gemini-1.5-flash
api_key: os.environ/GEMINI_API_KEY
```
#### Non-streaming
```python showLineNumbers title="Google AI Studio Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="Google AI Studio Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
</Tabs>
## Supported Responses API Parameters
| Provider | Supported Parameters |
|----------|---------------------|
| `openai` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) |
| `azure` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) |
| `anthropic` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `bedrock` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `gemini` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `vertex_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `azure_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| All other llm api providers | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
## Load Balancing with Routing Affinity
When using the Responses API with multiple deployments of the same model (e.g., multiple Azure OpenAI endpoints), LiteLLM provides routing affinity for conversations. This ensures that follow-up requests using a `previous_response_id` are routed to the same deployment that generated the original response.
#### Example Usage
<Tabs>
<TabItem value="python-sdk" label="Python SDK">
```python showLineNumbers title="Python SDK with Routing Affinity"
import litellm
# Set up router with multiple deployments of the same model
router = litellm.Router(
model_list=[
{
"model_name": "azure-gpt4-turbo",
"litellm_params": {
"model": "azure/gpt-4-turbo",
"api_key": "your-api-key-1",
"api_version": "2024-06-01",
"api_base": "https://endpoint1.openai.azure.com",
},
},
{
"model_name": "azure-gpt4-turbo",
"litellm_params": {
"model": "azure/gpt-4-turbo",
"api_key": "your-api-key-2",
"api_version": "2024-06-01",
"api_base": "https://endpoint2.openai.azure.com",
},
},
],
optional_pre_call_checks=["responses_api_deployment_check"],
)
# Initial request
response = await router.aresponses(
model="azure-gpt4-turbo",
input="Hello, who are you?",
truncation="auto",
)
# Store the response ID
response_id = response.id
# Follow-up request - will be automatically routed to the same deployment
follow_up = await router.aresponses(
model="azure-gpt4-turbo",
input="Tell me more about yourself",
truncation="auto",
previous_response_id=response_id # This ensures routing to the same deployment
)
```
</TabItem>
<TabItem value="proxy-server" label="Proxy Server">
#### 1. Setup routing affinity on proxy config.yaml
To enable routing affinity for Responses API in your LiteLLM proxy, set `optional_pre_call_checks: ["responses_api_deployment_check"]` in your proxy config.yaml.
```yaml showLineNumbers title="config.yaml with Responses API Routing Affinity"
model_list:
- model_name: azure-gpt4-turbo
litellm_params:
model: azure/gpt-4-turbo
api_key: your-api-key-1
api_version: 2024-06-01
api_base: https://endpoint1.openai.azure.com
- model_name: azure-gpt4-turbo
litellm_params:
model: azure/gpt-4-turbo
api_key: your-api-key-2
api_version: 2024-06-01
api_base: https://endpoint2.openai.azure.com
router_settings:
optional_pre_call_checks: ["responses_api_deployment_check"]
```
#### 2. Use the OpenAI Python SDK to make requests to LiteLLM Proxy
```python showLineNumbers title="OpenAI Client with Proxy Server"
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:4000",
api_key="your-api-key"
)
# Initial request
response = client.responses.create(
model="azure-gpt4-turbo",
input="Hello, who are you?"
)
response_id = response.id
# Follow-up request - will be automatically routed to the same deployment
follow_up = client.responses.create(
model="azure-gpt4-turbo",
input="Tell me more about yourself",
previous_response_id=response_id # This ensures routing to the same deployment
)
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,146 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Using LiteLLM with OpenAI Codex
This guide walks you through connecting OpenAI Codex to LiteLLM. Using LiteLLM with Codex allows teams to:
- Access 100+ LLMs through the Codex interface
- Use powerful models like Gemini through a familiar interface
- Track spend and usage with LiteLLM's built-in analytics
- Control model access with virtual keys
<Image img={require('../../img/litellm_codex.gif')} />
## Quickstart
:::info
Requires LiteLLM v1.66.3.dev5 and higher
:::
Make sure to set up LiteLLM with the [LiteLLM Getting Started Guide](../proxy/docker_quick_start.md).
## 1. Install OpenAI Codex
Install the OpenAI Codex CLI tool globally using npm:
<Tabs>
<TabItem value="npm" label="npm">
```bash showLineNumbers
npm i -g @openai/codex
```
</TabItem>
<TabItem value="yarn" label="yarn">
```bash showLineNumbers
yarn global add @openai/codex
```
</TabItem>
</Tabs>
## 2. Start LiteLLM Proxy
<Tabs>
<TabItem value="docker" label="Docker">
```bash showLineNumbers
docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
-p 4000:4000 \
ghcr.io/berriai/litellm:main-latest \
--config /app/config.yaml
```
</TabItem>
<TabItem value="pip" label="LiteLLM CLI">
```bash showLineNumbers
litellm --config /path/to/config.yaml
```
</TabItem>
</Tabs>
LiteLLM should now be running on [http://localhost:4000](http://localhost:4000)
## 3. Configure LiteLLM for Model Routing
Ensure your LiteLLM Proxy is properly configured to route to your desired models. Create a `litellm_config.yaml` file with the following content:
```yaml showLineNumbers
model_list:
- model_name: o3-mini
litellm_params:
model: openai/o3-mini
api_key: os.environ/OPENAI_API_KEY
- model_name: claude-3-7-sonnet-latest
litellm_params:
model: anthropic/claude-3-7-sonnet-latest
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: gemini-2.0-flash
litellm_params:
model: gemini/gemini-2.0-flash
api_key: os.environ/GEMINI_API_KEY
litellm_settings:
drop_params: true
```
This configuration enables routing to specific OpenAI, Anthropic, and Gemini models with explicit names.
## 4. Configure Codex to Use LiteLLM Proxy
Set the required environment variables to point Codex to your LiteLLM Proxy:
```bash
# Point to your LiteLLM Proxy server
export OPENAI_BASE_URL=http://0.0.0.0:4000
# Use your LiteLLM API key (if you've set up authentication)
export OPENAI_API_KEY="sk-1234"
```
## 5. Run Codex with Gemini
With everything configured, you can now run Codex with Gemini:
```bash showLineNumbers
codex --model gemini-2.0-flash --full-auto
```
<Image img={require('../../img/litellm_codex.gif')} />
The `--full-auto` flag allows Codex to automatically generate code without additional prompting.
## 6. Advanced Options
### Using Different Models
You can use any model configured in your LiteLLM proxy:
```bash
# Use Claude models
codex --model claude-3-7-sonnet-latest
# Use Google AI Studio Gemini models
codex --model gemini/gemini-2.0-flash
```
## Troubleshooting
- If you encounter connection issues, ensure your LiteLLM Proxy is running and accessible at the specified URL
- Verify your LiteLLM API key is valid if you're using authentication
- Check that your model routing configuration is correct
- For model-specific errors, ensure the model is properly configured in your LiteLLM setup
## Additional Resources
- [LiteLLM Docker Quick Start Guide](../proxy/docker_quick_start.md)
- [OpenAI Codex GitHub Repository](https://github.com/openai/codex)
- [LiteLLM Virtual Keys and Authentication](../proxy/virtual_keys.md)

View file

@ -0,0 +1,128 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Auto-Inject Prompt Caching Checkpoints
Reduce costs by up to 90% by using LiteLLM to auto-inject prompt caching checkpoints.
<Image img={require('../../img/auto_prompt_caching.png')} style={{ width: '800px', height: 'auto' }} />
## How it works
LiteLLM can automatically inject prompt caching checkpoints into your requests to LLM providers. This allows:
- **Cost Reduction**: Long, static parts of your prompts can be cached to avoid repeated processing
- **No need to modify your application code**: You can configure the auto-caching behavior in the LiteLLM UI or in the `litellm config.yaml` file.
## Configuration
You need to specify `cache_control_injection_points` in your model configuration. This tells LiteLLM:
1. Where to add the caching directive (`location`)
2. Which message to target (`role`)
LiteLLM will then automatically add a `cache_control` directive to the specified messages in your requests:
```json
"cache_control": {
"type": "ephemeral"
}
```
## Usage Example
In this example, we'll configure caching for system messages by adding the directive to all messages with `role: system`.
<Tabs>
<TabItem value="litellm config.yaml" label="litellm config.yaml">
```yaml showLineNumbers title="litellm config.yaml"
model_list:
- model_name: anthropic-auto-inject-cache-system-message
litellm_params:
model: anthropic/claude-3-5-sonnet-20240620
api_key: os.environ/ANTHROPIC_API_KEY
cache_control_injection_points:
- location: message
role: system
```
</TabItem>
<TabItem value="UI" label="LiteLLM UI">
On the LiteLLM UI, you can specify the `cache_control_injection_points` in the `Advanced Settings` tab when adding a model.
<Image img={require('../../img/ui_auto_prompt_caching.png')}/>
</TabItem>
</Tabs>
## Detailed Example
### 1. Original Request to LiteLLM
In this example, we have a very long, static system message and a varying user message. It's efficient to cache the system message since it rarely changes.
```json
{
"messages": [
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are a helpful assistant. This is a set of very long instructions that you will follow. Here is a legal document that you will use to answer the user's question."
}
]
},
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is the main topic of this legal document?"
}
]
}
]
}
```
### 2. LiteLLM's Modified Request
LiteLLM auto-injects the caching directive into the system message based on our configuration:
```json
{
"messages": [
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are a helpful assistant. This is a set of very long instructions that you will follow. Here is a legal document that you will use to answer the user's question.",
"cache_control": {"type": "ephemeral"}
}
]
},
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is the main topic of this legal document?"
}
]
}
]
}
```
When the model provider processes this request, it will recognize the caching directive and only process the system message once, caching it for subsequent requests.

View file

@ -0,0 +1,74 @@
import Image from '@theme/IdealImage';
# SCIM with LiteLLM
Enables identity providers (Okta, Azure AD, OneLogin, etc.) to automate user and team (group) provisioning, updates, and deprovisioning on LiteLLM.
This tutorial will walk you through the steps to connect your IDP to LiteLLM SCIM Endpoints.
### Supported SSO Providers for SCIM
Below is a list of supported SSO providers for connecting to LiteLLM SCIM Endpoints.
- Microsoft Entra ID (Azure AD)
- Okta
- Google Workspace
- OneLogin
- Keycloak
- Auth0
## 1. Get your SCIM Tenant URL and Bearer Token
On LiteLLM, navigate to the Settings > Admin Settings > SCIM. On this page you will create a SCIM Token, this allows your IDP to authenticate to litellm `/scim` endpoints.
<Image img={require('../../img/scim_2.png')} style={{ width: '800px', height: 'auto' }} />
## 2. Connect your IDP to LiteLLM SCIM Endpoints
On your IDP provider, navigate to your SSO application and select `Provisioning` > `New provisioning configuration`.
On this page, paste in your litellm scim tenant url and bearer token.
Once this is pasted in, click on `Test Connection` to ensure your IDP can authenticate to the LiteLLM SCIM endpoints.
<Image img={require('../../img/scim_4.png')} style={{ width: '800px', height: 'auto' }} />
## 3. Test SCIM Connection
### 3.1 Assign the group to your LiteLLM Enterprise App
On your IDP Portal, navigate to `Enterprise Applications` > Select your litellm app
<Image img={require('../../img/msft_enterprise_app.png')} style={{ width: '800px', height: 'auto' }} />
<br />
<br />
Once you've selected your litellm app, click on `Users and Groups` > `Add user/group`
<Image img={require('../../img/msft_enterprise_assign_group.png')} style={{ width: '800px', height: 'auto' }} />
<br />
Now select the group you created in step 1.1. And add it to the LiteLLM Enterprise App. At this point we have added `Production LLM Evals Group` to the LiteLLM Enterprise App. The next step is having LiteLLM automatically create the `Production LLM Evals Group` on the LiteLLM DB when a new user signs in.
<Image img={require('../../img/msft_enterprise_select_group.png')} style={{ width: '800px', height: 'auto' }} />
### 3.2 Sign in to LiteLLM UI via SSO
Sign into the LiteLLM UI via SSO. You should be redirected to the Entra ID SSO page. This SSO sign in flow will trigger LiteLLM to fetch the latest Groups and Members from Azure Entra ID.
<Image img={require('../../img/msft_sso_sign_in.png')} style={{ width: '800px', height: 'auto' }} />
### 3.3 Check the new team on LiteLLM UI
On the LiteLLM UI, Navigate to `Teams`, You should see the new team `Production LLM Evals Group` auto-created on LiteLLM.
<Image img={require('../../img/msft_auto_team.png')} style={{ width: '900px', height: 'auto' }} />

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 207 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 268 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 999 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 244 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 380 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 231 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 261 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 413 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 274 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 103 KiB

View file

@ -12455,9 +12455,10 @@
}
},
"node_modules/http-proxy-middleware": {
"version": "2.0.7",
"resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.7.tgz",
"integrity": "sha512-fgVY8AV7qU7z/MmXJ/rxwbrtQH4jBQ9m7kp3llF0liB7glmFeVZFBepQb32T3y8n8k2+AEYuMPCpinYW+/CuRA==",
"version": "2.0.9",
"resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.9.tgz",
"integrity": "sha512-c1IyJYLYppU574+YI7R4QyX2ystMtVXZwIdzazUIPIJsHuWNd+mho2j+bKoHftndicGj9yh+xjd+l0yj7VeT1Q==",
"license": "MIT",
"dependencies": {
"@types/http-proxy": "^1.17.8",
"http-proxy": "^1.18.1",

View file

@ -38,7 +38,7 @@ hide_table_of_contents: false
2. OpenAI Moderations - `omni-moderation-latest` support. [Start Here](https://docs.litellm.ai/docs/moderation)
3. Azure O1 - fake streaming support. This ensures if a `stream=true` is passed, the response is streamed. [Start Here](https://docs.litellm.ai/docs/providers/azure)
4. Anthropic - non-whitespace char stop sequence handling - [PR](https://github.com/BerriAI/litellm/pull/7484)
5. Azure OpenAI - support Entra id username + password based auth. [Start Here](https://docs.litellm.ai/docs/providers/azure#entrata-id---use-tenant_id-client_id-client_secret)
5. Azure OpenAI - support Entra ID username + password based auth. [Start Here](https://docs.litellm.ai/docs/providers/azure#entra-id---use-tenant_id-client_id-client_secret)
6. LM Studio - embedding route support. [Start Here](https://docs.litellm.ai/docs/providers/lm-studio)
7. WatsonX - ZenAPIKeyAuth support. [Start Here](https://docs.litellm.ai/docs/providers/watsonx)

View file

@ -0,0 +1,153 @@
---
title: v1.67.0-stable - SCIM Integration
slug: v1.67.0-stable
date: 2025-04-19T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: ["sso", "unified_file_id", "cost_tracking", "security"]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
## Key Highlights
- **SCIM Integration**: Enables identity providers (Okta, Azure AD, OneLogin, etc.) to automate user and team (group) provisioning, updates, and deprovisioning
- **Team and Tag based usage tracking**: You can now see usage and spend by team and tag at 1M+ spend logs.
- **Unified Responses API**: Support for calling Anthropic, Gemini, Groq, etc. via OpenAI's new Responses API.
Let's dive in.
## SCIM Integration
<Image img={require('../../img/scim_integration.png')}/>
This release adds SCIM support to LiteLLM. This allows your SSO provider (Okta, Azure AD, etc) to automatically create/delete users, teams, and memberships on LiteLLM. This means that when you remove a team on your SSO provider, your SSO provider will automatically delete the corresponding team on LiteLLM.
[Read more](../../docs/tutorials/scim_litellm)
## Team and Tag based usage tracking
<Image img={require('../../img/release_notes/new_team_usage_highlight.jpg')}/>
This release improves team and tag based usage tracking at 1m+ spend logs, making it easy to monitor your LLM API Spend in production. This covers:
- View **daily spend** by teams + tags
- View **usage / spend by key**, within teams
- View **spend by multiple tags**
- Allow **internal users** to view spend of teams they're a member of
[Read more](#management-endpoints--ui)
## Unified Responses API
This release allows you to call Azure OpenAI, Anthropic, AWS Bedrock, and Google Vertex AI models via the POST /v1/responses endpoint on LiteLLM. This means you can now use popular tools like [OpenAI Codex](https://docs.litellm.ai/docs/tutorials/openai_codex) with your own models.
<Image img={require('../../img/release_notes/unified_responses_api_rn.png')}/>
[Read more](https://docs.litellm.ai/docs/response_api)
## New Models / Updated Models
- **OpenAI**
1. gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o3, o3-mini, o4-mini pricing - [Get Started](../../docs/providers/openai#usage), [PR](https://github.com/BerriAI/litellm/pull/9990)
2. o4 - correctly map o4 to openai o_series model
- **Azure AI**
1. Phi-4 output cost per token fix - [PR](https://github.com/BerriAI/litellm/pull/9880)
2. Responses API support [Get Started](../../docs/providers/azure#azure-responses-api),[PR](https://github.com/BerriAI/litellm/pull/10116)
- **Anthropic**
1. redacted message thinking support - [Get Started](../../docs/providers/anthropic#usage---thinking--reasoning_content),[PR](https://github.com/BerriAI/litellm/pull/10129)
- **Cohere**
1. `/v2/chat` Passthrough endpoint support w/ cost tracking - [Get Started](../../docs/pass_through/cohere), [PR](https://github.com/BerriAI/litellm/pull/9997)
- **Azure**
1. Support azure tenant_id/client_id env vars - [Get Started](../../docs/providers/azure#entra-id---use-tenant_id-client_id-client_secret), [PR](https://github.com/BerriAI/litellm/pull/9993)
2. Fix response_format check for 2025+ api versions - [PR](https://github.com/BerriAI/litellm/pull/9993)
3. Add gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o3, o3-mini, o4-mini pricing
- **VLLM**
1. Files - Support 'file' message type for VLLM video url's - [Get Started](../../docs/providers/vllm#send-video-url-to-vllm), [PR](https://github.com/BerriAI/litellm/pull/10129)
2. Passthrough - new `/vllm/` passthrough endpoint support [Get Started](../../docs/pass_through/vllm), [PR](https://github.com/BerriAI/litellm/pull/10002)
- **Mistral**
1. new `/mistral` passthrough endpoint support [Get Started](../../docs/pass_through/mistral), [PR](https://github.com/BerriAI/litellm/pull/10002)
- **AWS**
1. New mapped bedrock regions - [PR](https://github.com/BerriAI/litellm/pull/9430)
- **VertexAI / Google AI Studio**
1. Gemini - Response format - Retain schema field ordering for google gemini and vertex by specifying propertyOrdering - [Get Started](../../docs/providers/vertex#json-schema), [PR](https://github.com/BerriAI/litellm/pull/9828)
2. Gemini-2.5-flash - return reasoning content [Google AI Studio](../../docs/providers/gemini#usage---thinking--reasoning_content), [Vertex AI](../../docs/providers/vertex#thinking--reasoning_content)
3. Gemini-2.5-flash - pricing + model information [PR](https://github.com/BerriAI/litellm/pull/10125)
4. Passthrough - new `/vertex_ai/discovery` route - enables calling AgentBuilder API routes [Get Started](../../docs/pass_through/vertex_ai#supported-api-endpoints), [PR](https://github.com/BerriAI/litellm/pull/10084)
- **Fireworks AI**
1. return tool calling responses in `tool_calls` field (fireworks incorrectly returns this as a json str in content) [PR](https://github.com/BerriAI/litellm/pull/10130)
- **Triton**
1. Remove fixed remove bad_words / stop words from `/generate` call - [Get Started](../../docs/providers/triton-inference-server#triton-generate---chat-completion), [PR](https://github.com/BerriAI/litellm/pull/10163)
- **Other**
1. Support for all litellm providers on Responses API (works with Codex) - [Get Started](../../docs/tutorials/openai_codex), [PR](https://github.com/BerriAI/litellm/pull/10132)
2. Fix combining multiple tool calls in streaming response - [Get Started](../../docs/completion/stream#helper-function), [PR](https://github.com/BerriAI/litellm/pull/10040)
## Spend Tracking Improvements
- **Cost Control** - inject cache control points in prompt for cost reduction [Get Started](../../docs/tutorials/prompt_caching), [PR](https://github.com/BerriAI/litellm/pull/10000)
- **Spend Tags** - spend tags in headers - support x-litellm-tags even if tag based routing not enabled [Get Started](../../docs/proxy/request_headers#litellm-headers), [PR](https://github.com/BerriAI/litellm/pull/10000)
- **Gemini-2.5-flash** - support cost calculation for reasoning tokens [PR](https://github.com/BerriAI/litellm/pull/10141)
## Management Endpoints / UI
- **Users**
1. Show created_at and updated_at on users page - [PR](https://github.com/BerriAI/litellm/pull/10033)
- **Virtual Keys**
1. Filter by key alias - https://github.com/BerriAI/litellm/pull/10085
- **Usage Tab**
1. Team based usage
- New `LiteLLM_DailyTeamSpend` Table for aggregate team based usage logging - [PR](https://github.com/BerriAI/litellm/pull/10039)
- New Team based usage dashboard + new `/team/daily/activity` API - [PR](https://github.com/BerriAI/litellm/pull/10081)
- Return team alias on /team/daily/activity API - [PR](https://github.com/BerriAI/litellm/pull/10157)
- allow internal user view spend for teams they belong to - [PR](https://github.com/BerriAI/litellm/pull/10157)
- allow viewing top keys by team - [PR](https://github.com/BerriAI/litellm/pull/10157)
<Image img={require('../../img/release_notes/new_team_usage.png')}/>
2. Tag Based Usage
- New `LiteLLM_DailyTagSpend` Table for aggregate tag based usage logging - [PR](https://github.com/BerriAI/litellm/pull/10071)
- Restrict to only Proxy Admins - [PR](https://github.com/BerriAI/litellm/pull/10157)
- allow viewing top keys by tag
- Return tags passed in request (i.e. dynamic tags) on `/tag/list` API - [PR](https://github.com/BerriAI/litellm/pull/10157)
<Image img={require('../../img/release_notes/new_tag_usage.png')}/>
3. Track prompt caching metrics in daily user, team, tag tables - [PR](https://github.com/BerriAI/litellm/pull/10029)
4. Show usage by key (on all up, team, and tag usage dashboards) - [PR](https://github.com/BerriAI/litellm/pull/10157)
5. swap old usage with new usage tab
- **Models**
1. Make columns resizable/hideable - [PR](https://github.com/BerriAI/litellm/pull/10119)
- **API Playground**
1. Allow internal user to call api playground - [PR](https://github.com/BerriAI/litellm/pull/10157)
- **SCIM**
1. Add LiteLLM SCIM Integration for Team and User management - [Get Started](../../docs/tutorials/scim_litellm), [PR](https://github.com/BerriAI/litellm/pull/10072)
## Logging / Guardrail Integrations
- **GCS**
1. Fix gcs pub sub logging with env var GCS_PROJECT_ID - [Get Started](../../docs/observability/gcs_bucket_integration#usage), [PR](https://github.com/BerriAI/litellm/pull/10042)
- **AIM**
1. Add litellm call id passing to Aim guardrails on pre and post-hooks calls - [Get Started](../../docs/proxy/guardrails/aim_security), [PR](https://github.com/BerriAI/litellm/pull/10021)
- **Azure blob storage**
1. Ensure logging works in high throughput scenarios - [Get Started](../../docs/proxy/logging#azure-blob-storage), [PR](https://github.com/BerriAI/litellm/pull/9962)
## General Proxy Improvements
- **Support setting `litellm.modify_params` via env var** [PR](https://github.com/BerriAI/litellm/pull/9964)
- **Model Discovery** - Check providers `/models` endpoints when calling proxys `/v1/models` endpoint - [Get Started](../../docs/proxy/model_discovery), [PR](https://github.com/BerriAI/litellm/pull/9958)
- **`/utils/token_counter`** - fix retrieving custom tokenizer for db models - [Get Started](../../docs/proxy/configs#set-custom-tokenizer), [PR](https://github.com/BerriAI/litellm/pull/10047)
- **Prisma migrate** - handle existing columns in db table - [PR](https://github.com/BerriAI/litellm/pull/10138)

View file

@ -69,6 +69,7 @@ const sidebars = {
"proxy/clientside_auth",
"proxy/request_headers",
"proxy/response_headers",
"proxy/model_discovery",
],
},
{
@ -101,6 +102,7 @@ const sidebars = {
"proxy/admin_ui_sso",
"proxy/self_serve",
"proxy/public_teams",
"tutorials/scim_litellm",
"proxy/custom_sso",
"proxy/ui_credentials",
"proxy/ui_logs"
@ -188,7 +190,7 @@ const sidebars = {
"providers/azure_ai",
"providers/aiml",
"providers/vertex",
{
type: "category",
label: "Google AI Studio",
@ -330,6 +332,8 @@ const sidebars = {
"pass_through/vertex_ai",
"pass_through/google_ai_studio",
"pass_through/cohere",
"pass_through/vllm",
"pass_through/mistral",
"pass_through/openai_passthrough",
"pass_through/anthropic_completion",
"pass_through/bedrock",
@ -340,7 +344,7 @@ const sidebars = {
},
"rerank",
"assistants",
{
type: "category",
label: "/files",
@ -407,9 +411,10 @@ const sidebars = {
type: "category",
label: "Logging & Observability",
items: [
"observability/agentops_integration",
"observability/langfuse_integration",
"observability/lunary_integration",
"observability/mlflow",
"observability/langfuse_integration",
"observability/gcs_bucket_integration",
"observability/langsmith_integration",
"observability/literalai_integration",
@ -443,7 +448,9 @@ const sidebars = {
label: "Tutorials",
items: [
"tutorials/openweb_ui",
"tutorials/openai_codex",
"tutorials/msft_sso",
"tutorials/prompt_caching",
"tutorials/tag_management",
'tutorials/litellm_proxy_aporia',
{

Binary file not shown.

View file

@ -0,0 +1,4 @@
-- AlterTable
ALTER TABLE "LiteLLM_DailyUserSpend" ADD COLUMN "cache_creation_input_tokens" INTEGER NOT NULL DEFAULT 0,
ADD COLUMN "cache_read_input_tokens" INTEGER NOT NULL DEFAULT 0;

View file

@ -0,0 +1,36 @@
-- CreateTable
CREATE TABLE "LiteLLM_DailyTeamSpend" (
"id" TEXT NOT NULL,
"team_id" TEXT NOT NULL,
"date" TEXT NOT NULL,
"api_key" TEXT NOT NULL,
"model" TEXT NOT NULL,
"model_group" TEXT,
"custom_llm_provider" TEXT,
"prompt_tokens" INTEGER NOT NULL DEFAULT 0,
"completion_tokens" INTEGER NOT NULL DEFAULT 0,
"spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
"api_requests" INTEGER NOT NULL DEFAULT 0,
"successful_requests" INTEGER NOT NULL DEFAULT 0,
"failed_requests" INTEGER NOT NULL DEFAULT 0,
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"updated_at" TIMESTAMP(3) NOT NULL,
CONSTRAINT "LiteLLM_DailyTeamSpend_pkey" PRIMARY KEY ("id")
);
-- CreateIndex
CREATE INDEX "LiteLLM_DailyTeamSpend_date_idx" ON "LiteLLM_DailyTeamSpend"("date");
-- CreateIndex
CREATE INDEX "LiteLLM_DailyTeamSpend_team_id_idx" ON "LiteLLM_DailyTeamSpend"("team_id");
-- CreateIndex
CREATE INDEX "LiteLLM_DailyTeamSpend_api_key_idx" ON "LiteLLM_DailyTeamSpend"("api_key");
-- CreateIndex
CREATE INDEX "LiteLLM_DailyTeamSpend_model_idx" ON "LiteLLM_DailyTeamSpend"("model");
-- CreateIndex
CREATE UNIQUE INDEX "LiteLLM_DailyTeamSpend_team_id_date_api_key_model_custom_ll_key" ON "LiteLLM_DailyTeamSpend"("team_id", "date", "api_key", "model", "custom_llm_provider");

View file

@ -0,0 +1,45 @@
-- AlterTable
ALTER TABLE "LiteLLM_DailyTeamSpend" ADD COLUMN "cache_creation_input_tokens" INTEGER NOT NULL DEFAULT 0,
ADD COLUMN "cache_read_input_tokens" INTEGER NOT NULL DEFAULT 0;
-- CreateTable
CREATE TABLE "LiteLLM_DailyTagSpend" (
"id" TEXT NOT NULL,
"tag" TEXT NOT NULL,
"date" TEXT NOT NULL,
"api_key" TEXT NOT NULL,
"model" TEXT NOT NULL,
"model_group" TEXT,
"custom_llm_provider" TEXT,
"prompt_tokens" INTEGER NOT NULL DEFAULT 0,
"completion_tokens" INTEGER NOT NULL DEFAULT 0,
"cache_read_input_tokens" INTEGER NOT NULL DEFAULT 0,
"cache_creation_input_tokens" INTEGER NOT NULL DEFAULT 0,
"spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
"api_requests" INTEGER NOT NULL DEFAULT 0,
"successful_requests" INTEGER NOT NULL DEFAULT 0,
"failed_requests" INTEGER NOT NULL DEFAULT 0,
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"updated_at" TIMESTAMP(3) NOT NULL,
CONSTRAINT "LiteLLM_DailyTagSpend_pkey" PRIMARY KEY ("id")
);
-- CreateIndex
CREATE UNIQUE INDEX "LiteLLM_DailyTagSpend_tag_key" ON "LiteLLM_DailyTagSpend"("tag");
-- CreateIndex
CREATE INDEX "LiteLLM_DailyTagSpend_date_idx" ON "LiteLLM_DailyTagSpend"("date");
-- CreateIndex
CREATE INDEX "LiteLLM_DailyTagSpend_tag_idx" ON "LiteLLM_DailyTagSpend"("tag");
-- CreateIndex
CREATE INDEX "LiteLLM_DailyTagSpend_api_key_idx" ON "LiteLLM_DailyTagSpend"("api_key");
-- CreateIndex
CREATE INDEX "LiteLLM_DailyTagSpend_model_idx" ON "LiteLLM_DailyTagSpend"("model");
-- CreateIndex
CREATE UNIQUE INDEX "LiteLLM_DailyTagSpend_tag_date_api_key_model_custom_llm_pro_key" ON "LiteLLM_DailyTagSpend"("tag", "date", "api_key", "model", "custom_llm_provider");

View file

@ -0,0 +1,3 @@
-- DropIndex
DROP INDEX "LiteLLM_DailyTagSpend_tag_key";

View file

@ -0,0 +1,3 @@
-- AlterTable
ALTER TABLE "LiteLLM_VerificationToken" ADD COLUMN "allowed_routes" TEXT[] DEFAULT ARRAY[]::TEXT[];

View file

@ -169,6 +169,7 @@ model LiteLLM_VerificationToken {
budget_duration String?
budget_reset_at DateTime?
allowed_cache_controls String[] @default([])
allowed_routes String[] @default([])
model_spend Json @default("{}")
model_max_budget Json @default("{}")
budget_id String?
@ -326,6 +327,8 @@ model LiteLLM_DailyUserSpend {
custom_llm_provider String?
prompt_tokens Int @default(0)
completion_tokens Int @default(0)
cache_read_input_tokens Int @default(0)
cache_creation_input_tokens Int @default(0)
spend Float @default(0.0)
api_requests Int @default(0)
successful_requests Int @default(0)
@ -340,6 +343,60 @@ model LiteLLM_DailyUserSpend {
@@index([model])
}
// Track daily team spend metrics per model and key
model LiteLLM_DailyTeamSpend {
id String @id @default(uuid())
team_id String
date String
api_key String
model String
model_group String?
custom_llm_provider String?
prompt_tokens Int @default(0)
completion_tokens Int @default(0)
cache_read_input_tokens Int @default(0)
cache_creation_input_tokens Int @default(0)
spend Float @default(0.0)
api_requests Int @default(0)
successful_requests Int @default(0)
failed_requests Int @default(0)
created_at DateTime @default(now())
updated_at DateTime @updatedAt
@@unique([team_id, date, api_key, model, custom_llm_provider])
@@index([date])
@@index([team_id])
@@index([api_key])
@@index([model])
}
// Track daily team spend metrics per model and key
model LiteLLM_DailyTagSpend {
id String @id @default(uuid())
tag String
date String
api_key String
model String
model_group String?
custom_llm_provider String?
prompt_tokens Int @default(0)
completion_tokens Int @default(0)
cache_read_input_tokens Int @default(0)
cache_creation_input_tokens Int @default(0)
spend Float @default(0.0)
api_requests Int @default(0)
successful_requests Int @default(0)
failed_requests Int @default(0)
created_at DateTime @default(now())
updated_at DateTime @updatedAt
@@unique([tag, date, api_key, model, custom_llm_provider])
@@index([date])
@@index([tag])
@@index([api_key])
@@index([model])
}
// Track the status of cron jobs running. Only allow one pod to run the job at a time
model LiteLLM_CronJob {

View file

@ -1,6 +1,7 @@
import glob
import os
import random
import re
import subprocess
import time
from pathlib import Path
@ -82,6 +83,26 @@ class ProxyExtrasDBManager:
logger.info(f"Found {len(migration_paths)} migrations at {migrations_dir}")
return [Path(p).parent.name for p in migration_paths]
@staticmethod
def _roll_back_migration(migration_name: str):
"""Mark a specific migration as rolled back"""
subprocess.run(
["prisma", "migrate", "resolve", "--rolled-back", migration_name],
timeout=60,
check=True,
capture_output=True,
)
@staticmethod
def _resolve_specific_migration(migration_name: str):
"""Mark a specific migration as applied"""
subprocess.run(
["prisma", "migrate", "resolve", "--applied", migration_name],
timeout=60,
check=True,
capture_output=True,
)
@staticmethod
def _resolve_all_migrations(migrations_dir: str):
"""Mark all existing migrations as applied"""
@ -141,7 +162,34 @@ class ProxyExtrasDBManager:
return True
except subprocess.CalledProcessError as e:
logger.info(f"prisma db error: {e.stderr}, e: {e.stdout}")
if (
if "P3009" in e.stderr:
# Extract the failed migration name from the error message
migration_match = re.search(
r"`(\d+_.*)` migration", e.stderr
)
if migration_match:
failed_migration = migration_match.group(1)
logger.info(
f"Found failed migration: {failed_migration}, marking as rolled back"
)
# Mark the failed migration as rolled back
subprocess.run(
[
"prisma",
"migrate",
"resolve",
"--rolled-back",
failed_migration,
],
timeout=60,
check=True,
capture_output=True,
text=True,
)
logger.info(
f"✅ Migration {failed_migration} marked as rolled back... retrying"
)
elif (
"P3005" in e.stderr
and "database schema is not empty" in e.stderr
):
@ -155,6 +203,29 @@ class ProxyExtrasDBManager:
ProxyExtrasDBManager._resolve_all_migrations(migrations_dir)
logger.info("✅ All migrations resolved.")
return True
elif (
"P3018" in e.stderr
): # PostgreSQL error code for duplicate column
logger.info(
"Migration already exists, resolving specific migration"
)
# Extract the migration name from the error message
migration_match = re.search(
r"Migration name: (\d+_.*)", e.stderr
)
if migration_match:
migration_name = migration_match.group(1)
logger.info(f"Rolling back migration {migration_name}")
ProxyExtrasDBManager._roll_back_migration(
migration_name
)
logger.info(
f"Resolving migration {migration_name} that failed due to existing columns"
)
ProxyExtrasDBManager._resolve_specific_migration(
migration_name
)
logger.info("✅ Migration resolved.")
else:
# Use prisma db push with increased timeout
subprocess.run(

View file

@ -1,7 +1,7 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
package = []
[metadata]
lock-version = "2.0"
lock-version = "2.1"
python-versions = ">=3.8.1,<4.0, !=3.9.7"
content-hash = "2cf39473e67ff0615f0a61c9d2ac9f02b38cc08cbb1bdb893d89bee002646623"

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm-proxy-extras"
version = "0.1.7"
version = "0.1.11"
description = "Additional files for the LiteLLM Proxy. Reduces the size of the main litellm package."
authors = ["BerriAI"]
readme = "README.md"
@ -22,7 +22,7 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "0.1.7"
version = "0.1.11"
version_files = [
"pyproject.toml:version",
"../requirements.txt:litellm-proxy-extras==",

View file

@ -113,6 +113,8 @@ _custom_logger_compatible_callbacks_literal = Literal[
"pagerduty",
"humanloop",
"gcs_pubsub",
"agentops",
"anthropic_cache_control_hook",
]
logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None
_known_custom_logger_compatible_callbacks: List = list(
@ -162,7 +164,7 @@ token: Optional[str] = (
telemetry = True
max_tokens: int = DEFAULT_MAX_TOKENS # OpenAI Defaults
drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
modify_params = False
modify_params = bool(os.getenv("LITELLM_MODIFY_PARAMS", False))
retry = True
### AUTH ###
api_key: Optional[str] = None
@ -324,6 +326,7 @@ from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map
model_cost = get_model_cost_map(url=model_cost_map_url)
custom_prompt_dict: Dict[str, dict] = {}
check_provider_endpoint = False
####### THREAD-SPECIFIC DATA ####################
@ -413,6 +416,7 @@ deepseek_models: List = []
azure_ai_models: List = []
jina_ai_models: List = []
voyage_models: List = []
infinity_models: List = []
databricks_models: List = []
cloudflare_models: List = []
codestral_models: List = []
@ -554,6 +558,8 @@ def add_known_models():
azure_ai_models.append(key)
elif value.get("litellm_provider") == "voyage":
voyage_models.append(key)
elif value.get("litellm_provider") == "infinity":
infinity_models.append(key)
elif value.get("litellm_provider") == "databricks":
databricks_models.append(key)
elif value.get("litellm_provider") == "cloudflare":
@ -642,6 +648,7 @@ model_list = (
+ deepseek_models
+ azure_ai_models
+ voyage_models
+ infinity_models
+ databricks_models
+ cloudflare_models
+ codestral_models
@ -697,6 +704,7 @@ models_by_provider: dict = {
"mistral": mistral_chat_models,
"azure_ai": azure_ai_models,
"voyage": voyage_models,
"infinity": infinity_models,
"databricks": databricks_models,
"cloudflare": cloudflare_models,
"codestral": codestral_models,
@ -944,9 +952,11 @@ from .llms.topaz.image_variations.transformation import TopazImageVariationConfi
from litellm.llms.openai.completion.transformation import OpenAITextCompletionConfig
from .llms.groq.chat.transformation import GroqChatConfig
from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig
from .llms.infinity.embedding.transformation import InfinityEmbeddingConfig
from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
from .llms.mistral.mistral_chat_transformation import MistralConfig
from .llms.openai.responses.transformation import OpenAIResponsesAPIConfig
from .llms.azure.responses.transformation import AzureOpenAIResponsesAPIConfig
from .llms.openai.chat.o_series_transformation import (
OpenAIOSeriesConfig as OpenAIO1Config, # maintain backwards compatibility
OpenAIOSeriesConfig,

View file

@ -304,6 +304,11 @@ def create_assistants(
"response_format": response_format,
}
# only send params that are not None
create_assistant_data = {
k: v for k, v in create_assistant_data.items() if v is not None
}
response: Optional[Union[Coroutine[Any, Any, Assistant], Assistant]] = None
if custom_llm_provider == "openai":
api_base = (

View file

@ -21,9 +21,18 @@ DEFAULT_MAX_TOKENS = 256 # used when providers need a default
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET = 1024
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET = 2048
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET = 4096
########## Networking constants ##############################################################
_DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600 # 1 hour, re-use the same httpx client for 1 hour
########### v2 Architecture constants for managing writing updates to the database ###########
REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
REDIS_DAILY_TEAM_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_team_spend_update_buffer"
REDIS_DAILY_TAG_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_tag_spend_update_buffer"
MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
MAX_SIZE_IN_MEMORY_QUEUE = 10000
MAX_IN_MEMORY_QUEUE_FLUSH_COUNT = 1000

View file

@ -0,0 +1,3 @@
from .agentops import AgentOps
__all__ = ["AgentOps"]

View file

@ -0,0 +1,118 @@
"""
AgentOps integration for LiteLLM - Provides OpenTelemetry tracing for LLM calls
"""
import os
from dataclasses import dataclass
from typing import Optional, Dict, Any
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
@dataclass
class AgentOpsConfig:
endpoint: str = "https://otlp.agentops.cloud/v1/traces"
api_key: Optional[str] = None
service_name: Optional[str] = None
deployment_environment: Optional[str] = None
auth_endpoint: str = "https://api.agentops.ai/v3/auth/token"
@classmethod
def from_env(cls):
return cls(
endpoint="https://otlp.agentops.cloud/v1/traces",
api_key=os.getenv("AGENTOPS_API_KEY"),
service_name=os.getenv("AGENTOPS_SERVICE_NAME", "agentops"),
deployment_environment=os.getenv("AGENTOPS_ENVIRONMENT", "production"),
auth_endpoint="https://api.agentops.ai/v3/auth/token"
)
class AgentOps(OpenTelemetry):
"""
AgentOps integration - built on top of OpenTelemetry
Example usage:
```python
import litellm
litellm.success_callback = ["agentops"]
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello, how are you?"}],
)
```
"""
def __init__(
self,
config: Optional[AgentOpsConfig] = None,
):
if config is None:
config = AgentOpsConfig.from_env()
# Prefetch JWT token for authentication
jwt_token = None
project_id = None
if config.api_key:
try:
response = self._fetch_auth_token(config.api_key, config.auth_endpoint)
jwt_token = response.get("token")
project_id = response.get("project_id")
except Exception:
pass
headers = f"Authorization=Bearer {jwt_token}" if jwt_token else None
otel_config = OpenTelemetryConfig(
exporter="otlp_http",
endpoint=config.endpoint,
headers=headers
)
# Initialize OpenTelemetry with our config
super().__init__(
config=otel_config,
callback_name="agentops"
)
# Set AgentOps-specific resource attributes
resource_attrs = {
"service.name": config.service_name or "litellm",
"deployment.environment": config.deployment_environment or "production",
"telemetry.sdk.name": "agentops",
}
if project_id:
resource_attrs["project.id"] = project_id
self.resource_attributes = resource_attrs
def _fetch_auth_token(self, api_key: str, auth_endpoint: str) -> Dict[str, Any]:
"""
Fetch JWT authentication token from AgentOps API
Args:
api_key: AgentOps API key
auth_endpoint: Authentication endpoint
Returns:
Dict containing JWT token and project ID
"""
headers = {
"Content-Type": "application/json",
"Connection": "keep-alive",
}
client = _get_httpx_client()
try:
response = client.post(
url=auth_endpoint,
headers=headers,
json={"api_key": api_key},
timeout=10
)
if response.status_code != 200:
raise Exception(f"Failed to fetch auth token: {response.text}")
return response.json()
finally:
client.close()

View file

@ -0,0 +1,150 @@
"""
This hook is used to inject cache control directives into the messages of a chat completion.
Users can define
- `cache_control_injection_points` in the completion params and litellm will inject the cache control directives into the messages at the specified injection points.
"""
import copy
from typing import Dict, List, Optional, Tuple, Union, cast
from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.custom_prompt_management import CustomPromptManagement
from litellm.types.integrations.anthropic_cache_control_hook import (
CacheControlInjectionPoint,
CacheControlMessageInjectionPoint,
)
from litellm.types.llms.openai import AllMessageValues, ChatCompletionCachedContent
from litellm.types.utils import StandardCallbackDynamicParams
class AnthropicCacheControlHook(CustomPromptManagement):
def get_chat_completion_prompt(
self,
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict]:
"""
Apply cache control directives based on specified injection points.
Returns:
- model: str - the model to use
- messages: List[AllMessageValues] - messages with applied cache controls
- non_default_params: dict - params with any global cache controls
"""
# Extract cache control injection points
injection_points: List[CacheControlInjectionPoint] = non_default_params.pop(
"cache_control_injection_points", []
)
if not injection_points:
return model, messages, non_default_params
# Create a deep copy of messages to avoid modifying the original list
processed_messages = copy.deepcopy(messages)
# Process message-level cache controls
for point in injection_points:
if point.get("location") == "message":
point = cast(CacheControlMessageInjectionPoint, point)
processed_messages = self._process_message_injection(
point=point, messages=processed_messages
)
return model, processed_messages, non_default_params
@staticmethod
def _process_message_injection(
point: CacheControlMessageInjectionPoint, messages: List[AllMessageValues]
) -> List[AllMessageValues]:
"""Process message-level cache control injection."""
control: ChatCompletionCachedContent = point.get(
"control", None
) or ChatCompletionCachedContent(type="ephemeral")
_targetted_index: Optional[Union[int, str]] = point.get("index", None)
targetted_index: Optional[int] = None
if isinstance(_targetted_index, str):
if _targetted_index.isdigit():
targetted_index = int(_targetted_index)
else:
targetted_index = _targetted_index
targetted_role = point.get("role", None)
# Case 1: Target by specific index
if targetted_index is not None:
if 0 <= targetted_index < len(messages):
messages[targetted_index] = (
AnthropicCacheControlHook._safe_insert_cache_control_in_message(
messages[targetted_index], control
)
)
# Case 2: Target by role
elif targetted_role is not None:
for msg in messages:
if msg.get("role") == targetted_role:
msg = (
AnthropicCacheControlHook._safe_insert_cache_control_in_message(
message=msg, control=control
)
)
return messages
@staticmethod
def _safe_insert_cache_control_in_message(
message: AllMessageValues, control: ChatCompletionCachedContent
) -> AllMessageValues:
"""
Safe way to insert cache control in a message
OpenAI Message content can be either:
- string
- list of objects
This method handles inserting cache control in both cases.
"""
message_content = message.get("content", None)
# 1. if string, insert cache control in the message
if isinstance(message_content, str):
message["cache_control"] = control # type: ignore
# 2. list of objects
elif isinstance(message_content, list):
for content_item in message_content:
if isinstance(content_item, dict):
content_item["cache_control"] = control # type: ignore
return message
@property
def integration_name(self) -> str:
"""Return the integration name for this hook."""
return "anthropic_cache_control_hook"
@staticmethod
def should_use_anthropic_cache_control_hook(non_default_params: Dict) -> bool:
if non_default_params.get("cache_control_injection_points", None):
return True
return False
@staticmethod
def get_custom_logger_for_anthropic_cache_control_hook(
non_default_params: Dict,
) -> Optional[CustomLogger]:
from litellm.litellm_core_utils.litellm_logging import (
_init_custom_logger_compatible_class,
)
if AnthropicCacheControlHook.should_use_anthropic_cache_control_hook(
non_default_params
):
return _init_custom_logger_compatible_class(
logging_integration="anthropic_cache_control_hook",
internal_usage_cache=None,
llm_router=None,
)
return None

View file

@ -1,14 +1,15 @@
import asyncio
import json
import os
import time
import uuid
from datetime import datetime, timedelta
from typing import List, Optional
from litellm._logging import verbose_logger
from litellm.constants import AZURE_STORAGE_MSFT_VERSION
from litellm.constants import _DEFAULT_TTL_FOR_HTTPX_CLIENTS, AZURE_STORAGE_MSFT_VERSION
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.llms.azure.common_utils import get_azure_ad_token_from_entrata_id
from litellm.llms.azure.common_utils import get_azure_ad_token_from_entra_id
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
get_async_httpx_client,
@ -48,14 +49,17 @@ class AzureBlobStorageLogger(CustomBatchLogger):
"Missing required environment variable: AZURE_STORAGE_FILE_SYSTEM"
)
self.azure_storage_file_system: str = _azure_storage_file_system
self._service_client = None
# Time that the azure service client expires, in order to reset the connection pool and keep it fresh
self._service_client_timeout: Optional[float] = None
# Internal variables used for Token based authentication
self.azure_auth_token: Optional[
str
] = None # the Azure AD token to use for Azure Storage API requests
self.token_expiry: Optional[
datetime
] = None # the expiry time of the currentAzure AD token
self.azure_auth_token: Optional[str] = (
None # the Azure AD token to use for Azure Storage API requests
)
self.token_expiry: Optional[datetime] = (
None # the expiry time of the currentAzure AD token
)
asyncio.create_task(self.periodic_flush())
self.flush_lock = asyncio.Lock()
@ -291,7 +295,7 @@ class AzureBlobStorageLogger(CustomBatchLogger):
"Missing required environment variable: AZURE_STORAGE_CLIENT_SECRET"
)
token_provider = get_azure_ad_token_from_entrata_id(
token_provider = get_azure_ad_token_from_entra_id(
tenant_id=tenant_id,
client_id=client_id,
client_secret=client_secret,
@ -324,6 +328,25 @@ class AzureBlobStorageLogger(CustomBatchLogger):
f"AzureBlobStorageLogger is only available for premium users. {CommonProxyErrors.not_premium_user}"
)
async def get_service_client(self):
from azure.storage.filedatalake.aio import DataLakeServiceClient
# expire old clients to recover from connection issues
if (
self._service_client_timeout
and self._service_client
and self._service_client_timeout > time.time()
):
await self._service_client.close()
self._service_client = None
if not self._service_client:
self._service_client = DataLakeServiceClient(
account_url=f"https://{self.azure_storage_account_name}.dfs.core.windows.net",
credential=self.azure_storage_account_key,
)
self._service_client_timeout = time.time() + _DEFAULT_TTL_FOR_HTTPX_CLIENTS
return self._service_client
async def upload_to_azure_data_lake_with_azure_account_key(
self, payload: StandardLoggingPayload
):
@ -332,13 +355,10 @@ class AzureBlobStorageLogger(CustomBatchLogger):
This is used when Azure Storage Account Key is set - Azure Storage Account Key does not work directly with Azure Rest API
"""
from azure.storage.filedatalake.aio import DataLakeServiceClient
# Create an async service client
service_client = DataLakeServiceClient(
account_url=f"https://{self.azure_storage_account_name}.dfs.core.windows.net",
credential=self.azure_storage_account_key,
)
service_client = await self.get_service_client()
# Get file system client
file_system_client = service_client.get_file_system_client(
file_system=self.azure_storage_file_system

View file

@ -94,7 +94,7 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict]:

View file

@ -15,7 +15,7 @@ class CustomPromptManagement(CustomLogger, PromptManagementBase):
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict]:

View file

@ -75,7 +75,7 @@ class GcsPubSubLogger(CustomBatchLogger):
vertex_project,
) = await vertex_chat_completion._ensure_access_token_async(
credentials=self.path_service_account_json,
project_id=None,
project_id=self.project_id,
custom_llm_provider="vertex_ai",
)

View file

@ -152,14 +152,21 @@ class HumanloopLogger(CustomLogger):
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict,]:
) -> Tuple[
str,
List[AllMessageValues],
dict,
]:
humanloop_api_key = dynamic_callback_params.get(
"humanloop_api_key"
) or get_secret_str("HUMANLOOP_API_KEY")
if prompt_id is None:
raise ValueError("prompt_id is required for Humanloop integration")
if humanloop_api_key is None:
return super().get_chat_completion_prompt(
model=model,

View file

@ -169,10 +169,14 @@ class LangfusePromptManagement(LangFuseLogger, PromptManagementBase, CustomLogge
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict,]:
) -> Tuple[
str,
List[AllMessageValues],
dict,
]:
return self.get_chat_completion_prompt(
model,
messages,

View file

@ -79,10 +79,12 @@ class PromptManagementBase(ABC):
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict,]:
) -> Tuple[str, List[AllMessageValues], dict]:
if prompt_id is None:
raise ValueError("prompt_id is required for Prompt Management Base class")
if not self.should_run_prompt_management(
prompt_id=prompt_id, dynamic_callback_params=dynamic_callback_params
):

View file

@ -221,6 +221,8 @@ def get_supported_openai_params( # noqa: PLR0915
return litellm.PredibaseConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "voyage":
return litellm.VoyageEmbeddingConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "infinity":
return litellm.InfinityEmbeddingConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "triton":
if request_type == "embeddings":
return litellm.TritonEmbeddingConfig().get_supported_openai_params(

View file

@ -28,6 +28,7 @@ from litellm._logging import _is_debugging_on, verbose_logger
from litellm.batches.batch_utils import _handle_completed_batch
from litellm.caching.caching import DualCache, InMemoryCache
from litellm.caching.caching_handler import LLMCachingHandler
from litellm.constants import (
DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
@ -36,6 +37,8 @@ from litellm.cost_calculator import (
RealtimeAPITokenUsageProcessor,
_select_model_name_for_cost_calc,
)
from litellm.integrations.agentops import AgentOps
from litellm.integrations.anthropic_cache_control_hook import AnthropicCacheControlHook
from litellm.integrations.arize.arize import ArizeLogger
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger
@ -249,9 +252,9 @@ class Logging(LiteLLMLoggingBaseClass):
self.litellm_trace_id = litellm_trace_id
self.function_id = function_id
self.streaming_chunks: List[Any] = [] # for generating complete stream response
self.sync_streaming_chunks: List[
Any
] = [] # for generating complete stream response
self.sync_streaming_chunks: List[Any] = (
[]
) # for generating complete stream response
self.log_raw_request_response = log_raw_request_response
# Initialize dynamic callbacks
@ -455,19 +458,38 @@ class Logging(LiteLLMLoggingBaseClass):
if "custom_llm_provider" in self.model_call_details:
self.custom_llm_provider = self.model_call_details["custom_llm_provider"]
def should_run_prompt_management_hooks(
self,
non_default_params: Dict,
prompt_id: Optional[str] = None,
) -> bool:
"""
Return True if prompt management hooks should be run
"""
if prompt_id:
return True
if AnthropicCacheControlHook.should_use_anthropic_cache_control_hook(
non_default_params
):
return True
return False
def get_chat_completion_prompt(
self,
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
non_default_params: Dict,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
prompt_management_logger: Optional[CustomLogger] = None,
) -> Tuple[str, List[AllMessageValues], dict]:
custom_logger = (
prompt_management_logger
or self.get_custom_logger_for_prompt_management(model)
or self.get_custom_logger_for_prompt_management(
model=model, non_default_params=non_default_params
)
)
if custom_logger:
(
model,
@ -476,7 +498,7 @@ class Logging(LiteLLMLoggingBaseClass):
) = custom_logger.get_chat_completion_prompt(
model=model,
messages=messages,
non_default_params=non_default_params,
non_default_params=non_default_params or {},
prompt_id=prompt_id,
prompt_variables=prompt_variables,
dynamic_callback_params=self.standard_callback_dynamic_params,
@ -485,7 +507,7 @@ class Logging(LiteLLMLoggingBaseClass):
return model, messages, non_default_params
def get_custom_logger_for_prompt_management(
self, model: str
self, model: str, non_default_params: Dict
) -> Optional[CustomLogger]:
"""
Get a custom logger for prompt management based on model name or available callbacks.
@ -520,6 +542,26 @@ class Logging(LiteLLMLoggingBaseClass):
self.model_call_details["prompt_integration"] = logger.__class__.__name__
return logger
if anthropic_cache_control_logger := AnthropicCacheControlHook.get_custom_logger_for_anthropic_cache_control_hook(
non_default_params
):
self.model_call_details["prompt_integration"] = (
anthropic_cache_control_logger.__class__.__name__
)
return anthropic_cache_control_logger
return None
def get_custom_logger_for_anthropic_cache_control_hook(
self, non_default_params: Dict
) -> Optional[CustomLogger]:
if non_default_params.get("cache_control_injection_points", None):
custom_logger = _init_custom_logger_compatible_class(
logging_integration="anthropic_cache_control_hook",
internal_usage_cache=None,
llm_router=None,
)
return custom_logger
return None
def _get_raw_request_body(self, data: Optional[Union[dict, str]]) -> dict:
@ -557,9 +599,9 @@ class Logging(LiteLLMLoggingBaseClass):
model
): # if model name was changes pre-call, overwrite the initial model call name with the new one
self.model_call_details["model"] = model
self.model_call_details["litellm_params"][
"api_base"
] = self._get_masked_api_base(additional_args.get("api_base", ""))
self.model_call_details["litellm_params"]["api_base"] = (
self._get_masked_api_base(additional_args.get("api_base", ""))
)
def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915
# Log the exact input to the LLM API
@ -588,10 +630,10 @@ class Logging(LiteLLMLoggingBaseClass):
try:
# [Non-blocking Extra Debug Information in metadata]
if turn_off_message_logging is True:
_metadata[
"raw_request"
] = "redacted by litellm. \
_metadata["raw_request"] = (
"redacted by litellm. \
'litellm.turn_off_message_logging=True'"
)
else:
curl_command = self._get_request_curl_command(
api_base=additional_args.get("api_base", ""),
@ -602,32 +644,32 @@ class Logging(LiteLLMLoggingBaseClass):
_metadata["raw_request"] = str(curl_command)
# split up, so it's easier to parse in the UI
self.model_call_details[
"raw_request_typed_dict"
] = RawRequestTypedDict(
raw_request_api_base=str(
additional_args.get("api_base") or ""
),
raw_request_body=self._get_raw_request_body(
additional_args.get("complete_input_dict", {})
),
raw_request_headers=self._get_masked_headers(
additional_args.get("headers", {}) or {},
ignore_sensitive_headers=True,
),
error=None,
self.model_call_details["raw_request_typed_dict"] = (
RawRequestTypedDict(
raw_request_api_base=str(
additional_args.get("api_base") or ""
),
raw_request_body=self._get_raw_request_body(
additional_args.get("complete_input_dict", {})
),
raw_request_headers=self._get_masked_headers(
additional_args.get("headers", {}) or {},
ignore_sensitive_headers=True,
),
error=None,
)
)
except Exception as e:
self.model_call_details[
"raw_request_typed_dict"
] = RawRequestTypedDict(
error=str(e),
self.model_call_details["raw_request_typed_dict"] = (
RawRequestTypedDict(
error=str(e),
)
)
_metadata[
"raw_request"
] = "Unable to Log \
_metadata["raw_request"] = (
"Unable to Log \
raw request: {}".format(
str(e)
str(e)
)
)
if self.logger_fn and callable(self.logger_fn):
try:
@ -957,9 +999,9 @@ class Logging(LiteLLMLoggingBaseClass):
verbose_logger.debug(
f"response_cost_failure_debug_information: {debug_info}"
)
self.model_call_details[
"response_cost_failure_debug_information"
] = debug_info
self.model_call_details["response_cost_failure_debug_information"] = (
debug_info
)
return None
try:
@ -984,9 +1026,9 @@ class Logging(LiteLLMLoggingBaseClass):
verbose_logger.debug(
f"response_cost_failure_debug_information: {debug_info}"
)
self.model_call_details[
"response_cost_failure_debug_information"
] = debug_info
self.model_call_details["response_cost_failure_debug_information"] = (
debug_info
)
return None
@ -1046,9 +1088,9 @@ class Logging(LiteLLMLoggingBaseClass):
end_time = datetime.datetime.now()
if self.completion_start_time is None:
self.completion_start_time = end_time
self.model_call_details[
"completion_start_time"
] = self.completion_start_time
self.model_call_details["completion_start_time"] = (
self.completion_start_time
)
self.model_call_details["log_event_type"] = "successful_api_call"
self.model_call_details["end_time"] = end_time
self.model_call_details["cache_hit"] = cache_hit
@ -1127,39 +1169,39 @@ class Logging(LiteLLMLoggingBaseClass):
"response_cost"
]
else:
self.model_call_details[
"response_cost"
] = self._response_cost_calculator(result=logging_result)
self.model_call_details["response_cost"] = (
self._response_cost_calculator(result=logging_result)
)
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=logging_result,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=logging_result,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
elif isinstance(result, dict) or isinstance(result, list):
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=result,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=result,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
elif standard_logging_object is not None:
self.model_call_details[
"standard_logging_object"
] = standard_logging_object
self.model_call_details["standard_logging_object"] = (
standard_logging_object
)
else: # streaming chunks + image gen.
self.model_call_details["response_cost"] = None
@ -1215,23 +1257,23 @@ class Logging(LiteLLMLoggingBaseClass):
verbose_logger.debug(
"Logging Details LiteLLM-Success Call streaming complete"
)
self.model_call_details[
"complete_streaming_response"
] = complete_streaming_response
self.model_call_details[
"response_cost"
] = self._response_cost_calculator(result=complete_streaming_response)
self.model_call_details["complete_streaming_response"] = (
complete_streaming_response
)
self.model_call_details["response_cost"] = (
self._response_cost_calculator(result=complete_streaming_response)
)
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=complete_streaming_response,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=complete_streaming_response,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
callbacks = self.get_combined_callback_list(
dynamic_success_callbacks=self.dynamic_success_callbacks,
@ -1580,10 +1622,10 @@ class Logging(LiteLLMLoggingBaseClass):
)
else:
if self.stream and complete_streaming_response:
self.model_call_details[
"complete_response"
] = self.model_call_details.get(
"complete_streaming_response", {}
self.model_call_details["complete_response"] = (
self.model_call_details.get(
"complete_streaming_response", {}
)
)
result = self.model_call_details["complete_response"]
openMeterLogger.log_success_event(
@ -1623,10 +1665,10 @@ class Logging(LiteLLMLoggingBaseClass):
)
else:
if self.stream and complete_streaming_response:
self.model_call_details[
"complete_response"
] = self.model_call_details.get(
"complete_streaming_response", {}
self.model_call_details["complete_response"] = (
self.model_call_details.get(
"complete_streaming_response", {}
)
)
result = self.model_call_details["complete_response"]
@ -1733,9 +1775,9 @@ class Logging(LiteLLMLoggingBaseClass):
if complete_streaming_response is not None:
print_verbose("Async success callbacks: Got a complete streaming response")
self.model_call_details[
"async_complete_streaming_response"
] = complete_streaming_response
self.model_call_details["async_complete_streaming_response"] = (
complete_streaming_response
)
try:
if self.model_call_details.get("cache_hit", False) is True:
self.model_call_details["response_cost"] = 0.0
@ -1745,10 +1787,10 @@ class Logging(LiteLLMLoggingBaseClass):
model_call_details=self.model_call_details
)
# base_model defaults to None if not set on model_info
self.model_call_details[
"response_cost"
] = self._response_cost_calculator(
result=complete_streaming_response
self.model_call_details["response_cost"] = (
self._response_cost_calculator(
result=complete_streaming_response
)
)
verbose_logger.debug(
@ -1761,16 +1803,16 @@ class Logging(LiteLLMLoggingBaseClass):
self.model_call_details["response_cost"] = None
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=complete_streaming_response,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=complete_streaming_response,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
callbacks = self.get_combined_callback_list(
dynamic_success_callbacks=self.dynamic_async_success_callbacks,
@ -1976,18 +2018,18 @@ class Logging(LiteLLMLoggingBaseClass):
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj={},
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="failure",
error_str=str(exception),
original_exception=exception,
standard_built_in_tools_params=self.standard_built_in_tools_params,
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj={},
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="failure",
error_str=str(exception),
original_exception=exception,
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
return start_time, end_time
@ -2645,7 +2687,15 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
"""
try:
custom_logger_init_args = custom_logger_init_args or {}
if logging_integration == "lago":
if logging_integration == "agentops": # Add AgentOps initialization
for callback in _in_memory_loggers:
if isinstance(callback, AgentOps):
return callback # type: ignore
agentops_logger = AgentOps()
_in_memory_loggers.append(agentops_logger)
return agentops_logger # type: ignore
elif logging_integration == "lago":
for callback in _in_memory_loggers:
if isinstance(callback, LagoLogger):
return callback # type: ignore
@ -2753,9 +2803,9 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
endpoint=arize_config.endpoint,
)
os.environ[
"OTEL_EXPORTER_OTLP_TRACES_HEADERS"
] = f"space_key={arize_config.space_key},api_key={arize_config.api_key}"
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
f"space_key={arize_config.space_key},api_key={arize_config.api_key}"
)
for callback in _in_memory_loggers:
if (
isinstance(callback, ArizeLogger)
@ -2779,9 +2829,9 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
# auth can be disabled on local deployments of arize phoenix
if arize_phoenix_config.otlp_auth_headers is not None:
os.environ[
"OTEL_EXPORTER_OTLP_TRACES_HEADERS"
] = arize_phoenix_config.otlp_auth_headers
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
arize_phoenix_config.otlp_auth_headers
)
for callback in _in_memory_loggers:
if (
@ -2872,9 +2922,9 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
exporter="otlp_http",
endpoint="https://langtrace.ai/api/trace",
)
os.environ[
"OTEL_EXPORTER_OTLP_TRACES_HEADERS"
] = f"api_key={os.getenv('LANGTRACE_API_KEY')}"
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
f"api_key={os.getenv('LANGTRACE_API_KEY')}"
)
for callback in _in_memory_loggers:
if (
isinstance(callback, OpenTelemetry)
@ -2908,6 +2958,13 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
pagerduty_logger = PagerDutyAlerting(**custom_logger_init_args)
_in_memory_loggers.append(pagerduty_logger)
return pagerduty_logger # type: ignore
elif logging_integration == "anthropic_cache_control_hook":
for callback in _in_memory_loggers:
if isinstance(callback, AnthropicCacheControlHook):
return callback
anthropic_cache_control_hook = AnthropicCacheControlHook()
_in_memory_loggers.append(anthropic_cache_control_hook)
return anthropic_cache_control_hook # type: ignore
elif logging_integration == "gcs_pubsub":
for callback in _in_memory_loggers:
if isinstance(callback, GcsPubSubLogger):
@ -3046,6 +3103,10 @@ def get_custom_logger_compatible_class( # noqa: PLR0915
for callback in _in_memory_loggers:
if isinstance(callback, PagerDutyAlerting):
return callback
elif logging_integration == "anthropic_cache_control_hook":
for callback in _in_memory_loggers:
if isinstance(callback, AnthropicCacheControlHook):
return callback
elif logging_integration == "gcs_pubsub":
for callback in _in_memory_loggers:
if isinstance(callback, GcsPubSubLogger):
@ -3369,10 +3430,10 @@ class StandardLoggingPayloadSetup:
for key in StandardLoggingHiddenParams.__annotations__.keys():
if key in hidden_params:
if key == "additional_headers":
clean_hidden_params[
"additional_headers"
] = StandardLoggingPayloadSetup.get_additional_headers(
hidden_params[key]
clean_hidden_params["additional_headers"] = (
StandardLoggingPayloadSetup.get_additional_headers(
hidden_params[key]
)
)
else:
clean_hidden_params[key] = hidden_params[key] # type: ignore
@ -3651,7 +3712,7 @@ def emit_standard_logging_payload(payload: StandardLoggingPayload):
def get_standard_logging_metadata(
metadata: Optional[Dict[str, Any]]
metadata: Optional[Dict[str, Any]],
) -> StandardLoggingMetadata:
"""
Clean and filter the metadata dictionary to include only the specified keys in StandardLoggingMetadata.
@ -3715,9 +3776,9 @@ def scrub_sensitive_keys_in_metadata(litellm_params: Optional[dict]):
):
for k, v in metadata["user_api_key_metadata"].items():
if k == "logging": # prevent logging user logging keys
cleaned_user_api_key_metadata[
k
] = "scrubbed_by_litellm_for_sensitive_keys"
cleaned_user_api_key_metadata[k] = (
"scrubbed_by_litellm_for_sensitive_keys"
)
else:
cleaned_user_api_key_metadata[k] = v

View file

@ -265,8 +265,10 @@ def generic_cost_per_token(
)
## CALCULATE OUTPUT COST
text_tokens = usage.completion_tokens
text_tokens = 0
audio_tokens = 0
reasoning_tokens = 0
is_text_tokens_total = False
if usage.completion_tokens_details is not None:
audio_tokens = (
cast(
@ -280,9 +282,20 @@ def generic_cost_per_token(
Optional[int],
getattr(usage.completion_tokens_details, "text_tokens", None),
)
or usage.completion_tokens # default to completion tokens, if this field is not set
or 0 # default to completion tokens, if this field is not set
)
reasoning_tokens = (
cast(
Optional[int],
getattr(usage.completion_tokens_details, "reasoning_tokens", 0),
)
or 0
)
if text_tokens == 0:
text_tokens = usage.completion_tokens
if text_tokens == usage.completion_tokens:
is_text_tokens_total = True
## TEXT COST
completion_cost = float(text_tokens) * completion_base_cost
@ -290,12 +303,26 @@ def generic_cost_per_token(
"output_cost_per_audio_token"
)
_output_cost_per_reasoning_token: Optional[float] = model_info.get(
"output_cost_per_reasoning_token"
)
## AUDIO COST
if (
_output_cost_per_audio_token is not None
and audio_tokens is not None
and audio_tokens > 0
):
if not is_text_tokens_total and audio_tokens is not None and audio_tokens > 0:
_output_cost_per_audio_token = (
_output_cost_per_audio_token
if _output_cost_per_audio_token is not None
else completion_base_cost
)
completion_cost += float(audio_tokens) * _output_cost_per_audio_token
## REASONING COST
if not is_text_tokens_total and reasoning_tokens and reasoning_tokens > 0:
_output_cost_per_reasoning_token = (
_output_cost_per_reasoning_token
if _output_cost_per_reasoning_token is not None
else completion_base_cost
)
completion_cost += float(reasoning_tokens) * _output_cost_per_reasoning_token
return prompt_cost, completion_cost

View file

@ -14,6 +14,7 @@ from litellm.types.llms.openai import ChatCompletionThinkingBlock
from litellm.types.utils import (
ChatCompletionDeltaToolCall,
ChatCompletionMessageToolCall,
ChatCompletionRedactedThinkingBlock,
Choices,
Delta,
EmbeddingResponse,
@ -486,7 +487,14 @@ def convert_to_model_response_object( # noqa: PLR0915
)
# Handle thinking models that display `thinking_blocks` within `content`
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[
ChatCompletionThinkingBlock,
ChatCompletionRedactedThinkingBlock,
]
]
] = None
if "thinking_blocks" in choice["message"]:
thinking_blocks = choice["message"]["thinking_blocks"]
provider_specific_fields["thinking_blocks"] = thinking_blocks

View file

@ -75,6 +75,10 @@ class ModelParamHelper:
combined_kwargs = combined_kwargs.difference(exclude_kwargs)
return combined_kwargs
@staticmethod
def get_litellm_provider_specific_params_for_chat_params() -> Set[str]:
return set(["thinking"])
@staticmethod
def _get_litellm_supported_chat_completion_kwargs() -> Set[str]:
"""
@ -82,11 +86,18 @@ class ModelParamHelper:
This follows the OpenAI API Spec
"""
all_chat_completion_kwargs = set(
non_streaming_params: Set[str] = set(
getattr(CompletionCreateParamsNonStreaming, "__annotations__", {}).keys()
).union(
set(getattr(CompletionCreateParamsStreaming, "__annotations__", {}).keys())
)
streaming_params: Set[str] = set(
getattr(CompletionCreateParamsStreaming, "__annotations__", {}).keys()
)
litellm_provider_specific_params: Set[str] = (
ModelParamHelper.get_litellm_provider_specific_params_for_chat_params()
)
all_chat_completion_kwargs: Set[str] = non_streaming_params.union(
streaming_params
).union(litellm_provider_specific_params)
return all_chat_completion_kwargs
@staticmethod

View file

@ -471,3 +471,59 @@ def unpack_defs(schema, defs):
unpack_defs(ref, defs)
value["items"] = ref
continue
def _get_image_mime_type_from_url(url: str) -> Optional[str]:
"""
Get mime type for common image URLs
See gemini mime types: https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/image-understanding#image-requirements
Supported by Gemini:
application/pdf
audio/mpeg
audio/mp3
audio/wav
image/png
image/jpeg
image/webp
text/plain
video/mov
video/mpeg
video/mp4
video/mpg
video/avi
video/wmv
video/mpegps
video/flv
"""
url = url.lower()
# Map file extensions to mime types
mime_types = {
# Images
(".jpg", ".jpeg"): "image/jpeg",
(".png",): "image/png",
(".webp",): "image/webp",
# Videos
(".mp4",): "video/mp4",
(".mov",): "video/mov",
(".mpeg", ".mpg"): "video/mpeg",
(".avi",): "video/avi",
(".wmv",): "video/wmv",
(".mpegps",): "video/mpegps",
(".flv",): "video/flv",
# Audio
(".mp3",): "audio/mp3",
(".wav",): "audio/wav",
(".mpeg",): "audio/mpeg",
# Documents
(".pdf",): "application/pdf",
(".txt",): "text/plain",
}
# Check each extension group against the URL
for extensions, mime_type in mime_types.items():
if any(url.endswith(ext) for ext in extensions):
return mime_type
return None

View file

@ -2258,6 +2258,14 @@ def _parse_content_type(content_type: str) -> str:
return m.get_content_type()
def _parse_mime_type(base64_data: str) -> Optional[str]:
mime_type_match = re.match(r"data:(.*?);base64", base64_data)
if mime_type_match:
return mime_type_match.group(1)
else:
return None
class BedrockImageProcessor:
"""Handles both sync and async image processing for Bedrock conversations."""

View file

@ -106,74 +106,63 @@ class ChunkProcessor:
def get_combined_tool_content(
self, tool_call_chunks: List[Dict[str, Any]]
) -> List[ChatCompletionMessageToolCall]:
argument_list: List[str] = []
delta = tool_call_chunks[0]["choices"][0]["delta"]
id = None
name = None
type = None
tool_calls_list: List[ChatCompletionMessageToolCall] = []
prev_index = None
prev_name = None
prev_id = None
curr_id = None
curr_index = 0
tool_call_map: Dict[
int, Dict[str, Any]
] = {} # Map to store tool calls by index
for chunk in tool_call_chunks:
choices = chunk["choices"]
for choice in choices:
delta = choice.get("delta", {})
tool_calls = delta.get("tool_calls", "")
# Check if a tool call is present
if tool_calls and tool_calls[0].function is not None:
if tool_calls[0].id:
id = tool_calls[0].id
curr_id = id
if prev_id is None:
prev_id = curr_id
if tool_calls[0].index:
curr_index = tool_calls[0].index
if tool_calls[0].function.arguments:
# Now, tool_calls is expected to be a dictionary
arguments = tool_calls[0].function.arguments
argument_list.append(arguments)
if tool_calls[0].function.name:
name = tool_calls[0].function.name
if tool_calls[0].type:
type = tool_calls[0].type
if prev_index is None:
prev_index = curr_index
if prev_name is None:
prev_name = name
if curr_index != prev_index: # new tool call
combined_arguments = "".join(argument_list)
tool_calls = delta.get("tool_calls", [])
for tool_call in tool_calls:
if not tool_call or not hasattr(tool_call, "function"):
continue
index = getattr(tool_call, "index", 0)
if index not in tool_call_map:
tool_call_map[index] = {
"id": None,
"name": None,
"type": None,
"arguments": [],
}
if hasattr(tool_call, "id") and tool_call.id:
tool_call_map[index]["id"] = tool_call.id
if hasattr(tool_call, "type") and tool_call.type:
tool_call_map[index]["type"] = tool_call.type
if hasattr(tool_call, "function"):
if (
hasattr(tool_call.function, "name")
and tool_call.function.name
):
tool_call_map[index]["name"] = tool_call.function.name
if (
hasattr(tool_call.function, "arguments")
and tool_call.function.arguments
):
tool_call_map[index]["arguments"].append(
tool_call.function.arguments
)
# Convert the map to a list of tool calls
for index in sorted(tool_call_map.keys()):
tool_call_data = tool_call_map[index]
if tool_call_data["id"] and tool_call_data["name"]:
combined_arguments = "".join(tool_call_data["arguments"]) or "{}"
tool_calls_list.append(
ChatCompletionMessageToolCall(
id=prev_id,
id=tool_call_data["id"],
function=Function(
arguments=combined_arguments,
name=prev_name,
name=tool_call_data["name"],
),
type=type,
type=tool_call_data["type"] or "function",
)
)
argument_list = [] # reset
prev_index = curr_index
prev_id = curr_id
prev_name = name
combined_arguments = (
"".join(argument_list) or "{}"
) # base case, return empty dict
tool_calls_list.append(
ChatCompletionMessageToolCall(
id=id,
type="function",
function=Function(
arguments=combined_arguments,
name=name,
),
)
)
return tool_calls_list

View file

@ -29,6 +29,7 @@ from litellm.types.llms.anthropic import (
UsageDelta,
)
from litellm.types.llms.openai import (
ChatCompletionRedactedThinkingBlock,
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
)
@ -501,18 +502,19 @@ class ModelResponseIterator:
) -> Tuple[
str,
Optional[ChatCompletionToolCallChunk],
List[ChatCompletionThinkingBlock],
List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]],
Dict[str, Any],
]:
"""
Helper function to handle the content block delta
"""
text = ""
tool_use: Optional[ChatCompletionToolCallChunk] = None
provider_specific_fields = {}
content_block = ContentBlockDelta(**chunk) # type: ignore
thinking_blocks: List[ChatCompletionThinkingBlock] = []
thinking_blocks: List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
] = []
self.content_blocks.append(content_block)
if "text" in content_block["delta"]:
@ -541,20 +543,25 @@ class ModelResponseIterator:
)
]
provider_specific_fields["thinking_blocks"] = thinking_blocks
return text, tool_use, thinking_blocks, provider_specific_fields
def _handle_reasoning_content(
self, thinking_blocks: List[ChatCompletionThinkingBlock]
self,
thinking_blocks: List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
],
) -> Optional[str]:
"""
Handle the reasoning content
"""
reasoning_content = None
for block in thinking_blocks:
thinking_content = cast(Optional[str], block.get("thinking"))
if reasoning_content is None:
reasoning_content = ""
if "thinking" in block:
reasoning_content += block["thinking"]
if thinking_content is not None:
reasoning_content += thinking_content
return reasoning_content
def chunk_parser(self, chunk: dict) -> ModelResponseStream:
@ -567,7 +574,13 @@ class ModelResponseIterator:
usage: Optional[Usage] = None
provider_specific_fields: Dict[str, Any] = {}
reasoning_content: Optional[str] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[
ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock
]
]
] = None
index = int(chunk.get("index", 0))
if type_chunk == "content_block_delta":
@ -605,6 +618,15 @@ class ModelResponseIterator:
},
"index": self.tool_index,
}
elif (
content_block_start["content_block"]["type"] == "redacted_thinking"
):
thinking_blocks = [
ChatCompletionRedactedThinkingBlock(
type="redacted_thinking",
data=content_block_start["content_block"]["data"],
)
]
elif type_chunk == "content_block_stop":
ContentBlockStop(**chunk) # type: ignore
# check if tool call content block

View file

@ -7,6 +7,9 @@ import httpx
import litellm
from litellm.constants import (
DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
RESPONSE_FORMAT_TOOL_NAME,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
@ -27,6 +30,7 @@ from litellm.types.llms.openai import (
REASONING_EFFORT,
AllMessageValues,
ChatCompletionCachedContent,
ChatCompletionRedactedThinkingBlock,
ChatCompletionSystemMessage,
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
@ -44,7 +48,7 @@ from litellm.utils import (
token_counter,
)
from ..common_utils import AnthropicError, process_anthropic_headers
from ..common_utils import AnthropicError, AnthropicModelInfo, process_anthropic_headers
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
@ -54,7 +58,7 @@ else:
LoggingClass = Any
class AnthropicConfig(BaseConfig):
class AnthropicConfig(AnthropicModelInfo, BaseConfig):
"""
Reference: https://docs.anthropic.com/claude/reference/messages_post
@ -127,41 +131,6 @@ class AnthropicConfig(BaseConfig):
"anthropic-beta": "prompt-caching-2024-07-31",
}
def get_anthropic_headers(
self,
api_key: str,
anthropic_version: Optional[str] = None,
computer_tool_used: bool = False,
prompt_caching_set: bool = False,
pdf_used: bool = False,
is_vertex_request: bool = False,
user_anthropic_beta_headers: Optional[List[str]] = None,
) -> dict:
betas = set()
if prompt_caching_set:
betas.add("prompt-caching-2024-07-31")
if computer_tool_used:
betas.add("computer-use-2024-10-22")
if pdf_used:
betas.add("pdfs-2024-09-25")
headers = {
"anthropic-version": anthropic_version or "2023-06-01",
"x-api-key": api_key,
"accept": "application/json",
"content-type": "application/json",
}
if user_anthropic_beta_headers is not None:
betas.update(user_anthropic_beta_headers)
# Don't send any beta headers to Vertex, Vertex has failed requests when they are sent
if is_vertex_request is True:
pass
elif len(betas) > 0:
headers["anthropic-beta"] = ",".join(betas)
return headers
def _map_tool_choice(
self, tool_choice: Optional[str], parallel_tool_use: Optional[bool]
) -> Optional[AnthropicMessagesToolChoice]:
@ -311,11 +280,20 @@ class AnthropicConfig(BaseConfig):
if reasoning_effort is None:
return None
elif reasoning_effort == "low":
return AnthropicThinkingParam(type="enabled", budget_tokens=1024)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
)
elif reasoning_effort == "medium":
return AnthropicThinkingParam(type="enabled", budget_tokens=2048)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
)
elif reasoning_effort == "high":
return AnthropicThinkingParam(type="enabled", budget_tokens=4096)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
)
else:
raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}")
@ -446,49 +424,6 @@ class AnthropicConfig(BaseConfig):
)
return _tool
def is_cache_control_set(self, messages: List[AllMessageValues]) -> bool:
"""
Return if {"cache_control": ..} in message content block
Used to check if anthropic prompt caching headers need to be set.
"""
for message in messages:
if message.get("cache_control", None) is not None:
return True
_message_content = message.get("content")
if _message_content is not None and isinstance(_message_content, list):
for content in _message_content:
if "cache_control" in content:
return True
return False
def is_computer_tool_used(
self, tools: Optional[List[AllAnthropicToolsValues]]
) -> bool:
if tools is None:
return False
for tool in tools:
if "type" in tool and tool["type"].startswith("computer_"):
return True
return False
def is_pdf_used(self, messages: List[AllMessageValues]) -> bool:
"""
Set to true if media passed into messages.
"""
for message in messages:
if (
"content" in message
and message["content"] is not None
and isinstance(message["content"], list)
):
for content in message["content"]:
if "type" in content and content["type"] != "text":
return True
return False
def translate_system_message(
self, messages: List[AllMessageValues]
) -> List[AnthropicSystemMessageContent]:
@ -641,13 +576,21 @@ class AnthropicConfig(BaseConfig):
) -> Tuple[
str,
Optional[List[Any]],
Optional[List[ChatCompletionThinkingBlock]],
Optional[
List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
],
Optional[str],
List[ChatCompletionToolCallChunk],
]:
text_content = ""
citations: Optional[List[Any]] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
] = None
reasoning_content: Optional[str] = None
tool_calls: List[ChatCompletionToolCallChunk] = []
for idx, content in enumerate(completion_response["content"]):
@ -666,20 +609,30 @@ class AnthropicConfig(BaseConfig):
index=idx,
)
)
## CITATIONS
if content.get("citations", None) is not None:
if citations is None:
citations = []
citations.append(content["citations"])
if content.get("thinking", None) is not None:
elif content.get("thinking", None) is not None:
if thinking_blocks is None:
thinking_blocks = []
thinking_blocks.append(cast(ChatCompletionThinkingBlock, content))
elif content["type"] == "redacted_thinking":
if thinking_blocks is None:
thinking_blocks = []
thinking_blocks.append(
cast(ChatCompletionRedactedThinkingBlock, content)
)
## CITATIONS
if content.get("citations") is not None:
if citations is None:
citations = []
citations.append(content["citations"])
if thinking_blocks is not None:
reasoning_content = ""
for block in thinking_blocks:
if "thinking" in block:
reasoning_content += block["thinking"]
thinking_content = cast(Optional[str], block.get("thinking"))
if thinking_content is not None:
reasoning_content += thinking_content
return text_content, citations, thinking_blocks, reasoning_content, tool_calls
def calculate_usage(
@ -769,7 +722,13 @@ class AnthropicConfig(BaseConfig):
else:
text_content = ""
citations: Optional[List[Any]] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[
ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock
]
]
] = None
reasoning_content: Optional[str] = None
tool_calls: List[ChatCompletionToolCallChunk] = []
@ -862,47 +821,3 @@ class AnthropicConfig(BaseConfig):
message=error_message,
headers=cast(httpx.Headers, headers),
)
def _get_user_anthropic_beta_headers(
self, anthropic_beta_header: Optional[str]
) -> Optional[List[str]]:
if anthropic_beta_header is None:
return None
return anthropic_beta_header.split(",")
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> Dict:
if api_key is None:
raise litellm.AuthenticationError(
message="Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params. Please set `ANTHROPIC_API_KEY` in your environment vars",
llm_provider="anthropic",
model=model,
)
tools = optional_params.get("tools")
prompt_caching_set = self.is_cache_control_set(messages=messages)
computer_tool_used = self.is_computer_tool_used(tools=tools)
pdf_used = self.is_pdf_used(messages=messages)
user_anthropic_beta_headers = self._get_user_anthropic_beta_headers(
anthropic_beta_header=headers.get("anthropic-beta")
)
anthropic_headers = self.get_anthropic_headers(
computer_tool_used=computer_tool_used,
prompt_caching_set=prompt_caching_set,
pdf_used=pdf_used,
api_key=api_key,
is_vertex_request=optional_params.get("is_vertex_request", False),
user_anthropic_beta_headers=user_anthropic_beta_headers,
)
headers = {**headers, **anthropic_headers}
return headers

View file

@ -2,7 +2,7 @@
This file contains common utils for anthropic calls.
"""
from typing import List, Optional, Union
from typing import Dict, List, Optional, Union
import httpx
@ -10,6 +10,8 @@ import litellm
from litellm.llms.base_llm.base_utils import BaseLLMModelInfo
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.anthropic import AllAnthropicToolsValues
from litellm.types.llms.openai import AllMessageValues
class AnthropicError(BaseLLMException):
@ -23,6 +25,128 @@ class AnthropicError(BaseLLMException):
class AnthropicModelInfo(BaseLLMModelInfo):
def is_cache_control_set(self, messages: List[AllMessageValues]) -> bool:
"""
Return if {"cache_control": ..} in message content block
Used to check if anthropic prompt caching headers need to be set.
"""
for message in messages:
if message.get("cache_control", None) is not None:
return True
_message_content = message.get("content")
if _message_content is not None and isinstance(_message_content, list):
for content in _message_content:
if "cache_control" in content:
return True
return False
def is_computer_tool_used(
self, tools: Optional[List[AllAnthropicToolsValues]]
) -> bool:
if tools is None:
return False
for tool in tools:
if "type" in tool and tool["type"].startswith("computer_"):
return True
return False
def is_pdf_used(self, messages: List[AllMessageValues]) -> bool:
"""
Set to true if media passed into messages.
"""
for message in messages:
if (
"content" in message
and message["content"] is not None
and isinstance(message["content"], list)
):
for content in message["content"]:
if "type" in content and content["type"] != "text":
return True
return False
def _get_user_anthropic_beta_headers(
self, anthropic_beta_header: Optional[str]
) -> Optional[List[str]]:
if anthropic_beta_header is None:
return None
return anthropic_beta_header.split(",")
def get_anthropic_headers(
self,
api_key: str,
anthropic_version: Optional[str] = None,
computer_tool_used: bool = False,
prompt_caching_set: bool = False,
pdf_used: bool = False,
is_vertex_request: bool = False,
user_anthropic_beta_headers: Optional[List[str]] = None,
) -> dict:
betas = set()
if prompt_caching_set:
betas.add("prompt-caching-2024-07-31")
if computer_tool_used:
betas.add("computer-use-2024-10-22")
if pdf_used:
betas.add("pdfs-2024-09-25")
headers = {
"anthropic-version": anthropic_version or "2023-06-01",
"x-api-key": api_key,
"accept": "application/json",
"content-type": "application/json",
}
if user_anthropic_beta_headers is not None:
betas.update(user_anthropic_beta_headers)
# Don't send any beta headers to Vertex, Vertex has failed requests when they are sent
if is_vertex_request is True:
pass
elif len(betas) > 0:
headers["anthropic-beta"] = ",".join(betas)
return headers
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> Dict:
if api_key is None:
raise litellm.AuthenticationError(
message="Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params. Please set `ANTHROPIC_API_KEY` in your environment vars",
llm_provider="anthropic",
model=model,
)
tools = optional_params.get("tools")
prompt_caching_set = self.is_cache_control_set(messages=messages)
computer_tool_used = self.is_computer_tool_used(tools=tools)
pdf_used = self.is_pdf_used(messages=messages)
user_anthropic_beta_headers = self._get_user_anthropic_beta_headers(
anthropic_beta_header=headers.get("anthropic-beta")
)
anthropic_headers = self.get_anthropic_headers(
computer_tool_used=computer_tool_used,
prompt_caching_set=prompt_caching_set,
pdf_used=pdf_used,
api_key=api_key,
is_vertex_request=optional_params.get("is_vertex_request", False),
user_anthropic_beta_headers=user_anthropic_beta_headers,
)
headers = {**headers, **anthropic_headers}
return headers
@staticmethod
def get_api_base(api_base: Optional[str] = None) -> Optional[str]:
return (

View file

@ -1,6 +1,6 @@
"""
- call /messages on Anthropic API
- Make streaming + non-streaming request - just pass it through direct to Anthropic. No need to do anything special here
- Make streaming + non-streaming request - just pass it through direct to Anthropic. No need to do anything special here
- Ensure requests are logged in the DB - stream + non-stream
"""
@ -43,7 +43,9 @@ class AnthropicMessagesHandler:
from litellm.proxy.pass_through_endpoints.success_handler import (
PassThroughEndpointLogging,
)
from litellm.proxy.pass_through_endpoints.types import EndpointType
from litellm.types.passthrough_endpoints.pass_through_endpoints import (
EndpointType,
)
# Create success handler object
passthrough_success_handler_obj = PassThroughEndpointLogging()
@ -98,11 +100,11 @@ async def anthropic_messages(
api_base=optional_params.api_base,
api_key=optional_params.api_key,
)
anthropic_messages_provider_config: Optional[
BaseAnthropicMessagesConfig
] = ProviderConfigManager.get_provider_anthropic_messages_config(
model=model,
provider=litellm.LlmProviders(_custom_llm_provider),
anthropic_messages_provider_config: Optional[BaseAnthropicMessagesConfig] = (
ProviderConfigManager.get_provider_anthropic_messages_config(
model=model,
provider=litellm.LlmProviders(_custom_llm_provider),
)
)
if anthropic_messages_provider_config is None:
raise ValueError(

View file

@ -288,6 +288,7 @@ class AzureAssistantsAPI(BaseAzureLLM):
timeout=timeout,
max_retries=max_retries,
client=client,
litellm_params=litellm_params,
)
thread_message: OpenAIMessage = openai_client.beta.threads.messages.create( # type: ignore

View file

@ -125,14 +125,22 @@ class AzureOpenAIConfig(BaseConfig):
) -> bool:
"""
- check if api_version is supported for response_format
- returns True if the API version is equal to or newer than the supported version
"""
api_year = int(api_version_year)
api_month = int(api_version_month)
supported_year = int(API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT)
supported_month = int(API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT)
is_supported = (
int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
)
return is_supported
# If the year is greater than supported year, it's definitely supported
if api_year > supported_year:
return True
# If the year is less than supported year, it's not supported
elif api_year < supported_year:
return False
# If same year, check if month is >= supported month
else:
return api_month >= supported_month
def map_openai_params(
self,
@ -202,6 +210,7 @@ class AzureOpenAIConfig(BaseConfig):
is_response_format_supported_api_version
and _is_response_format_supported_model
)
optional_params = self._add_response_format_to_tools(
optional_params=optional_params,
value=value,

View file

@ -79,7 +79,7 @@ class AzureOpenAIO1Config(OpenAIOSeriesConfig):
return True
def is_o_series_model(self, model: str) -> bool:
return "o1" in model or "o3" in model or "o_series/" in model
return "o1" in model or "o3" in model or "o4" in model or "o_series/" in model
def transform_request(
self,

View file

@ -61,7 +61,7 @@ def process_azure_headers(headers: Union[httpx.Headers, dict]) -> dict:
return {**llm_response_headers, **openai_headers}
def get_azure_ad_token_from_entrata_id(
def get_azure_ad_token_from_entra_id(
tenant_id: str,
client_id: str,
client_secret: str,
@ -81,7 +81,7 @@ def get_azure_ad_token_from_entrata_id(
"""
from azure.identity import ClientSecretCredential, get_bearer_token_provider
verbose_logger.debug("Getting Azure AD Token from Entrata ID")
verbose_logger.debug("Getting Azure AD Token from Entra ID")
if tenant_id.startswith("os.environ/"):
_tenant_id = get_secret_str(tenant_id)
@ -309,21 +309,30 @@ class BaseAzureLLM(BaseOpenAILLM):
azure_ad_token_provider: Optional[Callable[[], str]] = None
# If we have api_key, then we have higher priority
azure_ad_token = litellm_params.get("azure_ad_token")
tenant_id = litellm_params.get("tenant_id")
client_id = litellm_params.get("client_id")
client_secret = litellm_params.get("client_secret")
azure_username = litellm_params.get("azure_username")
azure_password = litellm_params.get("azure_password")
tenant_id = litellm_params.get("tenant_id", os.getenv("AZURE_TENANT_ID"))
client_id = litellm_params.get("client_id", os.getenv("AZURE_CLIENT_ID"))
client_secret = litellm_params.get(
"client_secret", os.getenv("AZURE_CLIENT_SECRET")
)
azure_username = litellm_params.get(
"azure_username", os.getenv("AZURE_USERNAME")
)
azure_password = litellm_params.get(
"azure_password", os.getenv("AZURE_PASSWORD")
)
max_retries = litellm_params.get("max_retries")
timeout = litellm_params.get("timeout")
if not api_key and tenant_id and client_id and client_secret:
verbose_logger.debug("Using Azure AD Token Provider for Azure Auth")
azure_ad_token_provider = get_azure_ad_token_from_entrata_id(
verbose_logger.debug(
"Using Azure AD Token Provider from Entra ID for Azure Auth"
)
azure_ad_token_provider = get_azure_ad_token_from_entra_id(
tenant_id=tenant_id,
client_id=client_id,
client_secret=client_secret,
)
if azure_username and azure_password and client_id:
verbose_logger.debug("Using Azure Username and Password for Azure Auth")
azure_ad_token_provider = get_azure_ad_token_from_username_password(
azure_username=azure_username,
azure_password=azure_password,
@ -331,12 +340,16 @@ class BaseAzureLLM(BaseOpenAILLM):
)
if azure_ad_token is not None and azure_ad_token.startswith("oidc/"):
verbose_logger.debug("Using Azure OIDC Token for Azure Auth")
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
elif (
not api_key
and azure_ad_token_provider is None
and litellm.enable_azure_ad_token_refresh is True
):
verbose_logger.debug(
"Using Azure AD token provider based on Service Principal with Secret workflow for Azure Auth"
)
try:
azure_ad_token_provider = get_azure_ad_token_provider()
except ValueError:

View file

@ -0,0 +1,138 @@
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, cast
import httpx
import litellm
from litellm._logging import verbose_logger
from litellm.llms.openai.responses.transformation import OpenAIResponsesAPIConfig
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import *
from litellm.types.responses.main import *
from litellm.types.router import GenericLiteLLMParams
from litellm.utils import _add_path_to_api_base
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
LiteLLMLoggingObj = _LiteLLMLoggingObj
else:
LiteLLMLoggingObj = Any
class AzureOpenAIResponsesAPIConfig(OpenAIResponsesAPIConfig):
def validate_environment(
self,
headers: dict,
model: str,
api_key: Optional[str] = None,
) -> dict:
api_key = (
api_key
or litellm.api_key
or litellm.azure_key
or get_secret_str("AZURE_OPENAI_API_KEY")
or get_secret_str("AZURE_API_KEY")
)
headers.update(
{
"Authorization": f"Bearer {api_key}",
}
)
return headers
def get_complete_url(
self,
api_base: Optional[str],
litellm_params: dict,
) -> str:
"""
Constructs a complete URL for the API request.
Args:
- api_base: Base URL, e.g.,
"https://litellm8397336933.openai.azure.com"
OR
"https://litellm8397336933.openai.azure.com/openai/responses?api-version=2024-05-01-preview"
- model: Model name.
- optional_params: Additional query parameters, including "api_version".
- stream: If streaming is required (optional).
Returns:
- A complete URL string, e.g.,
"https://litellm8397336933.openai.azure.com/openai/responses?api-version=2024-05-01-preview"
"""
api_base = api_base or litellm.api_base or get_secret_str("AZURE_API_BASE")
if api_base is None:
raise ValueError(
f"api_base is required for Azure AI Studio. Please set the api_base parameter. Passed `api_base={api_base}`"
)
original_url = httpx.URL(api_base)
# Extract api_version or use default
api_version = cast(Optional[str], litellm_params.get("api_version"))
# Create a new dictionary with existing params
query_params = dict(original_url.params)
# Add api_version if needed
if "api-version" not in query_params and api_version:
query_params["api-version"] = api_version
# Add the path to the base URL
if "/openai/responses" not in api_base:
new_url = _add_path_to_api_base(
api_base=api_base, ending_path="/openai/responses"
)
else:
new_url = api_base
# Use the new query_params dictionary
final_url = httpx.URL(new_url).copy_with(params=query_params)
return str(final_url)
#########################################################
########## DELETE RESPONSE API TRANSFORMATION ##############
#########################################################
def transform_delete_response_api_request(
self,
response_id: str,
api_base: str,
litellm_params: GenericLiteLLMParams,
headers: dict,
) -> Tuple[str, Dict]:
"""
Transform the delete response API request into a URL and data
Azure OpenAI API expects the following request:
- DELETE /openai/responses/{response_id}?api-version=xxx
This function handles URLs with query parameters by inserting the response_id
at the correct location (before any query parameters).
"""
from urllib.parse import urlparse, urlunparse
# Parse the URL to separate its components
parsed_url = urlparse(api_base)
# Insert the response_id at the end of the path component
# Remove trailing slash if present to avoid double slashes
path = parsed_url.path.rstrip("/")
new_path = f"{path}/{response_id}"
# Reconstruct the URL with all original components but with the modified path
delete_url = urlunparse(
(
parsed_url.scheme, # http, https
parsed_url.netloc, # domain name, port
new_path, # path with response_id added
parsed_url.params, # parameters
parsed_url.query, # query string
parsed_url.fragment, # fragment
)
)
data: Dict = {}
verbose_logger.debug(f"delete response url={delete_url}")
return delete_url, data

View file

@ -1,3 +1,4 @@
import enum
from typing import Any, List, Optional, Tuple, cast
from urllib.parse import urlparse
@ -19,6 +20,10 @@ from litellm.types.utils import ModelResponse, ProviderField
from litellm.utils import _add_path_to_api_base, supports_tool_choice
class AzureFoundryErrorStrings(str, enum.Enum):
SET_EXTRA_PARAMETERS_TO_PASS_THROUGH = "Set extra-parameters to 'pass-through'"
class AzureAIStudioConfig(OpenAIConfig):
def get_supported_openai_params(self, model: str) -> List:
model_supports_tool_choice = True # azure ai supports this by default
@ -240,12 +245,18 @@ class AzureAIStudioConfig(OpenAIConfig):
) -> bool:
should_drop_params = litellm_params.get("drop_params") or litellm.drop_params
error_text = e.response.text
if should_drop_params and "Extra inputs are not permitted" in error_text:
return True
elif (
"unknown field: parameter index is not a valid field" in error_text
): # remove index from tool calls
return True
elif (
AzureFoundryErrorStrings.SET_EXTRA_PARAMETERS_TO_PASS_THROUGH.value
in error_text
): # remove extra-parameters from tool calls
return True
return super().should_retry_llm_api_inside_llm_translation_on_http_error(
e=e, litellm_params=litellm_params
)
@ -265,5 +276,46 @@ class AzureAIStudioConfig(OpenAIConfig):
litellm.remove_index_from_tool_calls(
messages=_messages,
)
elif (
AzureFoundryErrorStrings.SET_EXTRA_PARAMETERS_TO_PASS_THROUGH.value
in e.response.text
):
request_data = self._drop_extra_params_from_request_data(
request_data, e.response.text
)
data = drop_params_from_unprocessable_entity_error(e=e, data=request_data)
return data
def _drop_extra_params_from_request_data(
self, request_data: dict, error_text: str
) -> dict:
params_to_drop = self._extract_params_to_drop_from_error_text(error_text)
if params_to_drop:
for param in params_to_drop:
if param in request_data:
request_data.pop(param, None)
return request_data
def _extract_params_to_drop_from_error_text(
self, error_text: str
) -> Optional[List[str]]:
"""
Error text looks like this"
"Extra parameters ['stream_options', 'extra-parameters'] are not allowed when extra-parameters is not set or set to be 'error'.
"""
import re
# Extract parameters within square brackets
match = re.search(r"\[(.*?)\]", error_text)
if not match:
return []
# Parse the extracted string into a list of parameter names
params_str = match.group(1)
params = []
for param in params_str.split(","):
# Clean up the parameter name (remove quotes, spaces)
clean_param = param.strip().strip("'").strip('"')
if clean_param:
params.append(clean_param)
return params

View file

@ -1,9 +1,16 @@
import json
from abc import abstractmethod
from typing import Optional, Union
from typing import List, Optional, Union, cast
import litellm
from litellm.types.utils import GenericStreamingChunk, ModelResponseStream
from litellm.types.utils import (
Choices,
Delta,
GenericStreamingChunk,
ModelResponse,
ModelResponseStream,
StreamingChoices,
)
class BaseModelResponseIterator:
@ -121,6 +128,59 @@ class BaseModelResponseIterator:
raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
class MockResponseIterator: # for returning ai21 streaming responses
def __init__(
self, model_response: ModelResponse, json_mode: Optional[bool] = False
):
self.model_response = model_response
self.json_mode = json_mode
self.is_done = False
# Sync iterator
def __iter__(self):
return self
def _chunk_parser(self, chunk_data: ModelResponse) -> ModelResponseStream:
try:
streaming_choices: List[StreamingChoices] = []
for choice in chunk_data.choices:
streaming_choices.append(
StreamingChoices(
index=choice.index,
delta=Delta(
**cast(Choices, choice).message.model_dump(),
),
finish_reason=choice.finish_reason,
)
)
processed_chunk = ModelResponseStream(
id=chunk_data.id,
object="chat.completion",
created=chunk_data.created,
model=chunk_data.model,
choices=streaming_choices,
)
return processed_chunk
except Exception as e:
raise ValueError(f"Failed to decode chunk: {chunk_data}. Error: {e}")
def __next__(self):
if self.is_done:
raise StopIteration
self.is_done = True
return self._chunk_parser(self.model_response)
# Async iterator
def __aiter__(self):
return self
async def __anext__(self):
if self.is_done:
raise StopAsyncIteration
self.is_done = True
return self._chunk_parser(self.model_response)
class FakeStreamResponseIterator:
def __init__(self, model_response, json_mode: Optional[bool] = False):
self.model_response = model_response

View file

@ -44,6 +44,19 @@ class BaseLLMModelInfo(ABC):
def get_api_base(api_base: Optional[str] = None) -> Optional[str]:
pass
@abstractmethod
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
pass
@staticmethod
@abstractmethod
def get_base_model(model: str) -> Optional[str]:

View file

@ -1,6 +1,6 @@
import types
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
import httpx
@ -10,6 +10,7 @@ from litellm.types.llms.openai import (
ResponsesAPIResponse,
ResponsesAPIStreamingResponse,
)
from litellm.types.responses.main import *
from litellm.types.router import GenericLiteLLMParams
if TYPE_CHECKING:
@ -73,8 +74,7 @@ class BaseResponsesAPIConfig(ABC):
def get_complete_url(
self,
api_base: Optional[str],
model: str,
stream: Optional[bool] = None,
litellm_params: dict,
) -> str:
"""
OPTIONAL
@ -119,6 +119,31 @@ class BaseResponsesAPIConfig(ABC):
"""
pass
#########################################################
########## DELETE RESPONSE API TRANSFORMATION ##############
#########################################################
@abstractmethod
def transform_delete_response_api_request(
self,
response_id: str,
api_base: str,
litellm_params: GenericLiteLLMParams,
headers: dict,
) -> Tuple[str, Dict]:
pass
@abstractmethod
def transform_delete_response_api_response(
self,
raw_response: httpx.Response,
logging_obj: LiteLLMLoggingObj,
) -> DeleteResponseResult:
pass
#########################################################
########## END DELETE RESPONSE API TRANSFORMATION ##########
#########################################################
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:

View file

@ -22,6 +22,7 @@ from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMExcepti
from litellm.types.llms.bedrock import *
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionRedactedThinkingBlock,
ChatCompletionResponseMessage,
ChatCompletionSystemMessage,
ChatCompletionThinkingBlock,
@ -375,25 +376,27 @@ class AmazonConverseConfig(BaseConfig):
system_content_blocks: List[SystemContentBlock] = []
for idx, message in enumerate(messages):
if message["role"] == "system":
_system_content_block: Optional[SystemContentBlock] = None
_cache_point_block: Optional[SystemContentBlock] = None
if isinstance(message["content"], str) and len(message["content"]) > 0:
_system_content_block = SystemContentBlock(text=message["content"])
_cache_point_block = self._get_cache_point_block(
system_prompt_indices.append(idx)
if isinstance(message["content"], str) and message["content"]:
system_content_blocks.append(
SystemContentBlock(text=message["content"])
)
cache_block = self._get_cache_point_block(
message, block_type="system"
)
if cache_block:
system_content_blocks.append(cache_block)
elif isinstance(message["content"], list):
for m in message["content"]:
if m.get("type", "") == "text" and len(m["text"]) > 0:
_system_content_block = SystemContentBlock(text=m["text"])
_cache_point_block = self._get_cache_point_block(
if m.get("type") == "text" and m.get("text"):
system_content_blocks.append(
SystemContentBlock(text=m["text"])
)
cache_block = self._get_cache_point_block(
m, block_type="system"
)
if _system_content_block is not None:
system_content_blocks.append(_system_content_block)
if _cache_point_block is not None:
system_content_blocks.append(_cache_point_block)
system_prompt_indices.append(idx)
if cache_block:
system_content_blocks.append(cache_block)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
@ -627,9 +630,11 @@ class AmazonConverseConfig(BaseConfig):
def _transform_thinking_blocks(
self, thinking_blocks: List[BedrockConverseReasoningContentBlock]
) -> List[ChatCompletionThinkingBlock]:
) -> List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]]:
"""Return a consistent format for thinking blocks between Anthropic and Bedrock."""
thinking_blocks_list: List[ChatCompletionThinkingBlock] = []
thinking_blocks_list: List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
] = []
for block in thinking_blocks:
if "reasoningText" in block:
_thinking_block = ChatCompletionThinkingBlock(type="thinking")
@ -640,6 +645,11 @@ class AmazonConverseConfig(BaseConfig):
if _signature is not None:
_thinking_block["signature"] = _signature
thinking_blocks_list.append(_thinking_block)
elif "redactedContent" in block:
_redacted_block = ChatCompletionRedactedThinkingBlock(
type="redacted_thinking", data=block["redactedContent"]
)
thinking_blocks_list.append(_redacted_block)
return thinking_blocks_list
def _transform_usage(self, usage: ConverseTokenUsageBlock) -> Usage:

View file

@ -50,6 +50,7 @@ from litellm.llms.custom_httpx.http_handler import (
)
from litellm.types.llms.bedrock import *
from litellm.types.llms.openai import (
ChatCompletionRedactedThinkingBlock,
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
@ -1255,19 +1256,33 @@ class AWSEventStreamDecoder:
def translate_thinking_blocks(
self, thinking_block: BedrockConverseReasoningContentBlockDelta
) -> Optional[List[ChatCompletionThinkingBlock]]:
) -> Optional[
List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]]
]:
"""
Translate the thinking blocks to a string
"""
thinking_blocks_list: List[ChatCompletionThinkingBlock] = []
_thinking_block = ChatCompletionThinkingBlock(type="thinking")
thinking_blocks_list: List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
] = []
_thinking_block: Optional[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
] = None
if "text" in thinking_block:
_thinking_block = ChatCompletionThinkingBlock(type="thinking")
_thinking_block["thinking"] = thinking_block["text"]
elif "signature" in thinking_block:
_thinking_block = ChatCompletionThinkingBlock(type="thinking")
_thinking_block["signature"] = thinking_block["signature"]
_thinking_block["thinking"] = "" # consistent with anthropic response
thinking_blocks_list.append(_thinking_block)
elif "redactedContent" in thinking_block:
_thinking_block = ChatCompletionRedactedThinkingBlock(
type="redacted_thinking", data=thinking_block["redactedContent"]
)
if _thinking_block is not None:
thinking_blocks_list.append(_thinking_block)
return thinking_blocks_list
def converse_chunk_parser(self, chunk_data: dict) -> ModelResponseStream:
@ -1279,31 +1294,44 @@ class AWSEventStreamDecoder:
usage: Optional[Usage] = None
provider_specific_fields: dict = {}
reasoning_content: Optional[str] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[
ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock
]
]
] = None
index = int(chunk_data.get("contentBlockIndex", 0))
if "start" in chunk_data:
start_obj = ContentBlockStartEvent(**chunk_data["start"])
self.content_blocks = [] # reset
if (
start_obj is not None
and "toolUse" in start_obj
and start_obj["toolUse"] is not None
):
## check tool name was formatted by litellm
_response_tool_name = start_obj["toolUse"]["name"]
response_tool_name = get_bedrock_tool_name(
response_tool_name=_response_tool_name
)
tool_use = {
"id": start_obj["toolUse"]["toolUseId"],
"type": "function",
"function": {
"name": response_tool_name,
"arguments": "",
},
"index": index,
}
if start_obj is not None:
if "toolUse" in start_obj and start_obj["toolUse"] is not None:
## check tool name was formatted by litellm
_response_tool_name = start_obj["toolUse"]["name"]
response_tool_name = get_bedrock_tool_name(
response_tool_name=_response_tool_name
)
tool_use = {
"id": start_obj["toolUse"]["toolUseId"],
"type": "function",
"function": {
"name": response_tool_name,
"arguments": "",
},
"index": index,
}
elif (
"reasoningContent" in start_obj
and start_obj["reasoningContent"] is not None
): # redacted thinking can be in start object
thinking_blocks = self.translate_thinking_blocks(
start_obj["reasoningContent"]
)
provider_specific_fields = {
"reasoningContent": start_obj["reasoningContent"],
}
elif "delta" in chunk_data:
delta_obj = ContentBlockDeltaEvent(**chunk_data["delta"])
self.content_blocks.append(delta_obj)

View file

@ -44,7 +44,18 @@ class AmazonBedrockGlobalConfig:
)
def get_ap_regions(self) -> List[str]:
return ["ap-northeast-1", "ap-northeast-2", "ap-northeast-3", "ap-south-1"]
"""
Source: https://www.aws-services.info/bedrock.html
"""
return [
"ap-northeast-1", # Asia Pacific (Tokyo)
"ap-northeast-2", # Asia Pacific (Seoul)
"ap-northeast-3", # Asia Pacific (Osaka)
"ap-south-1", # Asia Pacific (Mumbai)
"ap-south-2", # Asia Pacific (Hyderabad)
"ap-southeast-1", # Asia Pacific (Singapore)
"ap-southeast-2", # Asia Pacific (Sydney)
]
def get_sa_regions(self) -> List[str]:
return ["sa-east-1"]
@ -54,10 +65,14 @@ class AmazonBedrockGlobalConfig:
Source: https://www.aws-services.info/bedrock.html
"""
return [
"eu-west-1",
"eu-west-2",
"eu-west-3",
"eu-central-1",
"eu-west-1", # Europe (Ireland)
"eu-west-2", # Europe (London)
"eu-west-3", # Europe (Paris)
"eu-central-1", # Europe (Frankfurt)
"eu-central-2", # Europe (Zurich)
"eu-south-1", # Europe (Milan)
"eu-south-2", # Europe (Spain)
"eu-north-1", # Europe (Stockholm)
]
def get_ca_regions(self) -> List[str]:
@ -68,11 +83,12 @@ class AmazonBedrockGlobalConfig:
Source: https://www.aws-services.info/bedrock.html
"""
return [
"us-east-2",
"us-east-1",
"us-west-1",
"us-west-2",
"us-gov-west-1",
"us-east-1", # US East (N. Virginia)
"us-east-2", # US East (Ohio)
"us-west-1", # US West (N. California)
"us-west-2", # US West (Oregon)
"us-gov-east-1", # AWS GovCloud (US-East)
"us-gov-west-1", # AWS GovCloud (US-West)
]

View file

@ -0,0 +1,356 @@
import time
from typing import TYPE_CHECKING, Any, AsyncIterator, Iterator, List, Optional, Union
import httpx
import litellm
from litellm.litellm_core_utils.prompt_templates.factory import cohere_messages_pt_v2
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
from litellm.types.llms.cohere import CohereV2ChatResponse
from litellm.types.llms.openai import AllMessageValues, ChatCompletionToolCallChunk
from litellm.types.utils import ModelResponse, Usage
from ..common_utils import CohereError
from ..common_utils import ModelResponseIterator as CohereModelResponseIterator
from ..common_utils import validate_environment as cohere_validate_environment
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
LiteLLMLoggingObj = _LiteLLMLoggingObj
else:
LiteLLMLoggingObj = Any
class CohereV2ChatConfig(BaseConfig):
"""
Configuration class for Cohere's API interface.
Args:
preamble (str, optional): When specified, the default Cohere preamble will be replaced with the provided one.
chat_history (List[Dict[str, str]], optional): A list of previous messages between the user and the model.
generation_id (str, optional): Unique identifier for the generated reply.
response_id (str, optional): Unique identifier for the response.
conversation_id (str, optional): An alternative to chat_history, creates or resumes a persisted conversation.
prompt_truncation (str, optional): Dictates how the prompt will be constructed. Options: 'AUTO', 'AUTO_PRESERVE_ORDER', 'OFF'.
connectors (List[Dict[str, str]], optional): List of connectors (e.g., web-search) to enrich the model's reply.
search_queries_only (bool, optional): When true, the response will only contain a list of generated search queries.
documents (List[Dict[str, str]], optional): A list of relevant documents that the model can cite.
temperature (float, optional): A non-negative float that tunes the degree of randomness in generation.
max_tokens (int, optional): The maximum number of tokens the model will generate as part of the response.
k (int, optional): Ensures only the top k most likely tokens are considered for generation at each step.
p (float, optional): Ensures that only the most likely tokens, with total probability mass of p, are considered for generation.
frequency_penalty (float, optional): Used to reduce repetitiveness of generated tokens.
presence_penalty (float, optional): Used to reduce repetitiveness of generated tokens.
tools (List[Dict[str, str]], optional): A list of available tools (functions) that the model may suggest invoking.
tool_results (List[Dict[str, Any]], optional): A list of results from invoking tools.
seed (int, optional): A seed to assist reproducibility of the model's response.
"""
preamble: Optional[str] = None
chat_history: Optional[list] = None
generation_id: Optional[str] = None
response_id: Optional[str] = None
conversation_id: Optional[str] = None
prompt_truncation: Optional[str] = None
connectors: Optional[list] = None
search_queries_only: Optional[bool] = None
documents: Optional[list] = None
temperature: Optional[int] = None
max_tokens: Optional[int] = None
k: Optional[int] = None
p: Optional[int] = None
frequency_penalty: Optional[int] = None
presence_penalty: Optional[int] = None
tools: Optional[list] = None
tool_results: Optional[list] = None
seed: Optional[int] = None
def __init__(
self,
preamble: Optional[str] = None,
chat_history: Optional[list] = None,
generation_id: Optional[str] = None,
response_id: Optional[str] = None,
conversation_id: Optional[str] = None,
prompt_truncation: Optional[str] = None,
connectors: Optional[list] = None,
search_queries_only: Optional[bool] = None,
documents: Optional[list] = None,
temperature: Optional[int] = None,
max_tokens: Optional[int] = None,
k: Optional[int] = None,
p: Optional[int] = None,
frequency_penalty: Optional[int] = None,
presence_penalty: Optional[int] = None,
tools: Optional[list] = None,
tool_results: Optional[list] = None,
seed: Optional[int] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
return cohere_validate_environment(
headers=headers,
model=model,
messages=messages,
optional_params=optional_params,
api_key=api_key,
)
def get_supported_openai_params(self, model: str) -> List[str]:
return [
"stream",
"temperature",
"max_tokens",
"top_p",
"frequency_penalty",
"presence_penalty",
"stop",
"n",
"tools",
"tool_choice",
"seed",
"extra_headers",
]
def map_openai_params(
self,
non_default_params: dict,
optional_params: dict,
model: str,
drop_params: bool,
) -> dict:
for param, value in non_default_params.items():
if param == "stream":
optional_params["stream"] = value
if param == "temperature":
optional_params["temperature"] = value
if param == "max_tokens":
optional_params["max_tokens"] = value
if param == "n":
optional_params["num_generations"] = value
if param == "top_p":
optional_params["p"] = value
if param == "frequency_penalty":
optional_params["frequency_penalty"] = value
if param == "presence_penalty":
optional_params["presence_penalty"] = value
if param == "stop":
optional_params["stop_sequences"] = value
if param == "tools":
optional_params["tools"] = value
if param == "seed":
optional_params["seed"] = value
return optional_params
def transform_request(
self,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
headers: dict,
) -> dict:
## Load Config
for k, v in litellm.CohereChatConfig.get_config().items():
if (
k not in optional_params
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
most_recent_message, chat_history = cohere_messages_pt_v2(
messages=messages, model=model, llm_provider="cohere_chat"
)
## Handle Tool Calling
if "tools" in optional_params:
_is_function_call = True
cohere_tools = self._construct_cohere_tool(tools=optional_params["tools"])
optional_params["tools"] = cohere_tools
if isinstance(most_recent_message, dict):
optional_params["tool_results"] = [most_recent_message]
elif isinstance(most_recent_message, str):
optional_params["message"] = most_recent_message
## check if chat history message is 'user' and 'tool_results' is given -> force_single_step=True, else cohere api fails
if len(chat_history) > 0 and chat_history[-1]["role"] == "USER":
optional_params["force_single_step"] = True
return optional_params
def transform_response(
self,
model: str,
raw_response: httpx.Response,
model_response: ModelResponse,
logging_obj: LiteLLMLoggingObj,
request_data: dict,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
encoding: Any,
api_key: Optional[str] = None,
json_mode: Optional[bool] = None,
) -> ModelResponse:
try:
raw_response_json = raw_response.json()
except Exception:
raise CohereError(
message=raw_response.text, status_code=raw_response.status_code
)
try:
cohere_v2_chat_response = CohereV2ChatResponse(**raw_response_json) # type: ignore
except Exception:
raise CohereError(message=raw_response.text, status_code=422)
cohere_content = cohere_v2_chat_response["message"].get("content", None)
if cohere_content is not None:
model_response.choices[0].message.content = "".join( # type: ignore
[
content.get("text", "")
for content in cohere_content
if content is not None
]
)
## ADD CITATIONS
if "citations" in cohere_v2_chat_response:
setattr(model_response, "citations", cohere_v2_chat_response["citations"])
## Tool calling response
cohere_tools_response = cohere_v2_chat_response["message"].get("tool_calls", [])
if cohere_tools_response is not None and cohere_tools_response != []:
# convert cohere_tools_response to OpenAI response format
tool_calls: List[ChatCompletionToolCallChunk] = []
for index, tool in enumerate(cohere_tools_response):
tool_call: ChatCompletionToolCallChunk = {
**tool, # type: ignore
"index": index,
}
tool_calls.append(tool_call)
_message = litellm.Message(
tool_calls=tool_calls,
content=None,
)
model_response.choices[0].message = _message # type: ignore
## CALCULATING USAGE - use cohere `billed_units` for returning usage
token_usage = cohere_v2_chat_response["usage"].get("tokens", {})
prompt_tokens = token_usage.get("input_tokens", 0)
completion_tokens = token_usage.get("output_tokens", 0)
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
setattr(model_response, "usage", usage)
return model_response
def _construct_cohere_tool(
self,
tools: Optional[list] = None,
):
if tools is None:
tools = []
cohere_tools = []
for tool in tools:
cohere_tool = self._translate_openai_tool_to_cohere(tool)
cohere_tools.append(cohere_tool)
return cohere_tools
def _translate_openai_tool_to_cohere(
self,
openai_tool: dict,
):
# cohere tools look like this
"""
{
"name": "query_daily_sales_report",
"description": "Connects to a database to retrieve overall sales volumes and sales information for a given day.",
"parameter_definitions": {
"day": {
"description": "Retrieves sales data for this day, formatted as YYYY-MM-DD.",
"type": "str",
"required": True
}
}
}
"""
# OpenAI tools look like this
"""
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
"""
cohere_tool = {
"name": openai_tool["function"]["name"],
"description": openai_tool["function"]["description"],
"parameter_definitions": {},
}
for param_name, param_def in openai_tool["function"]["parameters"][
"properties"
].items():
required_params = (
openai_tool.get("function", {})
.get("parameters", {})
.get("required", [])
)
cohere_param_def = {
"description": param_def.get("description", ""),
"type": param_def.get("type", ""),
"required": param_name in required_params,
}
cohere_tool["parameter_definitions"][param_name] = cohere_param_def
return cohere_tool
def get_model_response_iterator(
self,
streaming_response: Union[Iterator[str], AsyncIterator[str], ModelResponse],
sync_stream: bool,
json_mode: Optional[bool] = False,
):
return CohereModelResponseIterator(
streaming_response=streaming_response,
sync_stream=sync_stream,
json_mode=json_mode,
)
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:
return CohereError(status_code=status_code, message=error_message)

View file

@ -104,19 +104,28 @@ class ModelResponseIterator:
raise RuntimeError(f"Error receiving chunk from stream: {e}")
try:
str_line = chunk
if isinstance(chunk, bytes): # Handle binary data
str_line = chunk.decode("utf-8") # Convert bytes to string
index = str_line.find("data:")
if index != -1:
str_line = str_line[index:]
data_json = json.loads(str_line)
return self.chunk_parser(chunk=data_json)
return self.convert_str_chunk_to_generic_chunk(chunk=chunk)
except StopIteration:
raise StopIteration
except ValueError as e:
raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
def convert_str_chunk_to_generic_chunk(self, chunk: str) -> GenericStreamingChunk:
"""
Convert a string chunk to a GenericStreamingChunk
Note: This is used for Cohere pass through streaming logging
"""
str_line = chunk
if isinstance(chunk, bytes): # Handle binary data
str_line = chunk.decode("utf-8") # Convert bytes to string
index = str_line.find("data:")
if index != -1:
str_line = str_line[index:]
data_json = json.loads(str_line)
return self.chunk_parser(chunk=data_json)
# Async iterator
def __aiter__(self):
self.async_response_iterator = self.streaming_response.__aiter__()
@ -131,15 +140,7 @@ class ModelResponseIterator:
raise RuntimeError(f"Error receiving chunk from stream: {e}")
try:
str_line = chunk
if isinstance(chunk, bytes): # Handle binary data
str_line = chunk.decode("utf-8") # Convert bytes to string
index = str_line.find("data:")
if index != -1:
str_line = str_line[index:]
data_json = json.loads(str_line)
return self.chunk_parser(chunk=data_json)
return self.convert_str_chunk_to_generic_chunk(chunk=chunk)
except StopAsyncIteration:
raise StopAsyncIteration
except ValueError as e:

View file

@ -8,6 +8,7 @@ import httpx
from httpx import USE_CLIENT_DEFAULT, AsyncHTTPTransport, HTTPTransport
import litellm
from litellm.constants import _DEFAULT_TTL_FOR_HTTPX_CLIENTS
from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
from litellm.types.llms.custom_http import *
@ -31,7 +32,6 @@ headers = {
# https://www.python-httpx.org/advanced/timeouts
_DEFAULT_TIMEOUT = httpx.Timeout(timeout=5.0, connect=5.0)
_DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600 # 1 hour, re-use the same httpx client for 1 hour
def mask_sensitive_info(error_message):
@ -650,6 +650,49 @@ class HTTPHandler:
except Exception as e:
raise e
def delete(
self,
url: str,
data: Optional[Union[dict, str]] = None, # type: ignore
json: Optional[dict] = None,
params: Optional[dict] = None,
headers: Optional[dict] = None,
timeout: Optional[Union[float, httpx.Timeout]] = None,
stream: bool = False,
):
try:
if timeout is not None:
req = self.client.build_request(
"DELETE", url, data=data, json=json, params=params, headers=headers, timeout=timeout # type: ignore
)
else:
req = self.client.build_request(
"DELETE", url, data=data, json=json, params=params, headers=headers # type: ignore
)
response = self.client.send(req, stream=stream)
response.raise_for_status()
return response
except httpx.TimeoutException:
raise litellm.Timeout(
message=f"Connection timed out after {timeout} seconds.",
model="default-model-name",
llm_provider="litellm-httpx-handler",
)
except httpx.HTTPStatusError as e:
if stream is True:
setattr(e, "message", mask_sensitive_info(e.response.read()))
setattr(e, "text", mask_sensitive_info(e.response.read()))
else:
error_text = mask_sensitive_info(e.response.text)
setattr(e, "message", error_text)
setattr(e, "text", error_text)
setattr(e, "status_code", e.response.status_code)
raise e
except Exception as e:
raise e
def __del__(self) -> None:
try:
self.close()

View file

@ -11,6 +11,7 @@ from litellm._logging import verbose_logger
from litellm.llms.base_llm.audio_transcription.transformation import (
BaseAudioTranscriptionConfig,
)
from litellm.llms.base_llm.base_model_iterator import MockResponseIterator
from litellm.llms.base_llm.chat.transformation import BaseConfig
from litellm.llms.base_llm.embedding.transformation import BaseEmbeddingConfig
from litellm.llms.base_llm.files.transformation import BaseFilesConfig
@ -35,6 +36,7 @@ from litellm.types.llms.openai import (
ResponsesAPIResponse,
)
from litellm.types.rerank import OptionalRerankParams, RerankResponse
from litellm.types.responses.main import DeleteResponseResult
from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import EmbeddingResponse, FileTypes, TranscriptionResponse
from litellm.utils import CustomStreamWrapper, ModelResponse, ProviderConfigManager
@ -228,11 +230,17 @@ class BaseLLMHTTPHandler:
api_key: Optional[str] = None,
headers: Optional[dict] = {},
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
provider_config: Optional[BaseConfig] = None,
):
json_mode: bool = optional_params.pop("json_mode", False)
extra_body: Optional[dict] = optional_params.pop("extra_body", None)
fake_stream = fake_stream or optional_params.pop("fake_stream", False)
provider_config = ProviderConfigManager.get_provider_chat_config(
model=model, provider=litellm.LlmProviders(custom_llm_provider)
provider_config = (
provider_config
or ProviderConfigManager.get_provider_chat_config(
model=model, provider=litellm.LlmProviders(custom_llm_provider)
)
)
if provider_config is None:
raise ValueError(
@ -267,6 +275,9 @@ class BaseLLMHTTPHandler:
headers=headers,
)
if extra_body is not None:
data = {**data, **extra_body}
headers = provider_config.sign_request(
headers=headers,
optional_params=optional_params,
@ -313,6 +324,7 @@ class BaseLLMHTTPHandler:
),
litellm_params=litellm_params,
json_mode=json_mode,
optional_params=optional_params,
)
else:
@ -374,6 +386,7 @@ class BaseLLMHTTPHandler:
),
litellm_params=litellm_params,
json_mode=json_mode,
optional_params=optional_params,
)
return CustomStreamWrapper(
completion_stream=completion_stream,
@ -422,6 +435,7 @@ class BaseLLMHTTPHandler:
model: str,
messages: list,
logging_obj,
optional_params: dict,
litellm_params: dict,
timeout: Union[float, httpx.Timeout],
fake_stream: bool = False,
@ -453,11 +467,22 @@ class BaseLLMHTTPHandler:
)
if fake_stream is True:
completion_stream = provider_config.get_model_response_iterator(
streaming_response=response.json(),
sync_stream=True,
model_response: ModelResponse = provider_config.transform_response(
model=model,
raw_response=response,
model_response=litellm.ModelResponse(),
logging_obj=logging_obj,
request_data=data,
messages=messages,
optional_params=optional_params,
litellm_params=litellm_params,
encoding=None,
json_mode=json_mode,
)
completion_stream: Any = MockResponseIterator(
model_response=model_response, json_mode=json_mode
)
else:
completion_stream = provider_config.get_model_response_iterator(
streaming_response=response.iter_lines(),
@ -487,6 +512,7 @@ class BaseLLMHTTPHandler:
logging_obj: LiteLLMLoggingObj,
data: dict,
litellm_params: dict,
optional_params: dict,
fake_stream: bool = False,
client: Optional[AsyncHTTPHandler] = None,
json_mode: Optional[bool] = None,
@ -505,6 +531,7 @@ class BaseLLMHTTPHandler:
)
completion_stream, _response_headers = await self.make_async_call_stream_helper(
model=model,
custom_llm_provider=custom_llm_provider,
provider_config=provider_config,
api_base=api_base,
@ -516,6 +543,8 @@ class BaseLLMHTTPHandler:
fake_stream=fake_stream,
client=client,
litellm_params=litellm_params,
optional_params=optional_params,
json_mode=json_mode,
)
streamwrapper = CustomStreamWrapper(
completion_stream=completion_stream,
@ -527,6 +556,7 @@ class BaseLLMHTTPHandler:
async def make_async_call_stream_helper(
self,
model: str,
custom_llm_provider: str,
provider_config: BaseConfig,
api_base: str,
@ -536,8 +566,10 @@ class BaseLLMHTTPHandler:
logging_obj: LiteLLMLoggingObj,
timeout: Union[float, httpx.Timeout],
litellm_params: dict,
optional_params: dict,
fake_stream: bool = False,
client: Optional[AsyncHTTPHandler] = None,
json_mode: Optional[bool] = None,
) -> Tuple[Any, httpx.Headers]:
"""
Helper function for making an async call with stream.
@ -568,8 +600,21 @@ class BaseLLMHTTPHandler:
)
if fake_stream is True:
completion_stream = provider_config.get_model_response_iterator(
streaming_response=response.json(), sync_stream=False
model_response: ModelResponse = provider_config.transform_response(
model=model,
raw_response=response,
model_response=litellm.ModelResponse(),
logging_obj=logging_obj,
request_data=data,
messages=messages,
optional_params=optional_params,
litellm_params=litellm_params,
encoding=None,
json_mode=json_mode,
)
completion_stream: Any = MockResponseIterator(
model_response=model_response, json_mode=json_mode
)
else:
completion_stream = provider_config.get_model_response_iterator(
@ -594,8 +639,12 @@ class BaseLLMHTTPHandler:
"""
Some providers like Bedrock invoke do not support the stream parameter in the request body, we only pass `stream` in the request body the provider supports it.
"""
if fake_stream is True:
return data
# remove 'stream' from data
new_data = data.copy()
new_data.pop("stream", None)
return new_data
if provider_config.supports_stream_param_in_request_body is True:
data["stream"] = True
return data
@ -967,6 +1016,7 @@ class BaseLLMHTTPHandler:
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
_is_async: bool = False,
fake_stream: bool = False,
litellm_metadata: Optional[Dict[str, Any]] = None,
) -> Union[
ResponsesAPIResponse,
BaseResponsesAPIStreamingIterator,
@ -993,6 +1043,7 @@ class BaseLLMHTTPHandler:
timeout=timeout,
client=client if isinstance(client, AsyncHTTPHandler) else None,
fake_stream=fake_stream,
litellm_metadata=litellm_metadata,
)
if client is None or not isinstance(client, HTTPHandler):
@ -1011,9 +1062,12 @@ class BaseLLMHTTPHandler:
if extra_headers:
headers.update(extra_headers)
# Check if streaming is requested
stream = response_api_optional_request_params.get("stream", False)
api_base = responses_api_provider_config.get_complete_url(
api_base=litellm_params.api_base,
model=model,
litellm_params=dict(litellm_params),
)
data = responses_api_provider_config.transform_responses_api_request(
@ -1035,9 +1089,6 @@ class BaseLLMHTTPHandler:
},
)
# Check if streaming is requested
stream = response_api_optional_request_params.get("stream", False)
try:
if stream:
# For streaming, use stream=True in the request
@ -1061,6 +1112,8 @@ class BaseLLMHTTPHandler:
model=model,
logging_obj=logging_obj,
responses_api_provider_config=responses_api_provider_config,
litellm_metadata=litellm_metadata,
custom_llm_provider=custom_llm_provider,
)
return SyncResponsesAPIStreamingIterator(
@ -1068,6 +1121,8 @@ class BaseLLMHTTPHandler:
model=model,
logging_obj=logging_obj,
responses_api_provider_config=responses_api_provider_config,
litellm_metadata=litellm_metadata,
custom_llm_provider=custom_llm_provider,
)
else:
# For non-streaming requests
@ -1104,6 +1159,7 @@ class BaseLLMHTTPHandler:
timeout: Optional[Union[float, httpx.Timeout]] = None,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
fake_stream: bool = False,
litellm_metadata: Optional[Dict[str, Any]] = None,
) -> Union[ResponsesAPIResponse, BaseResponsesAPIStreamingIterator]:
"""
Async version of the responses API handler.
@ -1126,9 +1182,12 @@ class BaseLLMHTTPHandler:
if extra_headers:
headers.update(extra_headers)
# Check if streaming is requested
stream = response_api_optional_request_params.get("stream", False)
api_base = responses_api_provider_config.get_complete_url(
api_base=litellm_params.api_base,
model=model,
litellm_params=dict(litellm_params),
)
data = responses_api_provider_config.transform_responses_api_request(
@ -1149,8 +1208,6 @@ class BaseLLMHTTPHandler:
"headers": headers,
},
)
# Check if streaming is requested
stream = response_api_optional_request_params.get("stream", False)
try:
if stream:
@ -1177,6 +1234,8 @@ class BaseLLMHTTPHandler:
model=model,
logging_obj=logging_obj,
responses_api_provider_config=responses_api_provider_config,
litellm_metadata=litellm_metadata,
custom_llm_provider=custom_llm_provider,
)
# Return the streaming iterator
@ -1185,6 +1244,8 @@ class BaseLLMHTTPHandler:
model=model,
logging_obj=logging_obj,
responses_api_provider_config=responses_api_provider_config,
litellm_metadata=litellm_metadata,
custom_llm_provider=custom_llm_provider,
)
else:
# For non-streaming, proceed as before
@ -1208,6 +1269,163 @@ class BaseLLMHTTPHandler:
logging_obj=logging_obj,
)
async def async_delete_response_api_handler(
self,
response_id: str,
responses_api_provider_config: BaseResponsesAPIConfig,
litellm_params: GenericLiteLLMParams,
logging_obj: LiteLLMLoggingObj,
custom_llm_provider: Optional[str],
extra_headers: Optional[Dict[str, Any]] = None,
extra_body: Optional[Dict[str, Any]] = None,
timeout: Optional[Union[float, httpx.Timeout]] = None,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
_is_async: bool = False,
) -> DeleteResponseResult:
"""
Async version of the delete response API handler.
Uses async HTTP client to make requests.
"""
if client is None or not isinstance(client, AsyncHTTPHandler):
async_httpx_client = get_async_httpx_client(
llm_provider=litellm.LlmProviders(custom_llm_provider),
params={"ssl_verify": litellm_params.get("ssl_verify", None)},
)
else:
async_httpx_client = client
headers = responses_api_provider_config.validate_environment(
api_key=litellm_params.api_key,
headers=extra_headers or {},
model="None",
)
if extra_headers:
headers.update(extra_headers)
api_base = responses_api_provider_config.get_complete_url(
api_base=litellm_params.api_base,
litellm_params=dict(litellm_params),
)
url, data = responses_api_provider_config.transform_delete_response_api_request(
response_id=response_id,
api_base=api_base,
litellm_params=litellm_params,
headers=headers,
)
## LOGGING
logging_obj.pre_call(
input=input,
api_key="",
additional_args={
"complete_input_dict": data,
"api_base": api_base,
"headers": headers,
},
)
try:
response = await async_httpx_client.delete(
url=url, headers=headers, data=json.dumps(data), timeout=timeout
)
except Exception as e:
raise self._handle_error(
e=e,
provider_config=responses_api_provider_config,
)
return responses_api_provider_config.transform_delete_response_api_response(
raw_response=response,
logging_obj=logging_obj,
)
def delete_response_api_handler(
self,
response_id: str,
responses_api_provider_config: BaseResponsesAPIConfig,
litellm_params: GenericLiteLLMParams,
logging_obj: LiteLLMLoggingObj,
custom_llm_provider: Optional[str],
extra_headers: Optional[Dict[str, Any]] = None,
extra_body: Optional[Dict[str, Any]] = None,
timeout: Optional[Union[float, httpx.Timeout]] = None,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
_is_async: bool = False,
) -> Union[DeleteResponseResult, Coroutine[Any, Any, DeleteResponseResult]]:
"""
Async version of the responses API handler.
Uses async HTTP client to make requests.
"""
if _is_async:
return self.async_delete_response_api_handler(
response_id=response_id,
responses_api_provider_config=responses_api_provider_config,
litellm_params=litellm_params,
logging_obj=logging_obj,
custom_llm_provider=custom_llm_provider,
extra_headers=extra_headers,
extra_body=extra_body,
timeout=timeout,
client=client,
)
if client is None or not isinstance(client, HTTPHandler):
sync_httpx_client = _get_httpx_client(
params={"ssl_verify": litellm_params.get("ssl_verify", None)}
)
else:
sync_httpx_client = client
headers = responses_api_provider_config.validate_environment(
api_key=litellm_params.api_key,
headers=extra_headers or {},
model="None",
)
if extra_headers:
headers.update(extra_headers)
api_base = responses_api_provider_config.get_complete_url(
api_base=litellm_params.api_base,
litellm_params=dict(litellm_params),
)
url, data = responses_api_provider_config.transform_delete_response_api_request(
response_id=response_id,
api_base=api_base,
litellm_params=litellm_params,
headers=headers,
)
## LOGGING
logging_obj.pre_call(
input=input,
api_key="",
additional_args={
"complete_input_dict": data,
"api_base": api_base,
"headers": headers,
},
)
try:
response = sync_httpx_client.delete(
url=url, headers=headers, data=json.dumps(data), timeout=timeout
)
except Exception as e:
raise self._handle_error(
e=e,
provider_config=responses_api_provider_config,
)
return responses_api_provider_config.transform_delete_response_api_response(
raw_response=response,
logging_obj=logging_obj,
)
def create_file(
self,
create_file_data: CreateFileRequest,

View file

@ -37,6 +37,7 @@ from litellm.types.llms.databricks import (
)
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionRedactedThinkingBlock,
ChatCompletionThinkingBlock,
ChatCompletionToolChoiceFunctionParam,
ChatCompletionToolChoiceObjectParam,
@ -314,13 +315,24 @@ class DatabricksConfig(DatabricksBase, OpenAILikeChatConfig, AnthropicConfig):
@staticmethod
def extract_reasoning_content(
content: Optional[AllDatabricksContentValues],
) -> Tuple[Optional[str], Optional[List[ChatCompletionThinkingBlock]]]:
) -> Tuple[
Optional[str],
Optional[
List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
],
]:
"""
Extract and return the reasoning content and thinking blocks
"""
if content is None:
return None, None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
] = None
reasoning_content: Optional[str] = None
if isinstance(content, list):
for item in content:

View file

@ -1,15 +1,33 @@
from typing import List, Literal, Optional, Tuple, Union, cast
import json
import uuid
from typing import Any, List, Literal, Optional, Tuple, Union, cast
import httpx
import litellm
from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.litellm_core_utils.llm_response_utils.get_headers import (
get_response_headers,
)
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionImageObject,
ChatCompletionToolParam,
OpenAIChatCompletionToolParam,
)
from litellm.types.utils import ProviderSpecificModelInfo
from litellm.types.utils import (
ChatCompletionMessageToolCall,
Choices,
Function,
Message,
ModelResponse,
ProviderSpecificModelInfo,
)
from ...openai.chat.gpt_transformation import OpenAIGPTConfig
from ..common_utils import FireworksAIException
class FireworksAIConfig(OpenAIGPTConfig):
@ -219,6 +237,94 @@ class FireworksAIConfig(OpenAIGPTConfig):
headers=headers,
)
def _handle_message_content_with_tool_calls(
self,
message: Message,
tool_calls: Optional[List[ChatCompletionToolParam]],
) -> Message:
"""
Fireworks AI sends tool calls in the content field instead of tool_calls
Relevant Issue: https://github.com/BerriAI/litellm/issues/7209#issuecomment-2813208780
"""
if (
tool_calls is not None
and message.content is not None
and message.tool_calls is None
):
try:
function = Function(**json.loads(message.content))
if function.name != RESPONSE_FORMAT_TOOL_NAME and function.name in [
tool["function"]["name"] for tool in tool_calls
]:
tool_call = ChatCompletionMessageToolCall(
function=function, id=str(uuid.uuid4()), type="function"
)
message.tool_calls = [tool_call]
message.content = None
except Exception:
pass
return message
def transform_response(
self,
model: str,
raw_response: httpx.Response,
model_response: ModelResponse,
logging_obj: LiteLLMLoggingObj,
request_data: dict,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
encoding: Any,
api_key: Optional[str] = None,
json_mode: Optional[bool] = None,
) -> ModelResponse:
## LOGGING
logging_obj.post_call(
input=messages,
api_key=api_key,
original_response=raw_response.text,
additional_args={"complete_input_dict": request_data},
)
## RESPONSE OBJECT
try:
completion_response = raw_response.json()
except Exception as e:
response_headers = getattr(raw_response, "headers", None)
raise FireworksAIException(
message="Unable to get json response - {}, Original Response: {}".format(
str(e), raw_response.text
),
status_code=raw_response.status_code,
headers=response_headers,
)
raw_response_headers = dict(raw_response.headers)
additional_headers = get_response_headers(raw_response_headers)
response = ModelResponse(**completion_response)
if response.model is not None:
response.model = "fireworks_ai/" + response.model
## FIREWORKS AI sends tool calls in the content field instead of tool_calls
for choice in response.choices:
cast(
Choices, choice
).message = self._handle_message_content_with_tool_calls(
message=cast(Choices, choice).message,
tool_calls=optional_params.get("tools", None),
)
response._hidden_params = {"additional_headers": additional_headers}
return response
def _get_openai_compatible_provider_info(
self, api_base: Optional[str], api_key: Optional[str]
) -> Tuple[Optional[str], Optional[str]]:

View file

@ -7,6 +7,7 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
)
from litellm.types.llms.openai import AllMessageValues
from litellm.types.llms.vertex_ai import ContentType, PartType
from litellm.utils import supports_reasoning
from ...vertex_ai.gemini.transformation import _gemini_convert_messages_with_history
from ...vertex_ai.gemini.vertex_and_google_ai_studio_gemini import VertexGeminiConfig
@ -67,7 +68,7 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
return super().get_config()
def get_supported_openai_params(self, model: str) -> List[str]:
return [
supported_params = [
"temperature",
"top_p",
"max_tokens",
@ -83,6 +84,10 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
"frequency_penalty",
"modalities",
]
if supports_reasoning(model):
supported_params.append("reasoning_effort")
supported_params.append("thinking")
return supported_params
def map_openai_params(
self,

View file

@ -14,10 +14,10 @@ from litellm.types.llms.openai import (
ChatCompletionToolParamFunctionChunk,
)
from ...openai.chat.gpt_transformation import OpenAIGPTConfig
from ...openai_like.chat.transformation import OpenAILikeChatConfig
class GroqChatConfig(OpenAIGPTConfig):
class GroqChatConfig(OpenAILikeChatConfig):
frequency_penalty: Optional[int] = None
function_call: Optional[Union[str, dict]] = None
functions: Optional[list] = None
@ -57,6 +57,14 @@ class GroqChatConfig(OpenAIGPTConfig):
def get_config(cls):
return super().get_config()
def get_supported_openai_params(self, model: str) -> list:
base_params = super().get_supported_openai_params(model)
try:
base_params.remove("max_retries")
except ValueError:
pass
return base_params
def _transform_messages(self, messages: List[AllMessageValues], model: str) -> List:
for idx, message in enumerate(messages):
"""
@ -124,8 +132,11 @@ class GroqChatConfig(OpenAIGPTConfig):
optional_params: dict,
model: str,
drop_params: bool = False,
replace_max_completion_tokens_with_max_tokens: bool = False, # groq supports max_completion_tokens
) -> dict:
_response_format = non_default_params.get("response_format")
if self._should_fake_stream(non_default_params):
optional_params["fake_stream"] = True
if _response_format is not None and isinstance(_response_format, dict):
json_schema: Optional[dict] = None
if "response_schema" in _response_format:
@ -152,6 +163,8 @@ class GroqChatConfig(OpenAIGPTConfig):
non_default_params.pop(
"response_format", None
) # only remove if it's a json_schema - handled via using groq's tool calling params.
return super().map_openai_params(
optional_params = super().map_openai_params(
non_default_params, optional_params, model, drop_params
)
return optional_params

View file

@ -2,9 +2,19 @@
Translate from OpenAI's `/v1/chat/completions` to VLLM's `/v1/chat/completions`
"""
from typing import Optional, Tuple
from typing import List, Optional, Tuple, cast
from litellm.litellm_core_utils.prompt_templates.common_utils import (
_get_image_mime_type_from_url,
)
from litellm.litellm_core_utils.prompt_templates.factory import _parse_mime_type
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionFileObject,
ChatCompletionVideoObject,
ChatCompletionVideoUrlObject,
)
from ....utils import _remove_additional_properties, _remove_strict_from_schema
from ...openai.chat.gpt_transformation import OpenAIGPTConfig
@ -38,3 +48,71 @@ class HostedVLLMChatConfig(OpenAIGPTConfig):
api_key or get_secret_str("HOSTED_VLLM_API_KEY") or "fake-api-key"
) # vllm does not require an api key
return api_base, dynamic_api_key
def _is_video_file(self, content_item: ChatCompletionFileObject) -> bool:
"""
Check if the file is a video
- format: video/<extension>
- file_data: base64 encoded video data
- file_id: infer mp4 from extension
"""
file = content_item.get("file", {})
format = file.get("format")
file_data = file.get("file_data")
file_id = file.get("file_id")
if content_item.get("type") != "file":
return False
if format and format.startswith("video/"):
return True
elif file_data:
mime_type = _parse_mime_type(file_data)
if mime_type and mime_type.startswith("video/"):
return True
elif file_id:
mime_type = _get_image_mime_type_from_url(file_id)
if mime_type and mime_type.startswith("video/"):
return True
return False
def _convert_file_to_video_url(
self, content_item: ChatCompletionFileObject
) -> ChatCompletionVideoObject:
file = content_item.get("file", {})
file_id = file.get("file_id")
file_data = file.get("file_data")
if file_id:
return ChatCompletionVideoObject(
type="video_url", video_url=ChatCompletionVideoUrlObject(url=file_id)
)
elif file_data:
return ChatCompletionVideoObject(
type="video_url", video_url=ChatCompletionVideoUrlObject(url=file_data)
)
raise ValueError("file_id or file_data is required")
def _transform_messages(
self, messages: List[AllMessageValues], model: str
) -> List[AllMessageValues]:
"""
Support translating video files from file_id or file_data to video_url
"""
for message in messages:
if message["role"] == "user":
message_content = message.get("content")
if message_content and isinstance(message_content, list):
replaced_content_items: List[
Tuple[int, ChatCompletionFileObject]
] = []
for idx, content_item in enumerate(message_content):
if content_item.get("type") == "file":
content_item = cast(ChatCompletionFileObject, content_item)
if self._is_video_file(content_item):
replaced_content_items.append((idx, content_item))
for idx, content_item in replaced_content_items:
message_content[idx] = self._convert_file_to_video_url(
content_item
)
transformed_messages = super()._transform_messages(messages, model)
return transformed_messages

View file

@ -1,10 +1,16 @@
from typing import Union
import httpx
from litellm.llms.base_llm.chat.transformation import BaseLLMException
class InfinityError(BaseLLMException):
def __init__(self, status_code, message):
def __init__(
self,
status_code: int,
message: str,
headers: Union[dict, httpx.Headers] = {}
):
self.status_code = status_code
self.message = message
self.request = httpx.Request(
@ -16,4 +22,5 @@ class InfinityError(BaseLLMException):
message=message,
request=self.request,
response=self.response,
headers=headers,
) # Call the base class constructor with the parameters it needs

View file

@ -0,0 +1,5 @@
"""
Infinity Embedding - uses `llm_http_handler.py` to make httpx requests
Request/Response transformation is handled in `transformation.py`
"""

View file

@ -0,0 +1,141 @@
from typing import List, Optional, Union
import httpx
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.llms.base_llm.embedding.transformation import BaseEmbeddingConfig
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import AllEmbeddingInputValues, AllMessageValues
from litellm.types.utils import EmbeddingResponse, Usage
from ..common_utils import InfinityError
class InfinityEmbeddingConfig(BaseEmbeddingConfig):
"""
Reference: https://infinity.modal.michaelfeil.eu/docs
"""
def __init__(self) -> None:
pass
def get_complete_url(
self,
api_base: Optional[str],
api_key: Optional[str],
model: str,
optional_params: dict,
litellm_params: dict,
stream: Optional[bool] = None,
) -> str:
if api_base is None:
raise ValueError("api_base is required for Infinity embeddings")
# Remove trailing slashes and ensure clean base URL
api_base = api_base.rstrip("/")
if not api_base.endswith("/embeddings"):
api_base = f"{api_base}/embeddings"
return api_base
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
if api_key is None:
api_key = get_secret_str("INFINITY_API_KEY")
default_headers = {
"Authorization": f"Bearer {api_key}",
"accept": "application/json",
"Content-Type": "application/json",
}
# If 'Authorization' is provided in headers, it overrides the default.
if "Authorization" in headers:
default_headers["Authorization"] = headers["Authorization"]
# Merge other headers, overriding any default ones except Authorization
return {**default_headers, **headers}
def get_supported_openai_params(self, model: str) -> list:
return [
"encoding_format",
"modality",
"dimensions",
]
def map_openai_params(
self,
non_default_params: dict,
optional_params: dict,
model: str,
drop_params: bool,
) -> dict:
"""
Map OpenAI params to Infinity params
Reference: https://infinity.modal.michaelfeil.eu/docs
"""
if "encoding_format" in non_default_params:
optional_params["encoding_format"] = non_default_params["encoding_format"]
if "modality" in non_default_params:
optional_params["modality"] = non_default_params["modality"]
if "dimensions" in non_default_params:
optional_params["output_dimension"] = non_default_params["dimensions"]
return optional_params
def transform_embedding_request(
self,
model: str,
input: AllEmbeddingInputValues,
optional_params: dict,
headers: dict,
) -> dict:
return {
"input": input,
"model": model,
**optional_params,
}
def transform_embedding_response(
self,
model: str,
raw_response: httpx.Response,
model_response: EmbeddingResponse,
logging_obj: LiteLLMLoggingObj,
api_key: Optional[str] = None,
request_data: dict = {},
optional_params: dict = {},
litellm_params: dict = {},
) -> EmbeddingResponse:
try:
raw_response_json = raw_response.json()
except Exception:
raise InfinityError(
message=raw_response.text, status_code=raw_response.status_code
)
# model_response.usage
model_response.model = raw_response_json.get("model")
model_response.data = raw_response_json.get("data")
model_response.object = raw_response_json.get("object")
usage = Usage(
prompt_tokens=raw_response_json.get("usage", {}).get("prompt_tokens", 0),
total_tokens=raw_response_json.get("usage", {}).get("total_tokens", 0),
)
model_response.usage = usage
return model_response
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:
return InfinityError(
message=error_message, status_code=status_code, headers=headers
)

View file

@ -22,7 +22,7 @@ from litellm.types.rerank import (
RerankTokens,
)
from .common_utils import InfinityError
from ..common_utils import InfinityError
class InfinityRerankConfig(CohereRerankConfig):

View file

@ -13,6 +13,7 @@ class LiteLLMProxyChatConfig(OpenAIGPTConfig):
def get_supported_openai_params(self, model: str) -> List:
list = super().get_supported_openai_params(model)
list.append("thinking")
list.append("reasoning_effort")
return list
def _map_openai_params(

Some files were not shown because too many files have changed in this diff Show more