Merge branch 'main' into key-mask-bug-fix

This commit is contained in:
Nandini Bagga 2025-04-23 19:42:10 +09:00
commit f0e35e1361
221 changed files with 10366 additions and 1346 deletions

View file

@ -20,6 +20,8 @@ REPLICATE_API_TOKEN = ""
ANTHROPIC_API_KEY = ""
# Infisical
INFISICAL_TOKEN = ""
# INFINITY
INFINITY_API_KEY = ""
# Development Configs
LITELLM_MASTER_KEY = "sk-1234"

2
.gitignore vendored
View file

@ -86,3 +86,5 @@ litellm/proxy/db/migrations/0_init/migration.sql
litellm/proxy/db/migrations/*
litellm/proxy/migrations/*config.yaml
litellm/proxy/migrations/*
config.yaml
tests/litellm/litellm_core_utils/llm_cost_calc/log.txt

View file

@ -0,0 +1,83 @@
# 🖇️ AgentOps - LLM Observability Platform
:::tip
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[AgentOps](https://docs.agentops.ai) is an observability platform that enables tracing and monitoring of LLM calls, providing detailed insights into your AI operations.
## Using AgentOps with LiteLLM
LiteLLM provides `success_callbacks` and `failure_callbacks`, allowing you to easily integrate AgentOps for comprehensive tracing and monitoring of your LLM operations.
### Integration
Use just a few lines of code to instantly trace your responses **across all providers** with AgentOps:
Get your AgentOps API Keys from https://app.agentops.ai/
```python
import litellm
# Configure LiteLLM to use AgentOps
litellm.success_callback = ["agentops"]
# Make your LLM calls as usual
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello, how are you?"}],
)
```
Complete Code:
```python
import os
import litellm
from litellm import completion
# Set env variables
os.environ["OPENAI_API_KEY"] = "your-openai-key"
os.environ["AGENTOPS_API_KEY"] = "your-agentops-api-key"
# Configure LiteLLM to use AgentOps
litellm.success_callback = ["agentops"]
# OpenAI call
response = completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hi 👋 - I'm OpenAI"}],
)
print(response)
```
### Configuration Options
The AgentOps integration can be configured through environment variables:
- `AGENTOPS_API_KEY` (str, optional): Your AgentOps API key
- `AGENTOPS_ENVIRONMENT` (str, optional): Deployment environment (defaults to "production")
- `AGENTOPS_SERVICE_NAME` (str, optional): Service name for tracing (defaults to "agentops")
### Advanced Usage
You can configure additional settings through environment variables:
```python
import os
import litellm
# Configure AgentOps settings
os.environ["AGENTOPS_API_KEY"] = "your-agentops-api-key"
os.environ["AGENTOPS_ENVIRONMENT"] = "staging"
os.environ["AGENTOPS_SERVICE_NAME"] = "my-service"
# Enable AgentOps tracing
litellm.success_callback = ["agentops"]
```
### Support
For issues or questions, please refer to:
- [AgentOps Documentation](https://docs.agentops.ai)
- [LiteLLM Documentation](https://docs.litellm.ai)

View file

@ -4,7 +4,7 @@ Pass-through endpoints for Cohere - call provider-specific endpoint, in native f
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ✅ | works across all integrations |
| Cost Tracking | ✅ | Supported for `/v1/chat`, and `/v2/chat` |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | |

View file

@ -0,0 +1,217 @@
# Mistral
Pass-through endpoints for Mistral - call provider-specific endpoint, in native format (no translation).
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ❌ | Not supported |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | |
Just replace `https://api.mistral.ai/v1` with `LITELLM_PROXY_BASE_URL/mistral` 🚀
#### **Example Usage**
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "image_url",
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
}
}'
```
Supports **ALL** Mistral Endpoints (including streaming).
## Quick Start
Let's call the Mistral [`/chat/completions` endpoint](https://docs.mistral.ai/api/#tag/chat/operation/chat_completion_v1_chat_completions_post)
1. Add MISTRAL_API_KEY to your environment
```bash
export MISTRAL_API_KEY="sk-1234"
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the Mistral `/ocr` endpoint
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "image_url",
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
}
}'
```
## Examples
Anything after `http://0.0.0.0:4000/mistral` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://api.mistral.ai/v1` | `http://0.0.0.0:4000/mistral` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `bearer $MISTRAL_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: OCR endpoint**
#### LiteLLM Proxy Call
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_API_KEY' \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "image_url",
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
}
}'
```
#### Direct Mistral API Call
```bash
curl https://api.mistral.ai/v1/ocr \
-H "Content-Type: application/json" \
-H "Authorization: Bearer ${MISTRAL_API_KEY}" \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "document_url",
"document_url": "https://arxiv.org/pdf/2201.04234"
},
"include_image_base64": true
}'
```
### **Example 2: Chat API**
#### LiteLLM Proxy Call
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "mistral-large-latest"
}'
```
#### Direct Mistral API Call
```bash
curl -L -X POST 'https://api.mistral.ai/v1/chat/completions' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "mistral-large-latest"
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Mistral API key, but still letting them use Mistral endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export MISTRAL_API_BASE=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
--data '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "mistral-large-latest"
}'
```

View file

@ -0,0 +1,185 @@
# VLLM
Pass-through endpoints for VLLM - call provider-specific endpoint, in native format (no translation).
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ❌ | Not supported |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | |
Just replace `https://my-vllm-server.com` with `LITELLM_PROXY_BASE_URL/vllm` 🚀
#### **Example Usage**
```bash
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234'
```
Supports **ALL** VLLM Endpoints (including streaming).
## Quick Start
Let's call the VLLM [`/metrics` endpoint](https://vllm.readthedocs.io/en/latest/api_reference/api_reference.html)
1. Add HOSTED VLLM API BASE to your environment
```bash
export HOSTED_VLLM_API_BASE="https://my-vllm-server.com"
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the VLLM `/metrics` endpoint
```bash
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234'
```
## Examples
Anything after `http://0.0.0.0:4000/vllm` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://my-vllm-server.com` | `http://0.0.0.0:4000/vllm` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `bearer $VLLM_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: Metrics endpoint**
#### LiteLLM Proxy Call
```bash
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY'
```
#### Direct VLLM API Call
```bash
curl -L -X GET 'https://my-vllm-server.com/metrics' \
-H 'Content-Type: application/json'
```
### **Example 2: Chat API**
#### LiteLLM Proxy Call
```bash
curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "qwen2.5-7b-instruct"
}'
```
#### Direct VLLM API Call
```bash
curl -L -X POST 'https://my-vllm-server.com/chat/completions' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "qwen2.5-7b-instruct"
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this to avoid giving developers direct access to your VLLM server (or its raw API key), while still letting them use VLLM endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export HOSTED_VLLM_API_BASE=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
--data '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "qwen2.5-7b-instruct"
}'
```

View file

@ -1011,8 +1011,7 @@ Expected Response:
| Supported Operations | `/v1/responses`|
| Azure OpenAI Responses API | [Azure OpenAI Responses API ↗](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/responses?tabs=python-secure) |
| Cost Tracking, Logging Support | ✅ LiteLLM will log, track cost for Responses API Requests |
| Supported OpenAI Params | ✅ All OpenAI params are supported, [See here](https://github.com/BerriAI/litellm/blob/0717369ae6969882d149933da48eeb8ab0e691bd/litellm/llms/openai/responses/transformation.py#L23) |
## Usage

View file

@ -39,14 +39,164 @@ response = completion(
- temperature
- top_p
- max_tokens
- max_completion_tokens
- stream
- tools
- tool_choice
- functions
- response_format
- n
- stop
- logprobs
- frequency_penalty
- modalities
- reasoning_content
**Anthropic Params**
- thinking (used to set max budget tokens across anthropic/gemini models)
[**See Updated List**](https://github.com/BerriAI/litellm/blob/main/litellm/llms/gemini/chat/transformation.py#L70)
## Usage - Thinking / `reasoning_content`
LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362)
**Mapping**
| reasoning_effort | thinking |
| ---------------- | -------- |
| "low" | "budget_tokens": 1024 |
| "medium" | "budget_tokens": 2048 |
| "high" | "budget_tokens": 4096 |
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
resp = completion(
model="gemini/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
reasoning_effort="low",
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
- model_name: gemini-2.5-flash
litellm_params:
model: gemini/gemini-2.5-flash-preview-04-17
api_key: os.environ/GEMINI_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "gemini-2.5-flash",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"reasoning_effort": "low"
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='gemini-2.5-flash-preview-04-17',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
),
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
### Pass `thinking` to Gemini models
You can also pass the `thinking` parameter to Gemini models.
This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget).
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = litellm.completion(
model="gemini/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "gemini/gemini-2.5-flash-preview-04-17",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
[**See Updated List**](https://github.com/BerriAI/litellm/blob/1c747f3ad372399c5b95cc5696b06a5fbe53186b/litellm/llms/vertex_httpx.py#L122)
## Passing Gemini Specific Params
### Response schema

View file

@ -3,18 +3,17 @@ import TabItem from '@theme/TabItem';
# Infinity
| Property | Details |
|-------|-------|
| Description | Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip|
| Provider Route on LiteLLM | `infinity/` |
| Supported Operations | `/rerank` |
| Link to Provider Doc | [Infinity ↗](https://github.com/michaelfeil/infinity) |
| Property | Details |
| ------------------------- | ---------------------------------------------------------------------------------------------------------- |
| Description | Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip |
| Provider Route on LiteLLM | `infinity/` |
| Supported Operations | `/rerank`, `/embeddings` |
| Link to Provider Doc | [Infinity ↗](https://github.com/michaelfeil/infinity) |
## **Usage - LiteLLM Python SDK**
```python
from litellm import rerank
from litellm import rerank, embedding
import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
@ -39,8 +38,8 @@ model_list:
- model_name: custom-infinity-rerank
litellm_params:
model: infinity/rerank
api_key: os.environ/INFINITY_API_KEY
api_base: https://localhost:8080
api_key: os.environ/INFINITY_API_KEY
```
Start litellm
@ -51,7 +50,9 @@ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
Test request
## Test request:
### Rerank
```bash
curl http://0.0.0.0:4000/rerank \
@ -70,15 +71,14 @@ curl http://0.0.0.0:4000/rerank \
}'
```
#### Supported Cohere Rerank API Params
## Supported Cohere Rerank API Params
| Param | Type | Description |
|-------|-------|-------|
| `query` | `str` | The query to rerank the documents against |
| `documents` | `list[str]` | The documents to rerank |
| `top_n` | `int` | The number of documents to return |
| `return_documents` | `bool` | Whether to return the documents in the response |
| Param | Type | Description |
| ------------------ | ----------- | ----------------------------------------------- |
| `query` | `str` | The query to rerank the documents against |
| `documents` | `list[str]` | The documents to rerank |
| `top_n` | `int` | The number of documents to return |
| `return_documents` | `bool` | Whether to return the documents in the response |
### Usage - Return Documents
@ -138,6 +138,7 @@ response = rerank(
raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
@ -161,7 +162,7 @@ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
3. Test it!
```bash
curl http://0.0.0.0:4000/rerank \
@ -179,6 +180,121 @@ curl http://0.0.0.0:4000/rerank \
"raw_scores": True # 👈 PROVIDER-SPECIFIC PARAM
}'
```
</TabItem>
</Tabs>
## Embeddings
LiteLLM provides an OpenAI API-compatible `/embeddings` endpoint for embedding calls.
**Setup**
Add this to your litellm proxy config.yaml
```yaml
model_list:
- model_name: custom-infinity-embedding
litellm_params:
model: infinity/provider/custom-embedding-v1
api_base: http://localhost:8080
api_key: os.environ/INFINITY_API_KEY
```
### Test request:
```bash
curl http://0.0.0.0:4000/embeddings \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-embedding",
"input": ["hello"]
}'
```
#### Supported Embedding API Params
| Param | Type | Description |
| ----------------- | ----------- | ----------------------------------------------------------- |
| `model` | `str` | The embedding model to use |
| `input` | `list[str]` | The text inputs to generate embeddings for |
| `encoding_format` | `str` | The format to return embeddings in (e.g. "float", "base64") |
| `modality` | `str` | The type of input (e.g. "text", "image", "audio") |
### Usage - Basic Examples
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import embedding
import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
response = embedding(
model="infinity/bge-small",
input=["good morning from litellm"]
)
print(response.data[0]['embedding'])
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/embeddings \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-embedding",
"input": ["hello"]
}'
```
</TabItem>
</Tabs>
### Usage - OpenAI Client
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from openai import OpenAI
client = OpenAI(
api_key="<LITELLM_MASTER_KEY>",
base_url="<LITELLM_URL>"
)
response = client.embeddings.create(
model="bge-small",
input=["The food was delicious and the waiter..."],
encoding_format="float"
)
print(response.data[0].embedding)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/embeddings \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "bge-small",
"input": ["The food was delicious and the waiter..."],
"encoding_format": "float"
}'
```
</TabItem>
</Tabs>

View file

@ -163,6 +163,12 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4.1 | `response = completion(model="gpt-4.1", messages=messages)` |
| gpt-4.1-mini | `response = completion(model="gpt-4.1-mini", messages=messages)` |
| gpt-4.1-nano | `response = completion(model="gpt-4.1-nano", messages=messages)` |
| o4-mini | `response = completion(model="o4-mini", messages=messages)` |
| o3-mini | `response = completion(model="o3-mini", messages=messages)` |
| o3 | `response = completion(model="o3", messages=messages)` |
| o1-mini | `response = completion(model="o1-mini", messages=messages)` |
| o1-preview | `response = completion(model="o1-preview", messages=messages)` |
| gpt-4o-mini | `response = completion(model="gpt-4o-mini", messages=messages)` |

View file

@ -542,6 +542,154 @@ print(resp)
```
### **Thinking / `reasoning_content`**
LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362)
**Mapping**
| reasoning_effort | thinking |
| ---------------- | -------- |
| "low" | "budget_tokens": 1024 |
| "medium" | "budget_tokens": 2048 |
| "high" | "budget_tokens": 4096 |
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
# !gcloud auth application-default login - run this to add vertex credentials to your env
resp = completion(
model="vertex_ai/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
reasoning_effort="low",
vertex_project="project-id",
vertex_location="us-central1"
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
- model_name: gemini-2.5-flash
litellm_params:
model: vertex_ai/gemini-2.5-flash-preview-04-17
vertex_credentials: {"project_id": "project-id", "location": "us-central1", "project_key": "project-key"}
vertex_project: "project-id"
vertex_location: "us-central1"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "gemini-2.5-flash",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"reasoning_effort": "low"
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='gemini-2.5-flash-preview-04-17',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
),
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
#### Pass `thinking` to Gemini models
You can also pass the `thinking` parameter to Gemini models.
This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget).
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
# !gcloud auth application-default login - run this to add vertex credentials to your env
response = completion(
model="vertex_ai/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
vertex_project="project-id",
vertex_location="us-central1"
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "vertex_ai/gemini-2.5-flash-preview-04-17",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
### **Context Caching**
Vertex AI context caching is supported by calling the provider API directly. (Unified endpoint support coming soon.)

View file

@ -161,6 +161,120 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)
<Tabs>
<TabItem value="files_message" label="(Unified) Files Message">
Use this to send a video url to VLLM + Gemini in the same format, using OpenAI's `files` message type.
There are two ways to send a video url to VLLM:
1. Pass the video url directly
```
{"type": "file", "file": {"file_id": video_url}},
```
2. Pass the video data as base64
```
{"type": "file", "file": {"file_data": f"data:video/mp4;base64,{video_data_base64}"}}
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Summarize the following video"
},
{
"type": "file",
"file": {
"file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
}
}
]
}
]
# call vllm
os.environ["HOSTED_VLLM_API_BASE"] = "https://hosted-vllm-api.co"
os.environ["HOSTED_VLLM_API_KEY"] = "" # [optional], if your VLLM server requires an API key
response = completion(
model="hosted_vllm/qwen", # pass the vllm model name
messages=messages,
)
# call gemini
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
response = completion(
model="gemini/gemini-1.5-flash", # pass the gemini model name
messages=messages,
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: my-model
litellm_params:
model: hosted_vllm/qwen # add hosted_vllm/ prefix to route as OpenAI provider
api_base: https://hosted-vllm-api.co # add api base for OpenAI compatible provider
- model_name: my-gemini-model
litellm_params:
model: gemini/gemini-1.5-flash # add gemini/ prefix to route as Google AI Studio provider
api_key: os.environ/GEMINI_API_KEY
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl -X POST http://0.0.0.0:4000/chat/completions \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "my-model",
"messages": [
{"role": "user", "content":
[
{"type": "text", "text": "Summarize the following video"},
{"type": "file", "file": {"file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
]
}
]
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="video_url" label="(VLLM-specific) Video Message">
Use this to send a video url to VLLM in its native message format (`video_url`).
There are two ways to send a video url to VLLM:
1. Pass the video url directly
@ -249,6 +363,10 @@ curl -X POST http://0.0.0.0:4000/chat/completions \
</Tabs>
</TabItem>
</Tabs>
## (Deprecated) for `vllm pip package`
### Using - `litellm.completion`

View file

@ -299,6 +299,9 @@ router_settings:
|------|-------------|
| ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token for requesting ID in GitHub Actions
| ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions
| AGENTOPS_ENVIRONMENT | Environment for AgentOps logging integration
| AGENTOPS_API_KEY | API Key for AgentOps logging integration
| AGENTOPS_SERVICE_NAME | Service Name for AgentOps logging integration
| AISPEND_ACCOUNT_ID | Account ID for AI Spend
| AISPEND_API_KEY | API Key for AI Spend
| ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access

View file

@ -0,0 +1,108 @@
# Model Discovery
Use this to give users an accurate list of the models available behind a provider endpoint when they call `/v1/models` for wildcard models.
## Supported Models
- Fireworks AI
- OpenAI
- Gemini
- LiteLLM Proxy
- Topaz
- Anthropic
- XAI
- VLLM
- Vertex AI
### Usage
**1. Setup config.yaml**
```yaml
model_list:
- model_name: xai/*
litellm_params:
model: xai/*
api_key: os.environ/XAI_API_KEY
litellm_settings:
check_provider_endpoint: true # 👈 Enable checking provider endpoint for wildcard models
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
**3. Call `/v1/models`**
```bash
curl -X GET "http://localhost:4000/v1/models" -H "Authorization: Bearer $LITELLM_KEY"
```
Expected response
```json
{
"data": [
{
"id": "xai/grok-2-1212",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-2-vision-1212",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-fast-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-mini-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-mini-fast-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-vision-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-2-image-1212",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
}
],
"object": "list"
}
```

View file

@ -16,6 +16,8 @@ Supported Providers:
- Vertex AI (Anthropic) (`vertexai/`)
- OpenRouter (`openrouter/`)
- XAI (`xai/`)
- Google AI Studio (`google/`)
- Vertex AI (`vertex_ai/`)
LiteLLM will standardize the `reasoning_content` in the response and `thinking_blocks` in the assistant message.
@ -23,7 +25,7 @@ LiteLLM will standardize the `reasoning_content` in the response and `thinking_b
"message": {
...
"reasoning_content": "The capital of France is Paris.",
"thinking_blocks": [
"thinking_blocks": [ # only returned for Anthropic models
{
"type": "thinking",
"thinking": "The capital of France is Paris.",

View file

@ -14,22 +14,22 @@ LiteLLM provides a BETA endpoint in the spec of [OpenAI's `/responses` API](http
| Fallbacks | ✅ | Works between supported models |
| Loadbalancing | ✅ | Works between supported models |
| Supported LiteLLM Versions | 1.63.8+ | |
| Supported LLM providers | `openai` | |
| Supported LLM providers | **All LiteLLM supported providers** | `openai`, `anthropic`, `bedrock`, `vertex_ai`, `gemini`, `azure`, `azure_ai` etc. |
## Usage
## Create a model response
### LiteLLM Python SDK
<Tabs>
<TabItem value="litellm-sdk" label="LiteLLM SDK">
<TabItem value="openai" label="OpenAI">
#### Non-streaming
```python showLineNumbers
```python showLineNumbers title="OpenAI Non-streaming Response"
import litellm
# Non-streaming response
response = litellm.responses(
model="o1-pro",
model="openai/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
@ -38,12 +38,12 @@ print(response)
```
#### Streaming
```python showLineNumbers
```python showLineNumbers title="OpenAI Streaming Response"
import litellm
# Streaming response
response = litellm.responses(
model="o1-pro",
model="openai/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
@ -53,58 +53,169 @@ for event in response:
```
</TabItem>
<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers
model_list:
- model_name: o1-pro
litellm_params:
model: openai/o1-pro
api_key: os.environ/OPENAI_API_KEY
```
Start your LiteLLM proxy:
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
Then use the OpenAI SDK pointed to your proxy:
<TabItem value="anthropic" label="Anthropic">
#### Non-streaming
```python showLineNumbers
from openai import OpenAI
```python showLineNumbers title="Anthropic Non-streaming Response"
import litellm
import os
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Set API key
os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key"
# Non-streaming response
response = client.responses.create(
model="o1-pro",
input="Tell me a three sentence bedtime story about a unicorn."
response = litellm.responses(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python showLineNumbers
from openai import OpenAI
```python showLineNumbers title="Anthropic Streaming Response"
import litellm
import os
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Set API key
os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key"
# Streaming response
response = client.responses.create(
model="o1-pro",
response = litellm.responses(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="vertex" label="Vertex AI">
#### Non-streaming
```python showLineNumbers title="Vertex AI Non-streaming Response"
import litellm
import os
# Set credentials - Vertex AI uses application default credentials
# Run 'gcloud auth application-default login' to authenticate
os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
# Non-streaming response
response = litellm.responses(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python showLineNumbers title="Vertex AI Streaming Response"
import litellm
import os
# Set credentials - Vertex AI uses application default credentials
# Run 'gcloud auth application-default login' to authenticate
os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
# Streaming response
response = litellm.responses(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="bedrock" label="AWS Bedrock">
#### Non-streaming
```python showLineNumbers title="AWS Bedrock Non-streaming Response"
import litellm
import os
# Set AWS credentials
os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id"
os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key"
os.environ["AWS_REGION_NAME"] = "us-west-2" # or your AWS region
# Non-streaming response
response = litellm.responses(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python showLineNumbers title="AWS Bedrock Streaming Response"
import litellm
import os
# Set AWS credentials
os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id"
os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key"
os.environ["AWS_REGION_NAME"] = "us-west-2" # or your AWS region
# Streaming response
response = litellm.responses(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="gemini" label="Google AI Studio">
#### Non-streaming
```python showLineNumbers title="Google AI Studio Non-streaming Response"
import litellm
import os
# Set API key for Google AI Studio
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
# Non-streaming response
response = litellm.responses(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python showLineNumbers title="Google AI Studio Streaming Response"
import litellm
import os
# Set API key for Google AI Studio
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
# Streaming response
response = litellm.responses(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
@ -116,10 +227,407 @@ for event in response:
</TabItem>
</Tabs>
### LiteLLM Proxy with OpenAI SDK
## **Supported Providers**
First, set up and start your LiteLLM proxy server.
| Provider | Link to Usage |
|-------------|--------------------|
| OpenAI| [Usage](#usage) |
| Azure OpenAI| [Usage](../docs/providers/azure#responses-api) |
```bash title="Start LiteLLM Proxy Server"
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
<Tabs>
<TabItem value="openai" label="OpenAI">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="OpenAI Proxy Configuration"
model_list:
- model_name: openai/o1-pro
litellm_params:
model: openai/o1-pro
api_key: os.environ/OPENAI_API_KEY
```
#### Non-streaming
```python showLineNumbers title="OpenAI Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="openai/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="OpenAI Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="openai/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="anthropic" label="Anthropic">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="Anthropic Proxy Configuration"
model_list:
- model_name: anthropic/claude-3-5-sonnet-20240620
litellm_params:
model: anthropic/claude-3-5-sonnet-20240620
api_key: os.environ/ANTHROPIC_API_KEY
```
#### Non-streaming
```python showLineNumbers title="Anthropic Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="Anthropic Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="vertex" label="Vertex AI">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="Vertex AI Proxy Configuration"
model_list:
- model_name: vertex_ai/gemini-1.5-pro
litellm_params:
model: vertex_ai/gemini-1.5-pro
vertex_project: your-gcp-project-id
vertex_location: us-central1
```
#### Non-streaming
```python showLineNumbers title="Vertex AI Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="Vertex AI Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="bedrock" label="AWS Bedrock">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="AWS Bedrock Proxy Configuration"
model_list:
- model_name: bedrock/anthropic.claude-3-sonnet-20240229-v1:0
litellm_params:
model: bedrock/anthropic.claude-3-sonnet-20240229-v1:0
aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
aws_region_name: us-west-2
```
#### Non-streaming
```python showLineNumbers title="AWS Bedrock Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="AWS Bedrock Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="gemini" label="Google AI Studio">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="Google AI Studio Proxy Configuration"
model_list:
- model_name: gemini/gemini-1.5-flash
litellm_params:
model: gemini/gemini-1.5-flash
api_key: os.environ/GEMINI_API_KEY
```
#### Non-streaming
```python showLineNumbers title="Google AI Studio Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="Google AI Studio Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
</Tabs>
## Supported Responses API Parameters
| Provider | Supported Parameters |
|----------|---------------------|
| `openai` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) |
| `azure` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) |
| `anthropic` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `bedrock` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `gemini` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `vertex_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `azure_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| All other llm api providers | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
## Load Balancing with Routing Affinity
When using the Responses API with multiple deployments of the same model (e.g., multiple Azure OpenAI endpoints), LiteLLM provides routing affinity for conversations. This ensures that follow-up requests using a `previous_response_id` are routed to the same deployment that generated the original response.
#### Example Usage
<Tabs>
<TabItem value="python-sdk" label="Python SDK">
```python showLineNumbers title="Python SDK with Routing Affinity"
import litellm
# Set up router with multiple deployments of the same model
router = litellm.Router(
model_list=[
{
"model_name": "azure-gpt4-turbo",
"litellm_params": {
"model": "azure/gpt-4-turbo",
"api_key": "your-api-key-1",
"api_version": "2024-06-01",
"api_base": "https://endpoint1.openai.azure.com",
},
},
{
"model_name": "azure-gpt4-turbo",
"litellm_params": {
"model": "azure/gpt-4-turbo",
"api_key": "your-api-key-2",
"api_version": "2024-06-01",
"api_base": "https://endpoint2.openai.azure.com",
},
},
],
optional_pre_call_checks=["responses_api_deployment_check"],
)
# Initial request
response = await router.aresponses(
model="azure-gpt4-turbo",
input="Hello, who are you?",
truncation="auto",
)
# Store the response ID
response_id = response.id
# Follow-up request - will be automatically routed to the same deployment
follow_up = await router.aresponses(
model="azure-gpt4-turbo",
input="Tell me more about yourself",
truncation="auto",
previous_response_id=response_id # This ensures routing to the same deployment
)
```
</TabItem>
<TabItem value="proxy-server" label="Proxy Server">
#### 1. Setup routing affinity on proxy config.yaml
To enable routing affinity for Responses API in your LiteLLM proxy, set `optional_pre_call_checks: ["responses_api_deployment_check"]` in your proxy config.yaml.
```yaml showLineNumbers title="config.yaml with Responses API Routing Affinity"
model_list:
- model_name: azure-gpt4-turbo
litellm_params:
model: azure/gpt-4-turbo
api_key: your-api-key-1
api_version: "2024-06-01"
api_base: https://endpoint1.openai.azure.com
- model_name: azure-gpt4-turbo
litellm_params:
model: azure/gpt-4-turbo
api_key: your-api-key-2
api_version: "2024-06-01"
api_base: https://endpoint2.openai.azure.com
router_settings:
optional_pre_call_checks: ["responses_api_deployment_check"]
```
#### 2. Use the OpenAI Python SDK to make requests to LiteLLM Proxy
```python showLineNumbers title="OpenAI Client with Proxy Server"
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:4000",
api_key="your-api-key"
)
# Initial request
response = client.responses.create(
model="azure-gpt4-turbo",
input="Hello, who are you?"
)
response_id = response.id
# Follow-up request - will be automatically routed to the same deployment
follow_up = client.responses.create(
model="azure-gpt4-turbo",
input="Tell me more about yourself",
previous_response_id=response_id # This ensures routing to the same deployment
)
```
</TabItem>
</Tabs>

View file

@ -0,0 +1,74 @@
import Image from '@theme/IdealImage';
# SCIM with LiteLLM
SCIM enables identity providers (Okta, Azure AD, OneLogin, etc.) to automate user and team (group) provisioning, updates, and deprovisioning on LiteLLM.
This tutorial will walk you through the steps to connect your IDP to LiteLLM SCIM Endpoints.
### Supported SSO Providers for SCIM
Below is a list of supported SSO providers for connecting to LiteLLM SCIM Endpoints.
- Microsoft Entra ID (Azure AD)
- Okta
- Google Workspace
- OneLogin
- Keycloak
- Auth0
## 1. Get your SCIM Tenant URL and Bearer Token
On LiteLLM, navigate to Settings > Admin Settings > SCIM. On this page you will create a SCIM Token, which allows your IDP to authenticate to the LiteLLM `/scim` endpoints.
<Image img={require('../../img/scim_2.png')} style={{ width: '800px', height: 'auto' }} />
## 2. Connect your IDP to LiteLLM SCIM Endpoints
In your IDP, navigate to your SSO application and select `Provisioning` > `New provisioning configuration`.
On this page, paste in your LiteLLM SCIM tenant URL and bearer token.
Once this is pasted in, click on `Test Connection` to ensure your IDP can authenticate to the LiteLLM SCIM endpoints.
<Image img={require('../../img/scim_4.png')} style={{ width: '800px', height: 'auto' }} />
## 3. Test SCIM Connection
### 3.1 Assign the group to your LiteLLM Enterprise App
On your IDP Portal, navigate to `Enterprise Applications` > Select your litellm app
<Image img={require('../../img/msft_enterprise_app.png')} style={{ width: '800px', height: 'auto' }} />
<br />
<br />
Once you've selected your litellm app, click on `Users and Groups` > `Add user/group`
<Image img={require('../../img/msft_enterprise_assign_group.png')} style={{ width: '800px', height: 'auto' }} />
<br />
Now select the group you created earlier in your IDP and add it to the LiteLLM Enterprise App. At this point we have added `Production LLM Evals Group` to the LiteLLM Enterprise App. The next step is having LiteLLM automatically create the `Production LLM Evals Group` on the LiteLLM DB when a new user signs in.
<Image img={require('../../img/msft_enterprise_select_group.png')} style={{ width: '800px', height: 'auto' }} />
### 3.2 Sign in to LiteLLM UI via SSO
Sign into the LiteLLM UI via SSO. You should be redirected to the Entra ID SSO page. This SSO sign in flow will trigger LiteLLM to fetch the latest Groups and Members from Azure Entra ID.
<Image img={require('../../img/msft_sso_sign_in.png')} style={{ width: '800px', height: 'auto' }} />
### 3.3 Check the new team on LiteLLM UI
On the LiteLLM UI, navigate to `Teams`. You should see the new team `Production LLM Evals Group` auto-created on LiteLLM.
<Image img={require('../../img/msft_auto_team.png')} style={{ width: '900px', height: 'auto' }} />

Binary file not shown.

After

Width:  |  Height:  |  Size: 207 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 268 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 999 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 244 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 380 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 231 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 261 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 413 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 274 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

View file

@ -0,0 +1,153 @@
---
title: v1.67.0-stable - SCIM Integration
slug: v1.67.0-stable
date: 2025-04-19T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: ["sso", "unified_file_id", "cost_tracking", "security"]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
## Key Highlights
- **SCIM Integration**: Enables identity providers (Okta, Azure AD, OneLogin, etc.) to automate user and team (group) provisioning, updates, and deprovisioning
- **Team and Tag based usage tracking**: You can now see usage and spend by team and tag at 1M+ spend logs.
- **Unified Responses API**: Support for calling Anthropic, Gemini, Groq, etc. via OpenAI's new Responses API.
Let's dive in.
## SCIM Integration
<Image img={require('../../img/scim_integration.png')}/>
This release adds SCIM support to LiteLLM. This allows your SSO provider (Okta, Azure AD, etc) to automatically create/delete users, teams, and memberships on LiteLLM. This means that when you remove a team on your SSO provider, your SSO provider will automatically delete the corresponding team on LiteLLM.
[Read more](../../docs/tutorials/scim_litellm)
## Team and Tag based usage tracking
<Image img={require('../../img/release_notes/new_team_usage_highlight.jpg')}/>
This release improves team and tag based usage tracking at 1M+ spend logs, making it easy to monitor your LLM API spend in production. This covers:
- View **daily spend** by teams + tags
- View **usage / spend by key**, within teams
- View **spend by multiple tags**
- Allow **internal users** to view spend of teams they're a member of
[Read more](#management-endpoints--ui)
## Unified Responses API
This release allows you to call Azure OpenAI, Anthropic, AWS Bedrock, and Google Vertex AI models via the POST /v1/responses endpoint on LiteLLM. This means you can now use popular tools like [OpenAI Codex](https://docs.litellm.ai/docs/tutorials/openai_codex) with your own models.
<Image img={require('../../img/release_notes/unified_responses_api_rn.png')}/>
[Read more](https://docs.litellm.ai/docs/response_api)
## New Models / Updated Models
- **OpenAI**
1. gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o3, o3-mini, o4-mini pricing - [Get Started](../../docs/providers/openai#usage), [PR](https://github.com/BerriAI/litellm/pull/9990)
2. o4 - correctly map o4 to openai o_series model
- **Azure AI**
1. Phi-4 output cost per token fix - [PR](https://github.com/BerriAI/litellm/pull/9880)
2. Responses API support [Get Started](../../docs/providers/azure#azure-responses-api),[PR](https://github.com/BerriAI/litellm/pull/10116)
- **Anthropic**
1. redacted message thinking support - [Get Started](../../docs/providers/anthropic#usage---thinking--reasoning_content),[PR](https://github.com/BerriAI/litellm/pull/10129)
- **Cohere**
1. `/v2/chat` Passthrough endpoint support w/ cost tracking - [Get Started](../../docs/pass_through/cohere), [PR](https://github.com/BerriAI/litellm/pull/9997)
- **Azure**
1. Support azure tenant_id/client_id env vars - [Get Started](../../docs/providers/azure#entra-id---use-tenant_id-client_id-client_secret), [PR](https://github.com/BerriAI/litellm/pull/9993)
2. Fix response_format check for 2025+ api versions - [PR](https://github.com/BerriAI/litellm/pull/9993)
3. Add gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o3, o3-mini, o4-mini pricing
- **VLLM**
1. Files - Support 'file' message type for VLLM video URLs - [Get Started](../../docs/providers/vllm#send-video-url-to-vllm), [PR](https://github.com/BerriAI/litellm/pull/10129)
2. Passthrough - new `/vllm/` passthrough endpoint support [Get Started](../../docs/pass_through/vllm), [PR](https://github.com/BerriAI/litellm/pull/10002)
- **Mistral**
1. new `/mistral` passthrough endpoint support [Get Started](../../docs/pass_through/mistral), [PR](https://github.com/BerriAI/litellm/pull/10002)
- **AWS**
1. New mapped bedrock regions - [PR](https://github.com/BerriAI/litellm/pull/9430)
- **VertexAI / Google AI Studio**
1. Gemini - Response format - Retain schema field ordering for google gemini and vertex by specifying propertyOrdering - [Get Started](../../docs/providers/vertex#json-schema), [PR](https://github.com/BerriAI/litellm/pull/9828)
2. Gemini-2.5-flash - return reasoning content [Google AI Studio](../../docs/providers/gemini#usage---thinking--reasoning_content), [Vertex AI](../../docs/providers/vertex#thinking--reasoning_content)
3. Gemini-2.5-flash - pricing + model information [PR](https://github.com/BerriAI/litellm/pull/10125)
4. Passthrough - new `/vertex_ai/discovery` route - enables calling AgentBuilder API routes [Get Started](../../docs/pass_through/vertex_ai#supported-api-endpoints), [PR](https://github.com/BerriAI/litellm/pull/10084)
- **Fireworks AI**
1. return tool calling responses in `tool_calls` field (fireworks incorrectly returns this as a json str in content) [PR](https://github.com/BerriAI/litellm/pull/10130)
- **Triton**
1. Remove fixed bad_words / stop words from `/generate` call - [Get Started](../../docs/providers/triton-inference-server#triton-generate---chat-completion), [PR](https://github.com/BerriAI/litellm/pull/10163)
- **Other**
1. Support for all litellm providers on Responses API (works with Codex) - [Get Started](../../docs/tutorials/openai_codex), [PR](https://github.com/BerriAI/litellm/pull/10132)
2. Fix combining multiple tool calls in streaming response - [Get Started](../../docs/completion/stream#helper-function), [PR](https://github.com/BerriAI/litellm/pull/10040)
## Spend Tracking Improvements
- **Cost Control** - inject cache control points in prompt for cost reduction [Get Started](../../docs/tutorials/prompt_caching), [PR](https://github.com/BerriAI/litellm/pull/10000)
- **Spend Tags** - spend tags in headers - support x-litellm-tags even if tag based routing not enabled [Get Started](../../docs/proxy/request_headers#litellm-headers), [PR](https://github.com/BerriAI/litellm/pull/10000)
- **Gemini-2.5-flash** - support cost calculation for reasoning tokens [PR](https://github.com/BerriAI/litellm/pull/10141)
## Management Endpoints / UI
- **Users**
1. Show created_at and updated_at on users page - [PR](https://github.com/BerriAI/litellm/pull/10033)
- **Virtual Keys**
1. Filter by key alias - [PR](https://github.com/BerriAI/litellm/pull/10085)
- **Usage Tab**
1. Team based usage
- New `LiteLLM_DailyTeamSpend` Table for aggregate team based usage logging - [PR](https://github.com/BerriAI/litellm/pull/10039)
- New Team based usage dashboard + new `/team/daily/activity` API - [PR](https://github.com/BerriAI/litellm/pull/10081)
- Return team alias on /team/daily/activity API - [PR](https://github.com/BerriAI/litellm/pull/10157)
- allow internal users to view spend for teams they belong to - [PR](https://github.com/BerriAI/litellm/pull/10157)
- allow viewing top keys by team - [PR](https://github.com/BerriAI/litellm/pull/10157)
<Image img={require('../../img/release_notes/new_team_usage.png')}/>
2. Tag Based Usage
- New `LiteLLM_DailyTagSpend` Table for aggregate tag based usage logging - [PR](https://github.com/BerriAI/litellm/pull/10071)
- Restrict to only Proxy Admins - [PR](https://github.com/BerriAI/litellm/pull/10157)
- allow viewing top keys by tag
- Return tags passed in request (i.e. dynamic tags) on `/tag/list` API - [PR](https://github.com/BerriAI/litellm/pull/10157)
<Image img={require('../../img/release_notes/new_tag_usage.png')}/>
3. Track prompt caching metrics in daily user, team, tag tables - [PR](https://github.com/BerriAI/litellm/pull/10029)
4. Show usage by key (on all up, team, and tag usage dashboards) - [PR](https://github.com/BerriAI/litellm/pull/10157)
5. swap old usage with new usage tab
- **Models**
1. Make columns resizable/hideable - [PR](https://github.com/BerriAI/litellm/pull/10119)
- **API Playground**
1. Allow internal user to call api playground - [PR](https://github.com/BerriAI/litellm/pull/10157)
- **SCIM**
1. Add LiteLLM SCIM Integration for Team and User management - [Get Started](../../docs/tutorials/scim_litellm), [PR](https://github.com/BerriAI/litellm/pull/10072)
## Logging / Guardrail Integrations
- **GCS**
1. Fix gcs pub sub logging with env var GCS_PROJECT_ID - [Get Started](../../docs/observability/gcs_bucket_integration#usage), [PR](https://github.com/BerriAI/litellm/pull/10042)
- **AIM**
1. Add litellm call id passing to Aim guardrails on pre and post-hooks calls - [Get Started](../../docs/proxy/guardrails/aim_security), [PR](https://github.com/BerriAI/litellm/pull/10021)
- **Azure blob storage**
1. Ensure logging works in high throughput scenarios - [Get Started](../../docs/proxy/logging#azure-blob-storage), [PR](https://github.com/BerriAI/litellm/pull/9962)
## General Proxy Improvements
- **Support setting `litellm.modify_params` via env var** [PR](https://github.com/BerriAI/litellm/pull/9964)
- **Model Discovery** - Check provider `/models` endpoints when calling the proxy's `/v1/models` endpoint - [Get Started](../../docs/proxy/model_discovery), [PR](https://github.com/BerriAI/litellm/pull/9958)
- **`/utils/token_counter`** - fix retrieving custom tokenizer for db models - [Get Started](../../docs/proxy/configs#set-custom-tokenizer), [PR](https://github.com/BerriAI/litellm/pull/10047)
- **Prisma migrate** - handle existing columns in db table - [PR](https://github.com/BerriAI/litellm/pull/10138)

View file

@ -69,6 +69,7 @@ const sidebars = {
"proxy/clientside_auth",
"proxy/request_headers",
"proxy/response_headers",
"proxy/model_discovery",
],
},
{
@ -101,6 +102,7 @@ const sidebars = {
"proxy/admin_ui_sso",
"proxy/self_serve",
"proxy/public_teams",
"tutorials/scim_litellm",
"proxy/custom_sso",
"proxy/ui_credentials",
"proxy/ui_logs"
@ -330,6 +332,8 @@ const sidebars = {
"pass_through/vertex_ai",
"pass_through/google_ai_studio",
"pass_through/cohere",
"pass_through/vllm",
"pass_through/mistral",
"pass_through/openai_passthrough",
"pass_through/anthropic_completion",
"pass_through/bedrock",
@ -407,6 +411,7 @@ const sidebars = {
type: "category",
label: "Logging & Observability",
items: [
"observability/agentops_integration",
"observability/langfuse_integration",
"observability/lunary_integration",
"observability/mlflow",

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm-proxy-extras"
version = "0.1.10"
version = "0.1.11"
description = "Additional files for the LiteLLM Proxy. Reduces the size of the main litellm package."
authors = ["BerriAI"]
readme = "README.md"
@ -22,7 +22,7 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "0.1.10"
version = "0.1.11"
version_files = [
"pyproject.toml:version",
"../requirements.txt:litellm-proxy-extras==",

View file

@ -113,6 +113,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
"pagerduty",
"humanloop",
"gcs_pubsub",
"agentops",
"anthropic_cache_control_hook",
]
logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None
@ -415,6 +416,7 @@ deepseek_models: List = []
azure_ai_models: List = []
jina_ai_models: List = []
voyage_models: List = []
infinity_models: List = []
databricks_models: List = []
cloudflare_models: List = []
codestral_models: List = []
@ -556,6 +558,8 @@ def add_known_models():
azure_ai_models.append(key)
elif value.get("litellm_provider") == "voyage":
voyage_models.append(key)
elif value.get("litellm_provider") == "infinity":
infinity_models.append(key)
elif value.get("litellm_provider") == "databricks":
databricks_models.append(key)
elif value.get("litellm_provider") == "cloudflare":
@ -644,6 +648,7 @@ model_list = (
+ deepseek_models
+ azure_ai_models
+ voyage_models
+ infinity_models
+ databricks_models
+ cloudflare_models
+ codestral_models
@ -699,6 +704,7 @@ models_by_provider: dict = {
"mistral": mistral_chat_models,
"azure_ai": azure_ai_models,
"voyage": voyage_models,
"infinity": infinity_models,
"databricks": databricks_models,
"cloudflare": cloudflare_models,
"codestral": codestral_models,
@ -946,6 +952,7 @@ from .llms.topaz.image_variations.transformation import TopazImageVariationConfi
from litellm.llms.openai.completion.transformation import OpenAITextCompletionConfig
from .llms.groq.chat.transformation import GroqChatConfig
from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig
from .llms.infinity.embedding.transformation import InfinityEmbeddingConfig
from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
from .llms.mistral.mistral_chat_transformation import MistralConfig
from .llms.openai.responses.transformation import OpenAIResponsesAPIConfig

View file

@ -304,6 +304,11 @@ def create_assistants(
"response_format": response_format,
}
# only send params that are not None
create_assistant_data = {
k: v for k, v in create_assistant_data.items() if v is not None
}
response: Optional[Union[Coroutine[Any, Any, Assistant], Assistant]] = None
if custom_llm_provider == "openai":
api_base = (

View file

@ -21,6 +21,10 @@ DEFAULT_MAX_TOKENS = 256 # used when providers need a default
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET = 1024
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET = 2048
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET = 4096
########## Networking constants ##############################################################
_DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600 # 1 hour, re-use the same httpx client for 1 hour

View file

@ -45,6 +45,14 @@ class SpanAttributes:
"""
The name of the model being used.
"""
LLM_PROVIDER = "llm.provider"
"""
The provider of the model, such as OpenAI, Azure, Google, etc.
"""
LLM_SYSTEM = "llm.system"
"""
The AI product as identified by the client or server
"""
LLM_PROMPTS = "llm.prompts"
"""
Prompts provided to a completions API.
@ -65,15 +73,40 @@ class SpanAttributes:
"""
Number of tokens in the prompt.
"""
LLM_TOKEN_COUNT_PROMPT_DETAILS_CACHE_WRITE = "llm.token_count.prompt_details.cache_write"
"""
Number of tokens in the prompt that were written to cache.
"""
LLM_TOKEN_COUNT_PROMPT_DETAILS_CACHE_READ = "llm.token_count.prompt_details.cache_read"
"""
Number of tokens in the prompt that were read from cache.
"""
LLM_TOKEN_COUNT_PROMPT_DETAILS_AUDIO = "llm.token_count.prompt_details.audio"
"""
The number of audio input tokens presented in the prompt
"""
LLM_TOKEN_COUNT_COMPLETION = "llm.token_count.completion"
"""
Number of tokens in the completion.
"""
LLM_TOKEN_COUNT_COMPLETION_DETAILS_REASONING = "llm.token_count.completion_details.reasoning"
"""
Number of tokens used for reasoning steps in the completion.
"""
LLM_TOKEN_COUNT_COMPLETION_DETAILS_AUDIO = "llm.token_count.completion_details.audio"
"""
The number of audio input tokens generated by the model
"""
LLM_TOKEN_COUNT_TOTAL = "llm.token_count.total"
"""
Total number of tokens, including both prompt and completion.
"""
LLM_TOOLS = "llm.tools"
"""
List of tools that are advertised to the LLM to be able to call
"""
TOOL_NAME = "tool.name"
"""
Name of the tool being used.
@ -112,6 +145,19 @@ class SpanAttributes:
The id of the user
"""
PROMPT_VENDOR = "prompt.vendor"
"""
The vendor or origin of the prompt, e.g. a prompt library, a specialized service, etc.
"""
PROMPT_ID = "prompt.id"
"""
A vendor-specific id used to locate the prompt.
"""
PROMPT_URL = "prompt.url"
"""
A vendor-specific url used to locate the prompt.
"""
class MessageAttributes:
"""
@ -151,6 +197,10 @@ class MessageAttributes:
The JSON string representing the arguments passed to the function
during a function call.
"""
MESSAGE_TOOL_CALL_ID = "message.tool_call_id"
"""
The id of the tool call.
"""
class MessageContentAttributes:
@ -186,6 +236,25 @@ class ImageAttributes:
"""
class AudioAttributes:
"""
Attribute-name constants for audio content: the audio file's URL,
its mime type, and its transcript.
"""
AUDIO_URL = "audio.url"
"""
The url to an audio file
"""
AUDIO_MIME_TYPE = "audio.mime_type"
"""
The mime type of the audio file
"""
AUDIO_TRANSCRIPT = "audio.transcript"
"""
The transcript of the audio file
"""
class DocumentAttributes:
"""
Attributes for a document.
@ -257,6 +326,10 @@ class ToolCallAttributes:
Attributes for a tool call
"""
TOOL_CALL_ID = "tool_call.id"
"""
The id of the tool call.
"""
TOOL_CALL_FUNCTION_NAME = "tool_call.function.name"
"""
The name of function that is being called during a tool call.
@ -268,6 +341,18 @@ class ToolCallAttributes:
"""
class ToolAttributes:
"""
Attribute-name constants for a tool (currently only its JSON schema).
"""
TOOL_JSON_SCHEMA = "tool.json_schema"
"""
The json schema of a tool input, It is RECOMMENDED that this be in the
OpenAI tool calling format: https://platform.openai.com/docs/assistants/tools
"""
class OpenInferenceSpanKindValues(Enum):
TOOL = "TOOL"
CHAIN = "CHAIN"
@ -284,3 +369,21 @@ class OpenInferenceSpanKindValues(Enum):
class OpenInferenceMimeTypeValues(Enum):
TEXT = "text/plain"
JSON = "application/json"
class OpenInferenceLLMSystemValues(Enum):
"""
LLM system identifiers, each stored as a lowercase string.
NOTE(review): presumably the values intended for the `llm.system` span attribute - confirm.
"""
OPENAI = "openai"
ANTHROPIC = "anthropic"
COHERE = "cohere"
MISTRALAI = "mistralai"
VERTEXAI = "vertexai"
class OpenInferenceLLMProviderValues(Enum):
"""
LLM provider identifiers, each stored as a lowercase string.
NOTE(review): presumably the values intended for the `llm.provider` span attribute - confirm.
"""
OPENAI = "openai"
ANTHROPIC = "anthropic"
COHERE = "cohere"
MISTRALAI = "mistralai"
GOOGLE = "google"
AZURE = "azure"
AWS = "aws"

View file

@ -0,0 +1,3 @@
from .agentops import AgentOps
__all__ = ["AgentOps"]

View file

@ -0,0 +1,118 @@
"""
AgentOps integration for LiteLLM - Provides OpenTelemetry tracing for LLM calls
"""
import os
from dataclasses import dataclass
from typing import Optional, Dict, Any
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
@dataclass
class AgentOpsConfig:
    """Settings used to export LiteLLM traces to AgentOps."""

    # OTLP/HTTP endpoint that receives the exported traces.
    endpoint: str = "https://otlp.agentops.cloud/v1/traces"
    # AgentOps API key; None means no key is configured.
    api_key: Optional[str] = None
    service_name: Optional[str] = None
    deployment_environment: Optional[str] = None
    # Endpoint the API key is posted to in order to obtain an auth token.
    auth_endpoint: str = "https://api.agentops.ai/v3/auth/token"

    @classmethod
    def from_env(cls):
        """
        Build a config from environment variables.

        Reads AGENTOPS_API_KEY (no default), AGENTOPS_SERVICE_NAME
        (default "agentops") and AGENTOPS_ENVIRONMENT (default "production").
        The two endpoints are always the fixed AgentOps URLs.
        """
        env = os.environ
        settings = {
            "endpoint": "https://otlp.agentops.cloud/v1/traces",
            "api_key": env.get("AGENTOPS_API_KEY"),
            "service_name": env.get("AGENTOPS_SERVICE_NAME", "agentops"),
            "deployment_environment": env.get("AGENTOPS_ENVIRONMENT", "production"),
            "auth_endpoint": "https://api.agentops.ai/v3/auth/token",
        }
        return cls(**settings)
class AgentOps(OpenTelemetry):
    """
    AgentOps integration - built on top of OpenTelemetry

    Example usage:
    ```python
    import litellm

    litellm.success_callback = ["agentops"]

    response = litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hello, how are you?"}],
    )
    ```
    """

    def __init__(
        self,
        config: Optional[AgentOpsConfig] = None,
    ):
        """
        Set up the OTLP/HTTP exporter for AgentOps.

        Args:
            config: AgentOps settings; when None they are read from the
                environment via `AgentOpsConfig.from_env()`.
        """
        if config is None:
            config = AgentOpsConfig.from_env()

        # Prefetch JWT token for authentication
        jwt_token = None
        project_id = None
        if config.api_key:
            try:
                response = self._fetch_auth_token(config.api_key, config.auth_endpoint)
                jwt_token = response.get("token")
                project_id = response.get("project_id")
            except Exception as e:
                # Authentication stays best-effort (the logger must not crash
                # the caller), but the failure is reported instead of being
                # swallowed: without a token no Authorization header is sent.
                from litellm._logging import verbose_logger

                verbose_logger.warning(
                    f"AgentOps: failed to fetch auth token, continuing without Authorization header: {e}"
                )

        headers = f"Authorization=Bearer {jwt_token}" if jwt_token else None

        otel_config = OpenTelemetryConfig(
            exporter="otlp_http",
            endpoint=config.endpoint,
            headers=headers,
        )

        # Initialize OpenTelemetry with our config
        super().__init__(
            config=otel_config,
            callback_name="agentops",
        )

        # Set AgentOps-specific resource attributes
        resource_attrs = {
            "service.name": config.service_name or "litellm",
            "deployment.environment": config.deployment_environment or "production",
            "telemetry.sdk.name": "agentops",
        }

        if project_id:
            resource_attrs["project.id"] = project_id

        self.resource_attributes = resource_attrs

    def _fetch_auth_token(self, api_key: str, auth_endpoint: str) -> Dict[str, Any]:
        """
        Fetch JWT authentication token from AgentOps API

        Args:
            api_key: AgentOps API key
            auth_endpoint: Authentication endpoint

        Returns:
            Dict containing JWT token and project ID

        Raises:
            Exception: if the endpoint answers with a non-200 status code.
        """
        headers = {
            "Content-Type": "application/json",
            "Connection": "keep-alive",
        }

        client = _get_httpx_client()
        try:
            response = client.post(
                url=auth_endpoint,
                headers=headers,
                json={"api_key": api_key},
                timeout=10,
            )

            if response.status_code != 200:
                raise Exception(f"Failed to fetch auth token: {response.text}")

            return response.json()
        finally:
            # Always release the client, including on the error path.
            client.close()

View file

@ -1,3 +1,4 @@
import json
from typing import TYPE_CHECKING, Any, Optional, Union
from litellm._logging import verbose_logger
@ -12,36 +13,141 @@ else:
Span = Any
def set_attributes(span: Span, kwargs, response_obj):
def cast_as_primitive_value_type(value) -> Union[str, bool, int, float]:
    """
    Coerce a value into an OTEL-supported primitive for Arize/Phoenix observability.

    str, bool, int and float values are returned unchanged. None becomes "".
    Anything else is converted with str(); if that conversion raises, "" is returned.
    """
    if isinstance(value, (str, bool, int, float)):
        return value
    if value is None:
        return ""
    try:
        stringified = str(value)
    except Exception:
        stringified = ""
    return stringified
def safe_set_attribute(span: Span, key: str, value: Any):
    """
    Set `key` on `span` after converting `value` with
    `cast_as_primitive_value_type`, so only OTEL-compliant primitives
    reach the span (Arize/Phoenix).
    """
    span.set_attribute(key, cast_as_primitive_value_type(value))
def set_attributes(span: Span, kwargs, response_obj): # noqa: PLR0915
"""
Populates span with OpenInference-compliant LLM attributes for Arize and Phoenix tracing.
"""
from litellm.integrations._types.open_inference import (
MessageAttributes,
OpenInferenceSpanKindValues,
SpanAttributes,
ToolCallAttributes,
)
try:
optional_params = kwargs.get("optional_params", {})
litellm_params = kwargs.get("litellm_params", {})
standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object"
)
if standard_logging_payload is None:
raise ValueError("standard_logging_object not found in kwargs")
#############################################
############ LLM CALL METADATA ##############
#############################################
if standard_logging_payload and (
metadata := standard_logging_payload["metadata"]
):
span.set_attribute(SpanAttributes.METADATA, safe_dumps(metadata))
# Set custom metadata for observability and trace enrichment.
metadata = (
standard_logging_payload.get("metadata")
if standard_logging_payload
else None
)
if metadata is not None:
safe_set_attribute(span, SpanAttributes.METADATA, safe_dumps(metadata))
#############################################
########## LLM Request Attributes ###########
#############################################
# The name of the LLM a request is being made to
# The name of the LLM a request is being made to.
if kwargs.get("model"):
span.set_attribute(SpanAttributes.LLM_MODEL_NAME, kwargs.get("model"))
safe_set_attribute(
span,
SpanAttributes.LLM_MODEL_NAME,
kwargs.get("model"),
)
span.set_attribute(
# The LLM request type.
safe_set_attribute(
span,
"llm.request.type",
standard_logging_payload["call_type"],
)
# The Generative AI Provider: Azure, OpenAI, etc.
safe_set_attribute(
span,
SpanAttributes.LLM_PROVIDER,
litellm_params.get("custom_llm_provider", "Unknown"),
)
# The maximum number of tokens the LLM generates for a request.
if optional_params.get("max_tokens"):
safe_set_attribute(
span,
"llm.request.max_tokens",
optional_params.get("max_tokens"),
)
# The temperature setting for the LLM request.
if optional_params.get("temperature"):
safe_set_attribute(
span,
"llm.request.temperature",
optional_params.get("temperature"),
)
# The top_p sampling setting for the LLM request.
if optional_params.get("top_p"):
safe_set_attribute(
span,
"llm.request.top_p",
optional_params.get("top_p"),
)
# Indicates whether response is streamed.
safe_set_attribute(
span,
"llm.is_streaming",
str(optional_params.get("stream", False)),
)
# Logs the user ID if present.
if optional_params.get("user"):
safe_set_attribute(
span,
"llm.user",
optional_params.get("user"),
)
# The unique identifier for the completion.
if response_obj and response_obj.get("id"):
safe_set_attribute(span, "llm.response.id", response_obj.get("id"))
# The model used to generate the response.
if response_obj and response_obj.get("model"):
safe_set_attribute(
span,
"llm.response.model",
response_obj.get("model"),
)
# Required by OpenInference to mark span as LLM kind.
safe_set_attribute(
span,
SpanAttributes.OPENINFERENCE_SPAN_KIND,
OpenInferenceSpanKindValues.LLM.value,
)
@ -50,77 +156,132 @@ def set_attributes(span: Span, kwargs, response_obj):
# for /chat/completions
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
if messages:
span.set_attribute(
last_message = messages[-1]
safe_set_attribute(
span,
SpanAttributes.INPUT_VALUE,
messages[-1].get("content", ""), # get the last message for input
last_message.get("content", ""),
)
# LLM_INPUT_MESSAGES shows up under `input_messages` tab on the span page
# LLM_INPUT_MESSAGES shows up under `input_messages` tab on the span page.
for idx, msg in enumerate(messages):
# Set the role per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_ROLE}",
msg["role"],
prefix = f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}"
# Set the role per message.
safe_set_attribute(
span, f"{prefix}.{MessageAttributes.MESSAGE_ROLE}", msg.get("role")
)
# Set the content per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_CONTENT}",
# Set the content per message.
safe_set_attribute(
span,
f"{prefix}.{MessageAttributes.MESSAGE_CONTENT}",
msg.get("content", ""),
)
if standard_logging_payload and (
model_params := standard_logging_payload["model_parameters"]
):
# Capture tools (function definitions) used in the LLM call.
tools = optional_params.get("tools")
if tools:
for idx, tool in enumerate(tools):
function = tool.get("function")
if not function:
continue
prefix = f"{SpanAttributes.LLM_TOOLS}.{idx}"
safe_set_attribute(
span, f"{prefix}.{SpanAttributes.TOOL_NAME}", function.get("name")
)
safe_set_attribute(
span,
f"{prefix}.{SpanAttributes.TOOL_DESCRIPTION}",
function.get("description"),
)
safe_set_attribute(
span,
f"{prefix}.{SpanAttributes.TOOL_PARAMETERS}",
json.dumps(function.get("parameters")),
)
# Capture tool calls made during function-calling LLM flows.
functions = optional_params.get("functions")
if functions:
for idx, function in enumerate(functions):
prefix = f"{MessageAttributes.MESSAGE_TOOL_CALLS}.{idx}"
safe_set_attribute(
span,
f"{prefix}.{ToolCallAttributes.TOOL_CALL_FUNCTION_NAME}",
function.get("name"),
)
# Capture invocation parameters and user ID if available.
model_params = (
standard_logging_payload.get("model_parameters")
if standard_logging_payload
else None
)
if model_params:
# The Generative AI Provider: Azure, OpenAI, etc.
span.set_attribute(
SpanAttributes.LLM_INVOCATION_PARAMETERS, safe_dumps(model_params)
safe_set_attribute(
span,
SpanAttributes.LLM_INVOCATION_PARAMETERS,
safe_dumps(model_params),
)
if model_params.get("user"):
user_id = model_params.get("user")
if user_id is not None:
span.set_attribute(SpanAttributes.USER_ID, user_id)
safe_set_attribute(span, SpanAttributes.USER_ID, user_id)
#############################################
########## LLM Response Attributes ##########
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
#############################################
if hasattr(response_obj, "get"):
for choice in response_obj.get("choices", []):
response_message = choice.get("message", {})
span.set_attribute(
SpanAttributes.OUTPUT_VALUE, response_message.get("content", "")
)
# This shows up under `output_messages` tab on the span page
# This code assumes a single response
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_ROLE}",
response_message.get("role"),
)
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_CONTENT}",
# Captures response tokens, message, and content.
if hasattr(response_obj, "get"):
for idx, choice in enumerate(response_obj.get("choices", [])):
response_message = choice.get("message", {})
safe_set_attribute(
span,
SpanAttributes.OUTPUT_VALUE,
response_message.get("content", ""),
)
usage = response_obj.get("usage")
# This shows up under `output_messages` tab on the span page.
prefix = f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.{idx}"
safe_set_attribute(
span,
f"{prefix}.{MessageAttributes.MESSAGE_ROLE}",
response_message.get("role"),
)
safe_set_attribute(
span,
f"{prefix}.{MessageAttributes.MESSAGE_CONTENT}",
response_message.get("content", ""),
)
# Token usage info.
usage = response_obj and response_obj.get("usage")
if usage:
span.set_attribute(
safe_set_attribute(
span,
SpanAttributes.LLM_TOKEN_COUNT_TOTAL,
usage.get("total_tokens"),
)
# The number of tokens used in the LLM response (completion).
span.set_attribute(
safe_set_attribute(
span,
SpanAttributes.LLM_TOKEN_COUNT_COMPLETION,
usage.get("completion_tokens"),
)
# The number of tokens used in the LLM prompt.
span.set_attribute(
safe_set_attribute(
span,
SpanAttributes.LLM_TOKEN_COUNT_PROMPT,
usage.get("prompt_tokens"),
)
pass
except Exception as e:
verbose_logger.error(f"Error setting arize attributes: {e}")
verbose_logger.error(
f"[Arize/Phoenix] Failed to set OpenInference span attributes: {e}"
)
if hasattr(span, "record_exception"):
span.record_exception(e)

View file

@ -13,10 +13,15 @@ import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
import httpx
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.integrations.datadog.datadog import DataDogLogger
from litellm.litellm_core_utils.prompt_templates.common_utils import (
handle_any_messages_to_chat_completion_str_messages_conversion,
)
from litellm.llms.custom_httpx.http_handler import (
get_async_httpx_client,
httpxSpecialProvider,
@ -106,7 +111,6 @@ class DataDogLLMObsLogger(DataDogLogger, CustomBatchLogger):
},
)
response.raise_for_status()
if response.status_code != 202:
raise Exception(
f"DataDogLLMObs: Unexpected response - status_code: {response.status_code}, text: {response.text}"
@ -116,6 +120,10 @@ class DataDogLLMObsLogger(DataDogLogger, CustomBatchLogger):
f"DataDogLLMObs: Successfully sent batch - status_code: {response.status_code}"
)
self.log_queue.clear()
except httpx.HTTPStatusError as e:
verbose_logger.exception(
f"DataDogLLMObs: Error sending batch - {e.response.text}"
)
except Exception as e:
verbose_logger.exception(f"DataDogLLMObs: Error sending batch - {str(e)}")
@ -133,7 +141,11 @@ class DataDogLLMObsLogger(DataDogLogger, CustomBatchLogger):
metadata = kwargs.get("litellm_params", {}).get("metadata", {})
input_meta = InputMeta(messages=messages) # type: ignore
input_meta = InputMeta(
messages=handle_any_messages_to_chat_completion_str_messages_conversion(
messages
)
)
output_meta = OutputMeta(messages=self._get_response_messages(response_obj))
meta = Meta(

View file

@ -221,6 +221,8 @@ def get_supported_openai_params( # noqa: PLR0915
return litellm.PredibaseConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "voyage":
return litellm.VoyageEmbeddingConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "infinity":
return litellm.InfinityEmbeddingConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "triton":
if request_type == "embeddings":
return litellm.TritonEmbeddingConfig().get_supported_openai_params(

View file

@ -28,6 +28,7 @@ from litellm._logging import _is_debugging_on, verbose_logger
from litellm.batches.batch_utils import _handle_completed_batch
from litellm.caching.caching import DualCache, InMemoryCache
from litellm.caching.caching_handler import LLMCachingHandler
from litellm.constants import (
DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
@ -36,6 +37,7 @@ from litellm.cost_calculator import (
RealtimeAPITokenUsageProcessor,
_select_model_name_for_cost_calc,
)
from litellm.integrations.agentops import AgentOps
from litellm.integrations.anthropic_cache_control_hook import AnthropicCacheControlHook
from litellm.integrations.arize.arize import ArizeLogger
from litellm.integrations.custom_guardrail import CustomGuardrail
@ -2685,7 +2687,15 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
"""
try:
custom_logger_init_args = custom_logger_init_args or {}
if logging_integration == "lago":
if logging_integration == "agentops": # Add AgentOps initialization
for callback in _in_memory_loggers:
if isinstance(callback, AgentOps):
return callback # type: ignore
agentops_logger = AgentOps()
_in_memory_loggers.append(agentops_logger)
return agentops_logger # type: ignore
elif logging_integration == "lago":
for callback in _in_memory_loggers:
if isinstance(callback, LagoLogger):
return callback # type: ignore

View file

@ -265,8 +265,10 @@ def generic_cost_per_token(
)
## CALCULATE OUTPUT COST
text_tokens = usage.completion_tokens
text_tokens = 0
audio_tokens = 0
reasoning_tokens = 0
is_text_tokens_total = False
if usage.completion_tokens_details is not None:
audio_tokens = (
cast(
@ -280,9 +282,20 @@ def generic_cost_per_token(
Optional[int],
getattr(usage.completion_tokens_details, "text_tokens", None),
)
or usage.completion_tokens # default to completion tokens, if this field is not set
or 0 # default to completion tokens, if this field is not set
)
reasoning_tokens = (
cast(
Optional[int],
getattr(usage.completion_tokens_details, "reasoning_tokens", 0),
)
or 0
)
if text_tokens == 0:
text_tokens = usage.completion_tokens
if text_tokens == usage.completion_tokens:
is_text_tokens_total = True
## TEXT COST
completion_cost = float(text_tokens) * completion_base_cost
@ -290,12 +303,26 @@ def generic_cost_per_token(
"output_cost_per_audio_token"
)
_output_cost_per_reasoning_token: Optional[float] = model_info.get(
"output_cost_per_reasoning_token"
)
## AUDIO COST
if (
_output_cost_per_audio_token is not None
and audio_tokens is not None
and audio_tokens > 0
):
if not is_text_tokens_total and audio_tokens is not None and audio_tokens > 0:
_output_cost_per_audio_token = (
_output_cost_per_audio_token
if _output_cost_per_audio_token is not None
else completion_base_cost
)
completion_cost += float(audio_tokens) * _output_cost_per_audio_token
## REASONING COST
if not is_text_tokens_total and reasoning_tokens and reasoning_tokens > 0:
_output_cost_per_reasoning_token = (
_output_cost_per_reasoning_token
if _output_cost_per_reasoning_token is not None
else completion_base_cost
)
completion_cost += float(reasoning_tokens) * _output_cost_per_reasoning_token
return prompt_cost, completion_cost

View file

@ -14,6 +14,7 @@ from litellm.types.llms.openai import ChatCompletionThinkingBlock
from litellm.types.utils import (
ChatCompletionDeltaToolCall,
ChatCompletionMessageToolCall,
ChatCompletionRedactedThinkingBlock,
Choices,
Delta,
EmbeddingResponse,
@ -486,7 +487,14 @@ def convert_to_model_response_object( # noqa: PLR0915
)
# Handle thinking models that display `thinking_blocks` within `content`
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[
ChatCompletionThinkingBlock,
ChatCompletionRedactedThinkingBlock,
]
]
] = None
if "thinking_blocks" in choice["message"]:
thinking_blocks = choice["message"]["thinking_blocks"]
provider_specific_fields["thinking_blocks"] = thinking_blocks

View file

@ -75,6 +75,10 @@ class ModelParamHelper:
combined_kwargs = combined_kwargs.difference(exclude_kwargs)
return combined_kwargs
@staticmethod
def get_litellm_provider_specific_params_for_chat_params() -> Set[str]:
    """Return the names of provider-specific chat params (currently only "thinking")."""
    provider_specific_params: Set[str] = {"thinking"}
    return provider_specific_params
@staticmethod
def _get_litellm_supported_chat_completion_kwargs() -> Set[str]:
"""
@ -82,11 +86,18 @@ class ModelParamHelper:
This follows the OpenAI API Spec
"""
all_chat_completion_kwargs = set(
non_streaming_params: Set[str] = set(
getattr(CompletionCreateParamsNonStreaming, "__annotations__", {}).keys()
).union(
set(getattr(CompletionCreateParamsStreaming, "__annotations__", {}).keys())
)
streaming_params: Set[str] = set(
getattr(CompletionCreateParamsStreaming, "__annotations__", {}).keys()
)
litellm_provider_specific_params: Set[str] = (
ModelParamHelper.get_litellm_provider_specific_params_for_chat_params()
)
all_chat_completion_kwargs: Set[str] = non_streaming_params.union(
streaming_params
).union(litellm_provider_specific_params)
return all_chat_completion_kwargs
@staticmethod

View file

@ -6,7 +6,7 @@ import io
import mimetypes
import re
from os import PathLike
from typing import Dict, List, Literal, Mapping, Optional, Union, cast
from typing import Any, Dict, List, Literal, Mapping, Optional, Union, cast
from litellm.types.llms.openai import (
AllMessageValues,
@ -32,6 +32,35 @@ DEFAULT_ASSISTANT_CONTINUE_MESSAGE = ChatCompletionAssistantMessage(
)
def handle_any_messages_to_chat_completion_str_messages_conversion(
    messages: Any,
) -> List[Dict[str, str]]:
    """
    Handles any messages to chat completion str messages conversion

    - list: converted with `handle_messages_with_content_list_to_str_conversion`;
      if that raises, each item becomes `{"input": <JSON or str of the item>}`.
    - dict: `[{"input": <JSON or str of the dict>}]`.
    - str: `[{"input": messages}]`.
    - anything else: `[{"input": str(messages)}]`.

    Relevant Issue: https://github.com/BerriAI/litellm/issues/9494
    """
    import json

    def _as_input_entry(value: Any) -> Dict[str, str]:
        # `default=str` only covers unserializable *values*; json.dumps still
        # raises on e.g. non-string dict keys or circular references, so fall
        # back to str() rather than letting a logging helper raise.
        try:
            return {"input": json.dumps(value, default=str)}
        except Exception:
            return {"input": str(value)}

    if isinstance(messages, list):
        try:
            return cast(
                List[Dict[str, str]],
                handle_messages_with_content_list_to_str_conversion(messages),
            )
        except Exception:
            return [_as_input_entry(message) for message in messages]
    if isinstance(messages, dict):
        return [_as_input_entry(messages)]
    if isinstance(messages, str):
        return [{"input": messages}]
    return [{"input": str(messages)}]
def handle_messages_with_content_list_to_str_conversion(
messages: List[AllMessageValues],
) -> List[AllMessageValues]:
@ -471,3 +500,59 @@ def unpack_defs(schema, defs):
unpack_defs(ref, defs)
value["items"] = ref
continue
def _get_image_mime_type_from_url(url: str) -> Optional[str]:
    """
    Get mime type for common image URLs
    See gemini mime types: https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/image-understanding#image-requirements

    Supported by Gemini:
     application/pdf
     audio/mpeg
     audio/mp3
     audio/wav
     image/png
     image/jpeg
     image/webp
     text/plain
     video/mov
     video/mpeg
     video/mp4
     video/mpg
     video/avi
     video/wmv
     video/mpegps
     video/flv

    The match is a case-insensitive suffix test on the whole URL, so a URL
    with anything after the extension (query string, fragment) returns None.

    Returns:
        The mime type for the URL's file extension, or None if unrecognised.
    """
    url = url.lower()

    # Ordered (extensions, mime type) pairs; the first matching group wins.
    mime_types = (
        # Images
        ((".jpg", ".jpeg"), "image/jpeg"),
        ((".png",), "image/png"),
        ((".webp",), "image/webp"),
        # Videos
        ((".mp4",), "video/mp4"),
        ((".mov",), "video/mov"),
        ((".mpeg", ".mpg"), "video/mpeg"),
        ((".avi",), "video/avi"),
        ((".wmv",), "video/wmv"),
        ((".mpegps",), "video/mpegps"),
        ((".flv",), "video/flv"),
        # Audio
        # NOTE: no ".mpeg" -> "audio/mpeg" entry: ".mpeg" is already claimed by
        # "video/mpeg" above, so such an entry could never be reached.
        ((".mp3",), "audio/mp3"),
        ((".wav",), "audio/wav"),
        # Documents
        ((".pdf",), "application/pdf"),
        ((".txt",), "text/plain"),
    )

    # str.endswith accepts a tuple, so each extension group is one check.
    for extensions, mime_type in mime_types:
        if url.endswith(extensions):
            return mime_type

    return None

View file

@ -2258,6 +2258,14 @@ def _parse_content_type(content_type: str) -> str:
return m.get_content_type()
def _parse_mime_type(base64_data: str) -> Optional[str]:
    """
    Return the mime type from a string starting with `data:<mime>;base64`.

    Returns None when the string does not start with that pattern.
    """
    matched = re.match(r"data:(.*?);base64", base64_data)
    return matched.group(1) if matched else None
class BedrockImageProcessor:
"""Handles both sync and async image processing for Bedrock conversations."""

View file

@ -29,6 +29,7 @@ from litellm.types.llms.anthropic import (
UsageDelta,
)
from litellm.types.llms.openai import (
ChatCompletionRedactedThinkingBlock,
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
)
@ -501,18 +502,19 @@ class ModelResponseIterator:
) -> Tuple[
str,
Optional[ChatCompletionToolCallChunk],
List[ChatCompletionThinkingBlock],
List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]],
Dict[str, Any],
]:
"""
Helper function to handle the content block delta
"""
text = ""
tool_use: Optional[ChatCompletionToolCallChunk] = None
provider_specific_fields = {}
content_block = ContentBlockDelta(**chunk) # type: ignore
thinking_blocks: List[ChatCompletionThinkingBlock] = []
thinking_blocks: List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
] = []
self.content_blocks.append(content_block)
if "text" in content_block["delta"]:
@ -541,20 +543,25 @@ class ModelResponseIterator:
)
]
provider_specific_fields["thinking_blocks"] = thinking_blocks
return text, tool_use, thinking_blocks, provider_specific_fields
def _handle_reasoning_content(
self, thinking_blocks: List[ChatCompletionThinkingBlock]
self,
thinking_blocks: List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
],
) -> Optional[str]:
"""
Handle the reasoning content
"""
reasoning_content = None
for block in thinking_blocks:
thinking_content = cast(Optional[str], block.get("thinking"))
if reasoning_content is None:
reasoning_content = ""
if "thinking" in block:
reasoning_content += block["thinking"]
if thinking_content is not None:
reasoning_content += thinking_content
return reasoning_content
def chunk_parser(self, chunk: dict) -> ModelResponseStream:
@ -567,7 +574,13 @@ class ModelResponseIterator:
usage: Optional[Usage] = None
provider_specific_fields: Dict[str, Any] = {}
reasoning_content: Optional[str] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[
ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock
]
]
] = None
index = int(chunk.get("index", 0))
if type_chunk == "content_block_delta":
@ -605,6 +618,15 @@ class ModelResponseIterator:
},
"index": self.tool_index,
}
elif (
content_block_start["content_block"]["type"] == "redacted_thinking"
):
thinking_blocks = [
ChatCompletionRedactedThinkingBlock(
type="redacted_thinking",
data=content_block_start["content_block"]["data"],
)
]
elif type_chunk == "content_block_stop":
ContentBlockStop(**chunk) # type: ignore
# check if tool call content block

View file

@ -7,6 +7,9 @@ import httpx
import litellm
from litellm.constants import (
DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
RESPONSE_FORMAT_TOOL_NAME,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
@ -27,6 +30,7 @@ from litellm.types.llms.openai import (
REASONING_EFFORT,
AllMessageValues,
ChatCompletionCachedContent,
ChatCompletionRedactedThinkingBlock,
ChatCompletionSystemMessage,
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
@ -276,11 +280,20 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig):
if reasoning_effort is None:
return None
elif reasoning_effort == "low":
return AnthropicThinkingParam(type="enabled", budget_tokens=1024)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
)
elif reasoning_effort == "medium":
return AnthropicThinkingParam(type="enabled", budget_tokens=2048)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
)
elif reasoning_effort == "high":
return AnthropicThinkingParam(type="enabled", budget_tokens=4096)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
)
else:
raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}")
@ -563,13 +576,21 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig):
) -> Tuple[
str,
Optional[List[Any]],
Optional[List[ChatCompletionThinkingBlock]],
Optional[
List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
],
Optional[str],
List[ChatCompletionToolCallChunk],
]:
text_content = ""
citations: Optional[List[Any]] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
] = None
reasoning_content: Optional[str] = None
tool_calls: List[ChatCompletionToolCallChunk] = []
for idx, content in enumerate(completion_response["content"]):
@ -588,20 +609,30 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig):
index=idx,
)
)
## CITATIONS
if content.get("citations", None) is not None:
if citations is None:
citations = []
citations.append(content["citations"])
if content.get("thinking", None) is not None:
elif content.get("thinking", None) is not None:
if thinking_blocks is None:
thinking_blocks = []
thinking_blocks.append(cast(ChatCompletionThinkingBlock, content))
elif content["type"] == "redacted_thinking":
if thinking_blocks is None:
thinking_blocks = []
thinking_blocks.append(
cast(ChatCompletionRedactedThinkingBlock, content)
)
## CITATIONS
if content.get("citations") is not None:
if citations is None:
citations = []
citations.append(content["citations"])
if thinking_blocks is not None:
reasoning_content = ""
for block in thinking_blocks:
if "thinking" in block:
reasoning_content += block["thinking"]
thinking_content = cast(Optional[str], block.get("thinking"))
if thinking_content is not None:
reasoning_content += thinking_content
return text_content, citations, thinking_blocks, reasoning_content, tool_calls
def calculate_usage(
@ -691,7 +722,13 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig):
else:
text_content = ""
citations: Optional[List[Any]] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[
ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock
]
]
] = None
reasoning_content: Optional[str] = None
tool_calls: List[ChatCompletionToolCallChunk] = []

View file

@ -1,6 +1,6 @@
"""
- call /messages on Anthropic API
- Make streaming + non-streaming request - just pass it through direct to Anthropic. No need to do anything special here
- Make streaming + non-streaming request - just pass it through direct to Anthropic. No need to do anything special here
- Ensure requests are logged in the DB - stream + non-stream
"""
@ -43,7 +43,9 @@ class AnthropicMessagesHandler:
from litellm.proxy.pass_through_endpoints.success_handler import (
PassThroughEndpointLogging,
)
from litellm.proxy.pass_through_endpoints.types import EndpointType
from litellm.types.passthrough_endpoints.pass_through_endpoints import (
EndpointType,
)
# Create success handler object
passthrough_success_handler_obj = PassThroughEndpointLogging()
@ -98,11 +100,11 @@ async def anthropic_messages(
api_base=optional_params.api_base,
api_key=optional_params.api_key,
)
anthropic_messages_provider_config: Optional[
BaseAnthropicMessagesConfig
] = ProviderConfigManager.get_provider_anthropic_messages_config(
model=model,
provider=litellm.LlmProviders(_custom_llm_provider),
anthropic_messages_provider_config: Optional[BaseAnthropicMessagesConfig] = (
ProviderConfigManager.get_provider_anthropic_messages_config(
model=model,
provider=litellm.LlmProviders(_custom_llm_provider),
)
)
if anthropic_messages_provider_config is None:
raise ValueError(

View file

@ -288,6 +288,7 @@ class AzureAssistantsAPI(BaseAzureLLM):
timeout=timeout,
max_retries=max_retries,
client=client,
litellm_params=litellm_params,
)
thread_message: OpenAIMessage = openai_client.beta.threads.messages.create( # type: ignore

View file

@ -1,11 +1,14 @@
from typing import TYPE_CHECKING, Any, Optional, cast
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, cast
import httpx
import litellm
from litellm._logging import verbose_logger
from litellm.llms.openai.responses.transformation import OpenAIResponsesAPIConfig
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import *
from litellm.types.responses.main import *
from litellm.types.router import GenericLiteLLMParams
from litellm.utils import _add_path_to_api_base
if TYPE_CHECKING:
@ -41,11 +44,7 @@ class AzureOpenAIResponsesAPIConfig(OpenAIResponsesAPIConfig):
def get_complete_url(
self,
api_base: Optional[str],
api_key: Optional[str],
model: str,
optional_params: dict,
litellm_params: dict,
stream: Optional[bool] = None,
) -> str:
"""
Constructs a complete URL for the API request.
@ -92,3 +91,48 @@ class AzureOpenAIResponsesAPIConfig(OpenAIResponsesAPIConfig):
final_url = httpx.URL(new_url).copy_with(params=query_params)
return str(final_url)
#########################################################
########## DELETE RESPONSE API TRANSFORMATION ##############
#########################################################
def transform_delete_response_api_request(
    self,
    response_id: str,
    api_base: str,
    litellm_params: GenericLiteLLMParams,
    headers: dict,
) -> Tuple[str, Dict]:
    """
    Build the URL and request body for deleting a stored response.

    Azure OpenAI API expects the following request:
    - DELETE /openai/responses/{response_id}?api-version=xxx

    `api_base` may already carry a query string (e.g. the api-version), so
    the response id is appended to the path component only; scheme, host,
    params, query and fragment are carried over untouched.

    `litellm_params` and `headers` are accepted for interface parity but are
    not read here.

    Returns:
        A `(delete_url, data)` tuple, where `data` is always an empty dict.
    """
    from urllib.parse import urlparse, urlunparse

    url_parts = urlparse(api_base)

    # Strip a trailing slash first so the join never produces "//".
    base_path = url_parts.path.rstrip("/")
    url_with_id = url_parts._replace(path=f"{base_path}/{response_id}")

    delete_url = urlunparse(url_with_id)
    data: Dict = {}
    verbose_logger.debug(f"delete response url={delete_url}")
    return delete_url, data

View file

@ -1,3 +1,4 @@
import enum
from typing import Any, List, Optional, Tuple, cast
from urllib.parse import urlparse
@ -19,6 +20,10 @@ from litellm.types.utils import ModelResponse, ProviderField
from litellm.utils import _add_path_to_api_base, supports_tool_choice
class AzureFoundryErrorStrings(str, enum.Enum):
    """Substrings looked for in Azure AI error response text."""

    SET_EXTRA_PARAMETERS_TO_PASS_THROUGH = "Set extra-parameters to 'pass-through'"
class AzureAIStudioConfig(OpenAIConfig):
def get_supported_openai_params(self, model: str) -> List:
model_supports_tool_choice = True # azure ai supports this by default
@ -240,12 +245,18 @@ class AzureAIStudioConfig(OpenAIConfig):
) -> bool:
should_drop_params = litellm_params.get("drop_params") or litellm.drop_params
error_text = e.response.text
if should_drop_params and "Extra inputs are not permitted" in error_text:
return True
elif (
"unknown field: parameter index is not a valid field" in error_text
): # remove index from tool calls
return True
elif (
AzureFoundryErrorStrings.SET_EXTRA_PARAMETERS_TO_PASS_THROUGH.value
in error_text
): # remove extra-parameters from tool calls
return True
return super().should_retry_llm_api_inside_llm_translation_on_http_error(
e=e, litellm_params=litellm_params
)
@ -265,5 +276,46 @@ class AzureAIStudioConfig(OpenAIConfig):
litellm.remove_index_from_tool_calls(
messages=_messages,
)
elif (
AzureFoundryErrorStrings.SET_EXTRA_PARAMETERS_TO_PASS_THROUGH.value
in e.response.text
):
request_data = self._drop_extra_params_from_request_data(
request_data, e.response.text
)
data = drop_params_from_unprocessable_entity_error(e=e, data=request_data)
return data
def _drop_extra_params_from_request_data(
    self, request_data: dict, error_text: str
) -> dict:
    """
    Remove from `request_data` every key that `error_text` names as rejected.

    The parameter names are parsed out of the error text by
    `_extract_params_to_drop_from_error_text`. `request_data` is mutated in
    place and the same dict is returned.
    """
    rejected_params = self._extract_params_to_drop_from_error_text(error_text)
    # `pop` with a default is a no-op for keys that are not present.
    for rejected in rejected_params or ():
        request_data.pop(rejected, None)
    return request_data
def _extract_params_to_drop_from_error_text(
self, error_text: str
) -> Optional[List[str]]:
"""
Error text looks like this"
"Extra parameters ['stream_options', 'extra-parameters'] are not allowed when extra-parameters is not set or set to be 'error'.
"""
import re
# Extract parameters within square brackets
match = re.search(r"\[(.*?)\]", error_text)
if not match:
return []
# Parse the extracted string into a list of parameter names
params_str = match.group(1)
params = []
for param in params_str.split(","):
# Clean up the parameter name (remove quotes, spaces)
clean_param = param.strip().strip("'").strip('"')
if clean_param:
params.append(clean_param)
return params

View file

@ -1,6 +1,6 @@
import types
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
import httpx
@ -10,6 +10,7 @@ from litellm.types.llms.openai import (
ResponsesAPIResponse,
ResponsesAPIStreamingResponse,
)
from litellm.types.responses.main import *
from litellm.types.router import GenericLiteLLMParams
if TYPE_CHECKING:
@ -73,11 +74,7 @@ class BaseResponsesAPIConfig(ABC):
def get_complete_url(
self,
api_base: Optional[str],
api_key: Optional[str],
model: str,
optional_params: dict,
litellm_params: dict,
stream: Optional[bool] = None,
) -> str:
"""
OPTIONAL
@ -122,6 +119,31 @@ class BaseResponsesAPIConfig(ABC):
"""
pass
#########################################################
########## DELETE RESPONSE API TRANSFORMATION ##############
#########################################################
@abstractmethod
def transform_delete_response_api_request(
    self,
    response_id: str,
    api_base: str,
    litellm_params: GenericLiteLLMParams,
    headers: dict,
) -> Tuple[str, Dict]:
    """
    Build the provider-specific request for deleting the response `response_id`.

    Implementations return a two-item tuple: the URL to send the request to
    and the dict to use as its body.
    """
    ...
@abstractmethod
def transform_delete_response_api_response(
    self,
    raw_response: httpx.Response,
    logging_obj: LiteLLMLoggingObj,
) -> DeleteResponseResult:
    """
    Convert the provider's raw HTTP response to a delete call into a
    `DeleteResponseResult`.
    """
    ...
#########################################################
########## END DELETE RESPONSE API TRANSFORMATION ##########
#########################################################
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:

View file

@ -22,6 +22,7 @@ from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMExcepti
from litellm.types.llms.bedrock import *
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionRedactedThinkingBlock,
ChatCompletionResponseMessage,
ChatCompletionSystemMessage,
ChatCompletionThinkingBlock,
@ -375,25 +376,27 @@ class AmazonConverseConfig(BaseConfig):
system_content_blocks: List[SystemContentBlock] = []
for idx, message in enumerate(messages):
if message["role"] == "system":
_system_content_block: Optional[SystemContentBlock] = None
_cache_point_block: Optional[SystemContentBlock] = None
if isinstance(message["content"], str) and len(message["content"]) > 0:
_system_content_block = SystemContentBlock(text=message["content"])
_cache_point_block = self._get_cache_point_block(
system_prompt_indices.append(idx)
if isinstance(message["content"], str) and message["content"]:
system_content_blocks.append(
SystemContentBlock(text=message["content"])
)
cache_block = self._get_cache_point_block(
message, block_type="system"
)
if cache_block:
system_content_blocks.append(cache_block)
elif isinstance(message["content"], list):
for m in message["content"]:
if m.get("type", "") == "text" and len(m["text"]) > 0:
_system_content_block = SystemContentBlock(text=m["text"])
_cache_point_block = self._get_cache_point_block(
if m.get("type") == "text" and m.get("text"):
system_content_blocks.append(
SystemContentBlock(text=m["text"])
)
cache_block = self._get_cache_point_block(
m, block_type="system"
)
if _system_content_block is not None:
system_content_blocks.append(_system_content_block)
if _cache_point_block is not None:
system_content_blocks.append(_cache_point_block)
system_prompt_indices.append(idx)
if cache_block:
system_content_blocks.append(cache_block)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
@ -627,9 +630,11 @@ class AmazonConverseConfig(BaseConfig):
def _transform_thinking_blocks(
self, thinking_blocks: List[BedrockConverseReasoningContentBlock]
) -> List[ChatCompletionThinkingBlock]:
) -> List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]]:
"""Return a consistent format for thinking blocks between Anthropic and Bedrock."""
thinking_blocks_list: List[ChatCompletionThinkingBlock] = []
thinking_blocks_list: List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
] = []
for block in thinking_blocks:
if "reasoningText" in block:
_thinking_block = ChatCompletionThinkingBlock(type="thinking")
@ -640,6 +645,11 @@ class AmazonConverseConfig(BaseConfig):
if _signature is not None:
_thinking_block["signature"] = _signature
thinking_blocks_list.append(_thinking_block)
elif "redactedContent" in block:
_redacted_block = ChatCompletionRedactedThinkingBlock(
type="redacted_thinking", data=block["redactedContent"]
)
thinking_blocks_list.append(_redacted_block)
return thinking_blocks_list
def _transform_usage(self, usage: ConverseTokenUsageBlock) -> Usage:

View file

@ -50,6 +50,7 @@ from litellm.llms.custom_httpx.http_handler import (
)
from litellm.types.llms.bedrock import *
from litellm.types.llms.openai import (
ChatCompletionRedactedThinkingBlock,
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
@ -1255,19 +1256,33 @@ class AWSEventStreamDecoder:
def translate_thinking_blocks(
self, thinking_block: BedrockConverseReasoningContentBlockDelta
) -> Optional[List[ChatCompletionThinkingBlock]]:
) -> Optional[
List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]]
]:
"""
Translate the thinking blocks to a string
"""
thinking_blocks_list: List[ChatCompletionThinkingBlock] = []
_thinking_block = ChatCompletionThinkingBlock(type="thinking")
thinking_blocks_list: List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
] = []
_thinking_block: Optional[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
] = None
if "text" in thinking_block:
_thinking_block = ChatCompletionThinkingBlock(type="thinking")
_thinking_block["thinking"] = thinking_block["text"]
elif "signature" in thinking_block:
_thinking_block = ChatCompletionThinkingBlock(type="thinking")
_thinking_block["signature"] = thinking_block["signature"]
_thinking_block["thinking"] = "" # consistent with anthropic response
thinking_blocks_list.append(_thinking_block)
elif "redactedContent" in thinking_block:
_thinking_block = ChatCompletionRedactedThinkingBlock(
type="redacted_thinking", data=thinking_block["redactedContent"]
)
if _thinking_block is not None:
thinking_blocks_list.append(_thinking_block)
return thinking_blocks_list
def converse_chunk_parser(self, chunk_data: dict) -> ModelResponseStream:
@ -1279,31 +1294,44 @@ class AWSEventStreamDecoder:
usage: Optional[Usage] = None
provider_specific_fields: dict = {}
reasoning_content: Optional[str] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[
ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock
]
]
] = None
index = int(chunk_data.get("contentBlockIndex", 0))
if "start" in chunk_data:
start_obj = ContentBlockStartEvent(**chunk_data["start"])
self.content_blocks = [] # reset
if (
start_obj is not None
and "toolUse" in start_obj
and start_obj["toolUse"] is not None
):
## check tool name was formatted by litellm
_response_tool_name = start_obj["toolUse"]["name"]
response_tool_name = get_bedrock_tool_name(
response_tool_name=_response_tool_name
)
tool_use = {
"id": start_obj["toolUse"]["toolUseId"],
"type": "function",
"function": {
"name": response_tool_name,
"arguments": "",
},
"index": index,
}
if start_obj is not None:
if "toolUse" in start_obj and start_obj["toolUse"] is not None:
## check tool name was formatted by litellm
_response_tool_name = start_obj["toolUse"]["name"]
response_tool_name = get_bedrock_tool_name(
response_tool_name=_response_tool_name
)
tool_use = {
"id": start_obj["toolUse"]["toolUseId"],
"type": "function",
"function": {
"name": response_tool_name,
"arguments": "",
},
"index": index,
}
elif (
"reasoningContent" in start_obj
and start_obj["reasoningContent"] is not None
): # redacted thinking can be in start object
thinking_blocks = self.translate_thinking_blocks(
start_obj["reasoningContent"]
)
provider_specific_fields = {
"reasoningContent": start_obj["reasoningContent"],
}
elif "delta" in chunk_data:
delta_obj = ContentBlockDeltaEvent(**chunk_data["delta"])
self.content_blocks.append(delta_obj)

View file

@ -650,6 +650,49 @@ class HTTPHandler:
except Exception as e:
raise e
def delete(
    self,
    url: str,
    data: Optional[Union[dict, str]] = None,  # type: ignore
    json: Optional[dict] = None,
    params: Optional[dict] = None,
    headers: Optional[dict] = None,
    timeout: Optional[Union[float, httpx.Timeout]] = None,
    stream: bool = False,
):
    """
    Send a DELETE request through this handler's client and return the response.

    The response is checked with `raise_for_status()`, so non-success status
    codes surface as `httpx.HTTPStatusError`. Before that error is re-raised
    it is decorated with `message`, `text` (both passed through
    `mask_sensitive_info`) and `status_code` attributes.

    Raises:
        litellm.Timeout: when the request times out.
        httpx.HTTPStatusError: for error status codes (decorated as above).
    """
    request_kwargs = {
        "data": data,
        "json": json,
        "params": params,
        "headers": headers,
    }
    # The timeout argument is left out entirely when the caller gave none.
    if timeout is not None:
        request_kwargs["timeout"] = timeout

    try:
        req = self.client.build_request("DELETE", url, **request_kwargs)  # type: ignore
        response = self.client.send(req, stream=stream)
        response.raise_for_status()
        return response
    except httpx.TimeoutException:
        raise litellm.Timeout(
            message=f"Connection timed out after {timeout} seconds.",
            model="default-model-name",
            llm_provider="litellm-httpx-handler",
        )
    except httpx.HTTPStatusError as e:
        if stream is not True:
            error_text = mask_sensitive_info(e.response.text)
            setattr(e, "message", error_text)
            setattr(e, "text", error_text)
        else:
            # Streamed bodies have to be read explicitly before use.
            setattr(e, "message", mask_sensitive_info(e.response.read()))
            setattr(e, "text", mask_sensitive_info(e.response.read()))

        setattr(e, "status_code", e.response.status_code)
        raise e
def __del__(self) -> None:
try:
self.close()

View file

@ -36,6 +36,7 @@ from litellm.types.llms.openai import (
ResponsesAPIResponse,
)
from litellm.types.rerank import OptionalRerankParams, RerankResponse
from litellm.types.responses.main import DeleteResponseResult
from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import EmbeddingResponse, FileTypes, TranscriptionResponse
from litellm.utils import CustomStreamWrapper, ModelResponse, ProviderConfigManager
@ -229,13 +230,17 @@ class BaseLLMHTTPHandler:
api_key: Optional[str] = None,
headers: Optional[dict] = {},
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
provider_config: Optional[BaseConfig] = None,
):
json_mode: bool = optional_params.pop("json_mode", False)
extra_body: Optional[dict] = optional_params.pop("extra_body", None)
fake_stream = fake_stream or optional_params.pop("fake_stream", False)
provider_config = ProviderConfigManager.get_provider_chat_config(
model=model, provider=litellm.LlmProviders(custom_llm_provider)
provider_config = (
provider_config
or ProviderConfigManager.get_provider_chat_config(
model=model, provider=litellm.LlmProviders(custom_llm_provider)
)
)
if provider_config is None:
raise ValueError(
@ -1011,6 +1016,7 @@ class BaseLLMHTTPHandler:
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
_is_async: bool = False,
fake_stream: bool = False,
litellm_metadata: Optional[Dict[str, Any]] = None,
) -> Union[
ResponsesAPIResponse,
BaseResponsesAPIStreamingIterator,
@ -1037,6 +1043,7 @@ class BaseLLMHTTPHandler:
timeout=timeout,
client=client if isinstance(client, AsyncHTTPHandler) else None,
fake_stream=fake_stream,
litellm_metadata=litellm_metadata,
)
if client is None or not isinstance(client, HTTPHandler):
@ -1060,11 +1067,7 @@ class BaseLLMHTTPHandler:
api_base = responses_api_provider_config.get_complete_url(
api_base=litellm_params.api_base,
api_key=litellm_params.api_key,
model=model,
optional_params=response_api_optional_request_params,
litellm_params=dict(litellm_params),
stream=stream,
)
data = responses_api_provider_config.transform_responses_api_request(
@ -1109,6 +1112,8 @@ class BaseLLMHTTPHandler:
model=model,
logging_obj=logging_obj,
responses_api_provider_config=responses_api_provider_config,
litellm_metadata=litellm_metadata,
custom_llm_provider=custom_llm_provider,
)
return SyncResponsesAPIStreamingIterator(
@ -1116,6 +1121,8 @@ class BaseLLMHTTPHandler:
model=model,
logging_obj=logging_obj,
responses_api_provider_config=responses_api_provider_config,
litellm_metadata=litellm_metadata,
custom_llm_provider=custom_llm_provider,
)
else:
# For non-streaming requests
@ -1152,6 +1159,7 @@ class BaseLLMHTTPHandler:
timeout: Optional[Union[float, httpx.Timeout]] = None,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
fake_stream: bool = False,
litellm_metadata: Optional[Dict[str, Any]] = None,
) -> Union[ResponsesAPIResponse, BaseResponsesAPIStreamingIterator]:
"""
Async version of the responses API handler.
@ -1179,11 +1187,7 @@ class BaseLLMHTTPHandler:
api_base = responses_api_provider_config.get_complete_url(
api_base=litellm_params.api_base,
api_key=litellm_params.api_key,
model=model,
optional_params=response_api_optional_request_params,
litellm_params=dict(litellm_params),
stream=stream,
)
data = responses_api_provider_config.transform_responses_api_request(
@ -1230,6 +1234,8 @@ class BaseLLMHTTPHandler:
model=model,
logging_obj=logging_obj,
responses_api_provider_config=responses_api_provider_config,
litellm_metadata=litellm_metadata,
custom_llm_provider=custom_llm_provider,
)
# Return the streaming iterator
@ -1238,6 +1244,8 @@ class BaseLLMHTTPHandler:
model=model,
logging_obj=logging_obj,
responses_api_provider_config=responses_api_provider_config,
litellm_metadata=litellm_metadata,
custom_llm_provider=custom_llm_provider,
)
else:
# For non-streaming, proceed as before
@ -1261,6 +1269,163 @@ class BaseLLMHTTPHandler:
logging_obj=logging_obj,
)
async def async_delete_response_api_handler(
    self,
    response_id: str,
    responses_api_provider_config: BaseResponsesAPIConfig,
    litellm_params: GenericLiteLLMParams,
    logging_obj: LiteLLMLoggingObj,
    custom_llm_provider: Optional[str],
    extra_headers: Optional[Dict[str, Any]] = None,
    extra_body: Optional[Dict[str, Any]] = None,
    timeout: Optional[Union[float, httpx.Timeout]] = None,
    client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
    _is_async: bool = False,
) -> DeleteResponseResult:
    """
    Async version of the delete response API handler.
    Uses async HTTP client to make requests.

    Steps: pick an async client, build headers and the base URL from the
    provider config, let the provider config turn them into the final
    delete URL + body, log the call, send the DELETE, and hand the raw
    response back to the provider config for conversion.

    Args:
        response_id: Id of the response to delete.
        responses_api_provider_config: Provider config that validates the
            environment and transforms the request/response.
        litellm_params: Source of `api_key`, `api_base` and `ssl_verify`.
        logging_obj: Logging object; `pre_call` is invoked before sending.
        custom_llm_provider: Provider name, used only when a new async
            client has to be created.
        extra_headers: Extra headers merged on top of the validated ones.
        extra_body: Accepted for signature parity; not read here.
        timeout: Timeout forwarded to the DELETE call.
        client: Reused only if it is an `AsyncHTTPHandler`; otherwise a new
            async client is obtained.
        _is_async: Accepted for signature parity; not read here.

    Raises:
        The exception built by `self._handle_error` when the DELETE call fails.
    """
    if client is None or not isinstance(client, AsyncHTTPHandler):
        async_httpx_client = get_async_httpx_client(
            llm_provider=litellm.LlmProviders(custom_llm_provider),
            params={"ssl_verify": litellm_params.get("ssl_verify", None)},
        )
    else:
        async_httpx_client = client

    headers = responses_api_provider_config.validate_environment(
        api_key=litellm_params.api_key,
        headers=extra_headers or {},
        model="None",
    )

    if extra_headers:
        headers.update(extra_headers)

    api_base = responses_api_provider_config.get_complete_url(
        api_base=litellm_params.api_base,
        litellm_params=dict(litellm_params),
    )

    url, data = responses_api_provider_config.transform_delete_response_api_request(
        response_id=response_id,
        api_base=api_base,
        litellm_params=litellm_params,
        headers=headers,
    )

    ## LOGGING
    # There is no `input` local in this function, so `input=input` would
    # log the Python builtin; log the id being deleted instead.
    logging_obj.pre_call(
        input=response_id,
        api_key="",
        additional_args={
            "complete_input_dict": data,
            "api_base": api_base,
            "headers": headers,
        },
    )

    try:
        response = await async_httpx_client.delete(
            url=url, headers=headers, data=json.dumps(data), timeout=timeout
        )
    except Exception as e:
        raise self._handle_error(
            e=e,
            provider_config=responses_api_provider_config,
        )

    return responses_api_provider_config.transform_delete_response_api_response(
        raw_response=response,
        logging_obj=logging_obj,
    )
def delete_response_api_handler(
    self,
    response_id: str,
    responses_api_provider_config: BaseResponsesAPIConfig,
    litellm_params: GenericLiteLLMParams,
    logging_obj: LiteLLMLoggingObj,
    custom_llm_provider: Optional[str],
    extra_headers: Optional[Dict[str, Any]] = None,
    extra_body: Optional[Dict[str, Any]] = None,
    timeout: Optional[Union[float, httpx.Timeout]] = None,
    client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
    _is_async: bool = False,
) -> Union[DeleteResponseResult, Coroutine[Any, Any, DeleteResponseResult]]:
    """
    Delete response API handler (sync entry point).

    When `_is_async` is true, the un-awaited coroutine from
    `async_delete_response_api_handler` is returned and nothing else runs
    here. Otherwise the request is sent with a sync HTTP client: headers and
    base URL come from the provider config, the provider config builds the
    final delete URL + body, the call is logged, the DELETE is sent, and the
    raw response is converted by the provider config.

    Args:
        response_id: Id of the response to delete.
        responses_api_provider_config: Provider config that validates the
            environment and transforms the request/response.
        litellm_params: Source of `api_key`, `api_base` and `ssl_verify`.
        logging_obj: Logging object; `pre_call` is invoked before sending.
        custom_llm_provider: Forwarded to the async handler.
        extra_headers: Extra headers merged on top of the validated ones.
        extra_body: Forwarded to the async handler; not read on the sync path.
        timeout: Timeout forwarded to the DELETE call.
        client: Reused on the sync path only if it is an `HTTPHandler`.
        _is_async: Selects the async path.

    Raises:
        The exception built by `self._handle_error` when the DELETE call fails.
    """
    if _is_async:
        return self.async_delete_response_api_handler(
            response_id=response_id,
            responses_api_provider_config=responses_api_provider_config,
            litellm_params=litellm_params,
            logging_obj=logging_obj,
            custom_llm_provider=custom_llm_provider,
            extra_headers=extra_headers,
            extra_body=extra_body,
            timeout=timeout,
            client=client,
        )

    if client is None or not isinstance(client, HTTPHandler):
        sync_httpx_client = _get_httpx_client(
            params={"ssl_verify": litellm_params.get("ssl_verify", None)}
        )
    else:
        sync_httpx_client = client

    headers = responses_api_provider_config.validate_environment(
        api_key=litellm_params.api_key,
        headers=extra_headers or {},
        model="None",
    )

    if extra_headers:
        headers.update(extra_headers)

    api_base = responses_api_provider_config.get_complete_url(
        api_base=litellm_params.api_base,
        litellm_params=dict(litellm_params),
    )

    url, data = responses_api_provider_config.transform_delete_response_api_request(
        response_id=response_id,
        api_base=api_base,
        litellm_params=litellm_params,
        headers=headers,
    )

    ## LOGGING
    # There is no `input` local in this function, so `input=input` would
    # log the Python builtin; log the id being deleted instead.
    logging_obj.pre_call(
        input=response_id,
        api_key="",
        additional_args={
            "complete_input_dict": data,
            "api_base": api_base,
            "headers": headers,
        },
    )

    try:
        response = sync_httpx_client.delete(
            url=url, headers=headers, data=json.dumps(data), timeout=timeout
        )
    except Exception as e:
        raise self._handle_error(
            e=e,
            provider_config=responses_api_provider_config,
        )

    return responses_api_provider_config.transform_delete_response_api_response(
        raw_response=response,
        logging_obj=logging_obj,
    )
def create_file(
self,
create_file_data: CreateFileRequest,

View file

@ -37,6 +37,7 @@ from litellm.types.llms.databricks import (
)
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionRedactedThinkingBlock,
ChatCompletionThinkingBlock,
ChatCompletionToolChoiceFunctionParam,
ChatCompletionToolChoiceObjectParam,
@ -314,13 +315,24 @@ class DatabricksConfig(DatabricksBase, OpenAILikeChatConfig, AnthropicConfig):
@staticmethod
def extract_reasoning_content(
content: Optional[AllDatabricksContentValues],
) -> Tuple[Optional[str], Optional[List[ChatCompletionThinkingBlock]]]:
) -> Tuple[
Optional[str],
Optional[
List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
],
]:
"""
Extract and return the reasoning content and thinking blocks
"""
if content is None:
return None, None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
thinking_blocks: Optional[
List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
] = None
reasoning_content: Optional[str] = None
if isinstance(content, list):
for item in content:

View file

@ -1,15 +1,33 @@
from typing import List, Literal, Optional, Tuple, Union, cast
import json
import uuid
from typing import Any, List, Literal, Optional, Tuple, Union, cast
import httpx
import litellm
from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.litellm_core_utils.llm_response_utils.get_headers import (
get_response_headers,
)
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionImageObject,
ChatCompletionToolParam,
OpenAIChatCompletionToolParam,
)
from litellm.types.utils import ProviderSpecificModelInfo
from litellm.types.utils import (
ChatCompletionMessageToolCall,
Choices,
Function,
Message,
ModelResponse,
ProviderSpecificModelInfo,
)
from ...openai.chat.gpt_transformation import OpenAIGPTConfig
from ..common_utils import FireworksAIException
class FireworksAIConfig(OpenAIGPTConfig):
@ -219,6 +237,94 @@ class FireworksAIConfig(OpenAIGPTConfig):
headers=headers,
)
def _handle_message_content_with_tool_calls(
    self,
    message: Message,
    tool_calls: Optional[List[ChatCompletionToolParam]],
) -> Message:
    """
    Fireworks AI sends tool calls in the content field instead of tool_calls
    Relevant Issue: https://github.com/BerriAI/litellm/issues/7209#issuecomment-2813208780

    If tools were requested, the message has content and no tool calls, and
    the content parses as a JSON function call naming one of the requested
    tools, the content is moved into `message.tool_calls` (with a generated
    id) and `message.content` is cleared. In every other case - including
    any parsing failure - the message is returned as received.
    """
    nothing_to_convert = (
        tool_calls is None
        or message.content is None
        or message.tool_calls is not None
    )
    if nothing_to_convert:
        return message

    try:
        parsed_function = Function(**json.loads(message.content))
        # The response-format helper tool is never surfaced as a tool call.
        if parsed_function.name == RESPONSE_FORMAT_TOOL_NAME:
            return message
        requested_tool_names = [tool["function"]["name"] for tool in tool_calls]
        if parsed_function.name in requested_tool_names:
            message.tool_calls = [
                ChatCompletionMessageToolCall(
                    function=parsed_function, id=str(uuid.uuid4()), type="function"
                )
            ]
            message.content = None
    except Exception:
        # Best effort: content that is not a tool-call payload is left alone.
        pass

    return message
def transform_response(
    self,
    model: str,
    raw_response: httpx.Response,
    model_response: ModelResponse,
    logging_obj: LiteLLMLoggingObj,
    request_data: dict,
    messages: List[AllMessageValues],
    optional_params: dict,
    litellm_params: dict,
    encoding: Any,
    api_key: Optional[str] = None,
    json_mode: Optional[bool] = None,
) -> ModelResponse:
    """
    Turn a raw Fireworks AI HTTP response into a `ModelResponse`.

    The raw text is logged via `logging_obj.post_call`, the JSON body is
    loaded into a new `ModelResponse`, the model name (when present) gets a
    "fireworks_ai/" prefix, each choice's message is passed through
    `_handle_message_content_with_tool_calls`, and the response headers are
    stored under `_hidden_params["additional_headers"]`.

    Raises:
        FireworksAIException: when the response body cannot be parsed as JSON.
    """
    ## LOGGING
    logging_obj.post_call(
        input=messages,
        api_key=api_key,
        original_response=raw_response.text,
        additional_args={"complete_input_dict": request_data},
    )

    ## RESPONSE OBJECT
    try:
        completion_response = raw_response.json()
    except Exception as e:
        raise FireworksAIException(
            message="Unable to get json response - {}, Original Response: {}".format(
                str(e), raw_response.text
            ),
            status_code=raw_response.status_code,
            headers=getattr(raw_response, "headers", None),
        )

    additional_headers = get_response_headers(dict(raw_response.headers))

    response = ModelResponse(**completion_response)
    if response.model is not None:
        response.model = "fireworks_ai/" + response.model

    ## FIREWORKS AI sends tool calls in the content field instead of tool_calls
    requested_tools = optional_params.get("tools", None)
    for raw_choice in response.choices:
        typed_choice = cast(Choices, raw_choice)
        typed_choice.message = self._handle_message_content_with_tool_calls(
            message=typed_choice.message,
            tool_calls=requested_tools,
        )

    response._hidden_params = {"additional_headers": additional_headers}
    return response
def _get_openai_compatible_provider_info(
self, api_base: Optional[str], api_key: Optional[str]
) -> Tuple[Optional[str], Optional[str]]:

View file

@ -7,6 +7,7 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
)
from litellm.types.llms.openai import AllMessageValues
from litellm.types.llms.vertex_ai import ContentType, PartType
from litellm.utils import supports_reasoning
from ...vertex_ai.gemini.transformation import _gemini_convert_messages_with_history
from ...vertex_ai.gemini.vertex_and_google_ai_studio_gemini import VertexGeminiConfig
@ -67,7 +68,7 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
return super().get_config()
def get_supported_openai_params(self, model: str) -> List[str]:
return [
supported_params = [
"temperature",
"top_p",
"max_tokens",
@ -83,6 +84,10 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
"frequency_penalty",
"modalities",
]
if supports_reasoning(model):
supported_params.append("reasoning_effort")
supported_params.append("thinking")
return supported_params
def map_openai_params(
self,

View file

@ -2,9 +2,19 @@
Translate from OpenAI's `/v1/chat/completions` to VLLM's `/v1/chat/completions`
"""
from typing import Optional, Tuple
from typing import List, Optional, Tuple, cast
from litellm.litellm_core_utils.prompt_templates.common_utils import (
_get_image_mime_type_from_url,
)
from litellm.litellm_core_utils.prompt_templates.factory import _parse_mime_type
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionFileObject,
ChatCompletionVideoObject,
ChatCompletionVideoUrlObject,
)
from ....utils import _remove_additional_properties, _remove_strict_from_schema
from ...openai.chat.gpt_transformation import OpenAIGPTConfig
@ -38,3 +48,71 @@ class HostedVLLMChatConfig(OpenAIGPTConfig):
api_key or get_secret_str("HOSTED_VLLM_API_KEY") or "fake-api-key"
) # vllm does not require an api key
return api_base, dynamic_api_key
def _is_video_file(self, content_item: ChatCompletionFileObject) -> bool:
"""
Check if the file is a video
- format: video/<extension>
- file_data: base64 encoded video data
- file_id: infer mp4 from extension
"""
file = content_item.get("file", {})
format = file.get("format")
file_data = file.get("file_data")
file_id = file.get("file_id")
if content_item.get("type") != "file":
return False
if format and format.startswith("video/"):
return True
elif file_data:
mime_type = _parse_mime_type(file_data)
if mime_type and mime_type.startswith("video/"):
return True
elif file_id:
mime_type = _get_image_mime_type_from_url(file_id)
if mime_type and mime_type.startswith("video/"):
return True
return False
def _convert_file_to_video_url(
    self, content_item: ChatCompletionFileObject
) -> ChatCompletionVideoObject:
    """
    Build a `video_url` content item from a `file` content item.

    The URL is taken from `file_id` when it is set, otherwise from
    `file_data`.

    Raises:
        ValueError: if neither `file_id` nor `file_data` is populated.
    """
    file_obj = content_item.get("file", {})
    # `file_id` takes precedence over `file_data` when both are present.
    video_source = file_obj.get("file_id") or file_obj.get("file_data")
    if not video_source:
        raise ValueError("file_id or file_data is required")
    return ChatCompletionVideoObject(
        type="video_url",
        video_url=ChatCompletionVideoUrlObject(url=video_source),
    )
def _transform_messages(
    self, messages: List[AllMessageValues], model: str
) -> List[AllMessageValues]:
    """
    Translate video files given as file_id / file_data into `video_url` items.

    Only user messages whose content is a non-empty list are inspected.
    Matching items are replaced in place inside `messages`, which is then
    handed to the parent class's `_transform_messages`.
    """
    for message in messages:
        if message["role"] != "user":
            continue
        parts = message.get("content")
        if not (parts and isinstance(parts, list)):
            continue

        # Gather (index, item) pairs first, then swap them out afterwards,
        # so the list is not modified while it is being scanned.
        video_items: List[Tuple[int, ChatCompletionFileObject]] = [
            (position, cast(ChatCompletionFileObject, part))
            for position, part in enumerate(parts)
            if part.get("type") == "file"
            and self._is_video_file(cast(ChatCompletionFileObject, part))
        ]
        for position, video_item in video_items:
            parts[position] = self._convert_file_to_video_url(video_item)

    return super()._transform_messages(messages, model)

View file

@ -1,10 +1,16 @@
from typing import Union
import httpx
from litellm.llms.base_llm.chat.transformation import BaseLLMException
class InfinityError(BaseLLMException):
def __init__(self, status_code, message):
def __init__(
self,
status_code: int,
message: str,
headers: Union[dict, httpx.Headers] = {}
):
self.status_code = status_code
self.message = message
self.request = httpx.Request(
@ -16,4 +22,5 @@ class InfinityError(BaseLLMException):
message=message,
request=self.request,
response=self.response,
headers=headers,
) # Call the base class constructor with the parameters it needs

View file

@ -0,0 +1,5 @@
"""
Infinity Embedding - uses `llm_http_handler.py` to make httpx requests
Request/Response transformation is handled in `transformation.py`
"""

View file

@ -0,0 +1,141 @@
from typing import List, Optional, Union
import httpx
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.llms.base_llm.embedding.transformation import BaseEmbeddingConfig
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import AllEmbeddingInputValues, AllMessageValues
from litellm.types.utils import EmbeddingResponse, Usage
from ..common_utils import InfinityError
class InfinityEmbeddingConfig(BaseEmbeddingConfig):
    """
    Request/response transformation for Infinity embedding calls.

    Reference: https://infinity.modal.michaelfeil.eu/docs
    """

    def __init__(self) -> None:
        pass

    def get_complete_url(
        self,
        api_base: Optional[str],
        api_key: Optional[str],
        model: str,
        optional_params: dict,
        litellm_params: dict,
        stream: Optional[bool] = None,
    ) -> str:
        """
        Return `api_base` normalized so that it ends in `/embeddings`.

        Raises:
            ValueError: if `api_base` is None.
        """
        if api_base is None:
            raise ValueError("api_base is required for Infinity embeddings")
        # Remove trailing slashes and ensure clean base URL
        api_base = api_base.rstrip("/")
        if not api_base.endswith("/embeddings"):
            api_base = f"{api_base}/embeddings"
        return api_base

    def validate_environment(
        self,
        headers: dict,
        model: str,
        messages: List[AllMessageValues],
        optional_params: dict,
        litellm_params: dict,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
    ) -> dict:
        """
        Build the request headers.

        The API key falls back to the `INFINITY_API_KEY` secret when not
        passed in. Caller-supplied `headers` override every default,
        including `Authorization`.
        """
        if api_key is None:
            api_key = get_secret_str("INFINITY_API_KEY")

        default_headers: dict = {}
        # Only advertise a bearer token when one is actually available;
        # otherwise the header would literally read "Bearer None".
        if api_key:
            default_headers["Authorization"] = f"Bearer {api_key}"
        default_headers["accept"] = "application/json"
        default_headers["Content-Type"] = "application/json"

        # Caller headers win on any key collision (Authorization included).
        return {**default_headers, **headers}

    def get_supported_openai_params(self, model: str) -> list:
        """Return the OpenAI embedding params this config knows how to map."""
        return [
            "encoding_format",
            "modality",
            "dimensions",
        ]

    def map_openai_params(
        self,
        non_default_params: dict,
        optional_params: dict,
        model: str,
        drop_params: bool,
    ) -> dict:
        """
        Map OpenAI params to Infinity params

        `encoding_format` and `modality` are copied as-is; `dimensions` is
        sent as `output_dimension`. `optional_params` is updated in place
        and returned.

        Reference: https://infinity.modal.michaelfeil.eu/docs
        """
        if "encoding_format" in non_default_params:
            optional_params["encoding_format"] = non_default_params["encoding_format"]
        if "modality" in non_default_params:
            optional_params["modality"] = non_default_params["modality"]
        if "dimensions" in non_default_params:
            optional_params["output_dimension"] = non_default_params["dimensions"]
        return optional_params

    def transform_embedding_request(
        self,
        model: str,
        input: AllEmbeddingInputValues,
        optional_params: dict,
        headers: dict,
    ) -> dict:
        """Return the request body: `input`, `model`, plus all optional params."""
        return {
            "input": input,
            "model": model,
            **optional_params,
        }

    def transform_embedding_response(
        self,
        model: str,
        raw_response: httpx.Response,
        model_response: EmbeddingResponse,
        logging_obj: LiteLLMLoggingObj,
        api_key: Optional[str] = None,
        # NOTE: the dict defaults below are never mutated in this method.
        request_data: dict = {},
        optional_params: dict = {},
        litellm_params: dict = {},
    ) -> EmbeddingResponse:
        """
        Copy the fields of the Infinity JSON response onto `model_response`.

        Raises:
            InfinityError: if the response body cannot be decoded as JSON.
        """
        try:
            raw_response_json = raw_response.json()
        except Exception as e:
            # Keep the response headers and the original cause on the error.
            raise InfinityError(
                message=raw_response.text,
                status_code=raw_response.status_code,
                headers=raw_response.headers,
            ) from e

        model_response.model = raw_response_json.get("model")
        model_response.data = raw_response_json.get("data")
        model_response.object = raw_response_json.get("object")

        # `usage` may be absent or explicitly null; treat both as "no usage".
        usage_data = raw_response_json.get("usage") or {}
        model_response.usage = Usage(
            prompt_tokens=usage_data.get("prompt_tokens", 0),
            total_tokens=usage_data.get("total_tokens", 0),
        )
        return model_response

    def get_error_class(
        self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
    ) -> BaseLLMException:
        """Wrap an error message, status code and headers in an `InfinityError`."""
        return InfinityError(
            message=error_message, status_code=status_code, headers=headers
        )

View file

@ -22,7 +22,7 @@ from litellm.types.rerank import (
RerankTokens,
)
from .common_utils import InfinityError
from ..common_utils import InfinityError
class InfinityRerankConfig(CohereRerankConfig):

View file

@ -13,6 +13,7 @@ class LiteLLMProxyChatConfig(OpenAIGPTConfig):
def get_supported_openai_params(self, model: str) -> List:
    """
    Return the parent config's supported params plus the two reasoning
    controls: `thinking` and `reasoning_effort`.
    """
    supported_params = super().get_supported_openai_params(model)
    supported_params.extend(["thinking", "reasoning_effort"])
    return supported_params
def _map_openai_params(

View file

@ -7,6 +7,7 @@ from litellm._logging import verbose_logger
from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import *
from litellm.types.responses.main import *
from litellm.types.router import GenericLiteLLMParams
from ..common_utils import OpenAIError
@ -110,11 +111,7 @@ class OpenAIResponsesAPIConfig(BaseResponsesAPIConfig):
def get_complete_url(
self,
api_base: Optional[str],
api_key: Optional[str],
model: str,
optional_params: dict,
litellm_params: dict,
stream: Optional[bool] = None,
) -> str:
"""
Get the endpoint for OpenAI responses API
@ -190,7 +187,7 @@ class OpenAIResponsesAPIConfig(BaseResponsesAPIConfig):
model_class = event_models.get(cast(ResponsesAPIStreamEvents, event_type))
if not model_class:
raise ValueError(f"Unknown event type: {event_type}")
return GenericEvent
return model_class
@ -217,3 +214,39 @@ class OpenAIResponsesAPIConfig(BaseResponsesAPIConfig):
f"Error getting model info in OpenAIResponsesAPIConfig: {e}"
)
return False
#########################################################
########## DELETE RESPONSE API TRANSFORMATION ##############
#########################################################
def transform_delete_response_api_request(
    self,
    response_id: str,
    api_base: str,
    litellm_params: GenericLiteLLMParams,
    headers: dict,
) -> Tuple[str, Dict]:
    """
    Transform the delete response API request into a URL and data

    OpenAI API expects the following request
    - DELETE /v1/responses/{response_id}

    Returns:
        A `(url, data)` tuple; `data` is always an empty dict.
    """
    # Tolerate a trailing slash on the base so the URL never contains "//".
    url = f"{api_base.rstrip('/')}/{response_id}"
    data: Dict = {}
    return url, data
def transform_delete_response_api_response(
    self,
    raw_response: httpx.Response,
    logging_obj: LiteLLMLoggingObj,
) -> DeleteResponseResult:
    """
    Turn the raw HTTP response of a delete call into a DeleteResponseResult.

    Raises:
        OpenAIError: when the body cannot be decoded as JSON; the error
            carries the raw response text and status code.
    """
    try:
        payload = raw_response.json()
    except Exception:
        raise OpenAIError(
            message=raw_response.text, status_code=raw_response.status_code
        )
    else:
        # The decoded JSON fields are passed straight through as keyword args.
        return DeleteResponseResult(**payload)

View file

@ -201,8 +201,6 @@ class TritonGenerateConfig(TritonConfig):
"max_tokens": int(
optional_params.get("max_tokens", DEFAULT_MAX_TOKENS_FOR_TRITON)
),
"bad_words": [""],
"stop_words": [""],
},
"stream": bool(stream),
}

View file

@ -12,6 +12,9 @@ from pydantic import BaseModel
import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.prompt_templates.common_utils import (
_get_image_mime_type_from_url,
)
from litellm.litellm_core_utils.prompt_templates.factory import (
convert_to_anthropic_image_obj,
convert_to_gemini_tool_call_invoke,
@ -99,62 +102,6 @@ def _process_gemini_image(image_url: str, format: Optional[str] = None) -> PartT
raise e
def _get_image_mime_type_from_url(url: str) -> Optional[str]:
"""
Get mime type for common image URLs
See gemini mime types: https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/image-understanding#image-requirements
Supported by Gemini:
application/pdf
audio/mpeg
audio/mp3
audio/wav
image/png
image/jpeg
image/webp
text/plain
video/mov
video/mpeg
video/mp4
video/mpg
video/avi
video/wmv
video/mpegps
video/flv
"""
url = url.lower()
# Map file extensions to mime types
mime_types = {
# Images
(".jpg", ".jpeg"): "image/jpeg",
(".png",): "image/png",
(".webp",): "image/webp",
# Videos
(".mp4",): "video/mp4",
(".mov",): "video/mov",
(".mpeg", ".mpg"): "video/mpeg",
(".avi",): "video/avi",
(".wmv",): "video/wmv",
(".mpegps",): "video/mpegps",
(".flv",): "video/flv",
# Audio
(".mp3",): "audio/mp3",
(".wav",): "audio/wav",
(".mpeg",): "audio/mpeg",
# Documents
(".pdf",): "application/pdf",
(".txt",): "text/plain",
}
# Check each extension group against the URL
for extensions, mime_type in mime_types.items():
if any(url.endswith(ext) for ext in extensions):
return mime_type
return None
def _gemini_convert_messages_with_history( # noqa: PLR0915
messages: List[AllMessageValues],
) -> List[ContentType]:
@ -269,6 +216,11 @@ def _gemini_convert_messages_with_history( # noqa: PLR0915
msg_dict = messages[msg_i] # type: ignore
assistant_msg = ChatCompletionAssistantMessage(**msg_dict) # type: ignore
_message_content = assistant_msg.get("content", None)
reasoning_content = assistant_msg.get("reasoning_content", None)
if reasoning_content is not None:
assistant_content.append(
PartType(thought=True, text=reasoning_content)
)
if _message_content is not None and isinstance(_message_content, list):
_parts = []
for element in _message_content:
@ -276,6 +228,7 @@ def _gemini_convert_messages_with_history( # noqa: PLR0915
if element["type"] == "text":
_part = PartType(text=element["text"])
_parts.append(_part)
assistant_content.extend(_parts)
elif (
_message_content is not None

View file

@ -24,6 +24,11 @@ import litellm
import litellm.litellm_core_utils
import litellm.litellm_core_utils.litellm_logging
from litellm import verbose_logger
from litellm.constants import (
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
from litellm.llms.custom_httpx.http_handler import (
@ -31,6 +36,7 @@ from litellm.llms.custom_httpx.http_handler import (
HTTPHandler,
get_async_httpx_client,
)
from litellm.types.llms.anthropic import AnthropicThinkingParam
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionResponseMessage,
@ -45,11 +51,13 @@ from litellm.types.llms.vertex_ai import (
ContentType,
FunctionCallingConfig,
FunctionDeclaration,
GeminiThinkingConfig,
GenerateContentResponseBody,
HttpxPartType,
LogprobsResult,
ToolConfig,
Tools,
UsageMetadata,
)
from litellm.types.utils import (
ChatCompletionTokenLogprob,
@ -59,7 +67,7 @@ from litellm.types.utils import (
TopLogprob,
Usage,
)
from litellm.utils import CustomStreamWrapper, ModelResponse
from litellm.utils import CustomStreamWrapper, ModelResponse, supports_reasoning
from ....utils import _remove_additional_properties, _remove_strict_from_schema
from ..common_utils import VertexAIError, _build_vertex_schema
@ -190,7 +198,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
return super().get_config()
def get_supported_openai_params(self, model: str) -> List[str]:
return [
supported_params = [
"temperature",
"top_p",
"max_tokens",
@ -210,6 +218,10 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
"top_logprobs",
"modalities",
]
if supports_reasoning(model):
supported_params.append("reasoning_effort")
supported_params.append("thinking")
return supported_params
def map_tool_choice_values(
self, model: str, tool_choice: Union[str, dict]
@ -313,10 +325,14 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
if isinstance(old_schema, list):
for item in old_schema:
if isinstance(item, dict):
item = _build_vertex_schema(parameters=item, add_property_ordering=True)
item = _build_vertex_schema(
parameters=item, add_property_ordering=True
)
elif isinstance(old_schema, dict):
old_schema = _build_vertex_schema(parameters=old_schema, add_property_ordering=True)
old_schema = _build_vertex_schema(
parameters=old_schema, add_property_ordering=True
)
return old_schema
def apply_response_schema_transformation(self, value: dict, optional_params: dict):
@ -343,6 +359,43 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
value=optional_params["response_schema"]
)
@staticmethod
def _map_reasoning_effort_to_thinking_budget(
    reasoning_effort: str,
) -> GeminiThinkingConfig:
    """
    Translate a `reasoning_effort` level into a Gemini thinking config.

    "low", "medium" and "high" each select the matching
    DEFAULT_REASONING_EFFORT_*_THINKING_BUDGET constant; thoughts are always
    included in the result.

    Raises:
        ValueError: for any other `reasoning_effort` value.
    """
    budget_by_effort = {
        "low": DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
        "medium": DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
        "high": DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
    }
    if reasoning_effort not in budget_by_effort:
        raise ValueError(f"Invalid reasoning effort: {reasoning_effort}")
    return {
        "thinkingBudget": budget_by_effort[reasoning_effort],
        "includeThoughts": True,
    }
@staticmethod
def _map_thinking_param(
thinking_param: AnthropicThinkingParam,
) -> GeminiThinkingConfig:
thinking_enabled = thinking_param.get("type") == "enabled"
thinking_budget = thinking_param.get("budget_tokens")
params: GeminiThinkingConfig = {}
if thinking_enabled:
params["includeThoughts"] = True
if thinking_budget is not None and isinstance(thinking_budget, int):
params["thinkingBudget"] = thinking_budget
return params
def map_openai_params(
self,
non_default_params: Dict,
@ -399,6 +452,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
optional_params["tool_choice"] = _tool_choice_value
elif param == "seed":
optional_params["seed"] = value
elif param == "reasoning_effort" and isinstance(value, str):
optional_params[
"thinkingConfig"
] = VertexGeminiConfig._map_reasoning_effort_to_thinking_budget(value)
elif param == "thinking":
optional_params[
"thinkingConfig"
] = VertexGeminiConfig._map_thinking_param(
cast(AnthropicThinkingParam, value)
)
elif param == "modalities" and isinstance(value, list):
response_modalities = []
for modality in value:
@ -514,19 +577,28 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
def get_assistant_content_message(
self, parts: List[HttpxPartType]
) -> Optional[str]:
_content_str = ""
) -> Tuple[Optional[str], Optional[str]]:
content_str: Optional[str] = None
reasoning_content_str: Optional[str] = None
for part in parts:
_content_str = ""
if "text" in part:
_content_str += part["text"]
elif "inlineData" in part: # base64 encoded image
_content_str += "data:{};base64,{}".format(
part["inlineData"]["mimeType"], part["inlineData"]["data"]
)
if len(_content_str) > 0:
if part.get("thought") is True:
if reasoning_content_str is None:
reasoning_content_str = ""
reasoning_content_str += _content_str
else:
if content_str is None:
content_str = ""
content_str += _content_str
if _content_str:
return _content_str
return None
return content_str, reasoning_content_str
def _transform_parts(
self,
@ -669,6 +741,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
return model_response
def is_candidate_token_count_inclusive(self, usage_metadata: UsageMetadata) -> bool:
    """
    Tell whether candidatesTokenCount already contains the thinking tokens.

    When promptTokenCount + candidatesTokenCount adds up to totalTokenCount,
    the candidate count is treated as inclusive of thinking tokens; otherwise
    it is treated as exclusive. Missing counts are read as 0.

    Addresses - https://github.com/BerriAI/litellm/pull/10141#discussion_r2052272035
    """
    prompt_tokens = usage_metadata.get("promptTokenCount", 0)
    candidate_tokens = usage_metadata.get("candidatesTokenCount", 0)
    total_tokens = usage_metadata.get("totalTokenCount", 0)
    return prompt_tokens + candidate_tokens == total_tokens
def _calculate_usage(
self,
completion_response: GenerateContentResponseBody,
@ -677,6 +766,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
audio_tokens: Optional[int] = None
text_tokens: Optional[int] = None
prompt_tokens_details: Optional[PromptTokensDetailsWrapper] = None
reasoning_tokens: Optional[int] = None
if "cachedContentTokenCount" in completion_response["usageMetadata"]:
cached_tokens = completion_response["usageMetadata"][
"cachedContentTokenCount"
@ -687,22 +777,35 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
audio_tokens = detail["tokenCount"]
elif detail["modality"] == "TEXT":
text_tokens = detail["tokenCount"]
if "thoughtsTokenCount" in completion_response["usageMetadata"]:
reasoning_tokens = completion_response["usageMetadata"][
"thoughtsTokenCount"
]
prompt_tokens_details = PromptTokensDetailsWrapper(
cached_tokens=cached_tokens,
audio_tokens=audio_tokens,
text_tokens=text_tokens,
)
completion_tokens = completion_response["usageMetadata"].get(
"candidatesTokenCount", 0
)
if (
not self.is_candidate_token_count_inclusive(
completion_response["usageMetadata"]
)
and reasoning_tokens
):
completion_tokens = reasoning_tokens + completion_tokens
## GET USAGE ##
usage = Usage(
prompt_tokens=completion_response["usageMetadata"].get(
"promptTokenCount", 0
),
completion_tokens=completion_response["usageMetadata"].get(
"candidatesTokenCount", 0
),
completion_tokens=completion_tokens,
total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
prompt_tokens_details=prompt_tokens_details,
reasoning_tokens=reasoning_tokens,
)
return usage
@ -731,11 +834,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
citation_metadata.append(candidate["citationMetadata"])
if "parts" in candidate["content"]:
chat_completion_message[
"content"
] = VertexGeminiConfig().get_assistant_content_message(
(
content,
reasoning_content,
) = VertexGeminiConfig().get_assistant_content_message(
parts=candidate["content"]["parts"]
)
if content is not None:
chat_completion_message["content"] = content
if reasoning_content is not None:
chat_completion_message["reasoning_content"] = reasoning_content
functions, tools = self._transform_parts(
parts=candidate["content"]["parts"],

View file

@ -38,7 +38,7 @@ def generate_iam_token(api_key=None, **params) -> str:
headers = {}
headers["Content-Type"] = "application/x-www-form-urlencoded"
if api_key is None:
api_key = get_secret_str("WX_API_KEY") or get_secret_str("WATSONX_API_KEY")
api_key = get_secret_str("WX_API_KEY") or get_secret_str("WATSONX_API_KEY") or get_secret_str("WATSONX_APIKEY")
if api_key is None:
raise ValueError("API key is required")
headers["Accept"] = "application/json"

View file

@ -1435,6 +1435,7 @@ def completion( # type: ignore # noqa: PLR0915
custom_llm_provider=custom_llm_provider,
encoding=encoding,
stream=stream,
provider_config=provider_config,
)
except Exception as e:
## LOGGING - log the original exception returned
@ -1596,6 +1597,37 @@ def completion( # type: ignore # noqa: PLR0915
additional_args={"headers": headers},
)
response = _response
elif custom_llm_provider == "fireworks_ai":
## COMPLETION CALL
try:
response = base_llm_http_handler.completion(
model=model,
messages=messages,
headers=headers,
model_response=model_response,
api_key=api_key,
api_base=api_base,
acompletion=acompletion,
logging_obj=logging,
optional_params=optional_params,
litellm_params=litellm_params,
timeout=timeout, # type: ignore
client=client,
custom_llm_provider=custom_llm_provider,
encoding=encoding,
stream=stream,
provider_config=provider_config,
)
except Exception as e:
## LOGGING - log the original exception returned
logging.post_call(
input=messages,
api_key=api_key,
original_response=str(e),
additional_args={"headers": headers},
)
raise e
elif custom_llm_provider == "groq":
api_base = (
api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there
@ -3852,6 +3884,21 @@ def embedding( # noqa: PLR0915
aembedding=aembedding,
litellm_params={},
)
elif custom_llm_provider == "infinity":
response = base_llm_http_handler.embedding(
model=model,
input=input,
custom_llm_provider=custom_llm_provider,
api_base=api_base,
api_key=api_key,
logging_obj=logging,
timeout=timeout,
model_response=EmbeddingResponse(),
optional_params=optional_params,
client=client,
aembedding=aembedding,
litellm_params={},
)
elif custom_llm_provider == "watsonx":
credentials = IBMWatsonXMixin.get_watsonx_credentials(
optional_params=optional_params, api_key=api_key, api_base=api_base

View file

@ -5,6 +5,7 @@
"max_output_tokens": "max output tokens, if the provider specifies it. if not default to max_tokens",
"input_cost_per_token": 0.0000,
"output_cost_per_token": 0.000,
"output_cost_per_reasoning_token": 0.000,
"litellm_provider": "one of https://docs.litellm.ai/docs/providers",
"mode": "one of: chat, embedding, completion, image_generation, audio_transcription, audio_speech, image_generation, moderation, rerank",
"supports_function_calling": true,
@ -1471,6 +1472,72 @@
"litellm_provider": "openai",
"supported_endpoints": ["/v1/audio/speech"]
},
"azure/computer-use-preview": {
"max_tokens": 1024,
"max_input_tokens": 8192,
"max_output_tokens": 1024,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000012,
"litellm_provider": "azure",
"mode": "chat",
"supported_endpoints": ["/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": false,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_reasoning": true
},
"azure/gpt-4o-audio-preview-2024-12-17": {
"max_tokens": 16384,
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"input_cost_per_token": 0.0000025,
"input_cost_per_audio_token": 0.00004,
"output_cost_per_token": 0.00001,
"output_cost_per_audio_token": 0.00008,
"litellm_provider": "azure",
"mode": "chat",
"supported_endpoints": ["/v1/chat/completions"],
"supported_modalities": ["text", "audio"],
"supported_output_modalities": ["text", "audio"],
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_response_schema": false,
"supports_vision": false,
"supports_prompt_caching": false,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_native_streaming": true,
"supports_reasoning": false
},
"azure/gpt-4o-mini-audio-preview-2024-12-17": {
"max_tokens": 16384,
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"input_cost_per_token": 0.0000025,
"input_cost_per_audio_token": 0.00004,
"output_cost_per_token": 0.00001,
"output_cost_per_audio_token": 0.00008,
"litellm_provider": "azure",
"mode": "chat",
"supported_endpoints": ["/v1/chat/completions"],
"supported_modalities": ["text", "audio"],
"supported_output_modalities": ["text", "audio"],
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_response_schema": false,
"supports_vision": false,
"supports_prompt_caching": false,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_native_streaming": true,
"supports_reasoning": false
},
"azure/gpt-4.1": {
"max_tokens": 32768,
"max_input_tokens": 1047576,
@ -1529,6 +1596,170 @@
"search_context_size_high": 50e-3
}
},
"azure/gpt-4.1-mini": {
"max_tokens": 32768,
"max_input_tokens": 1047576,
"max_output_tokens": 32768,
"input_cost_per_token": 0.4e-6,
"output_cost_per_token": 1.6e-6,
"input_cost_per_token_batches": 0.2e-6,
"output_cost_per_token_batches": 0.8e-6,
"cache_read_input_token_cost": 0.1e-6,
"litellm_provider": "azure",
"mode": "chat",
"supported_endpoints": ["/v1/chat/completions", "/v1/batch", "/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_native_streaming": true,
"supports_web_search": true,
"search_context_cost_per_query": {
"search_context_size_low": 25e-3,
"search_context_size_medium": 27.5e-3,
"search_context_size_high": 30e-3
}
},
"azure/gpt-4.1-mini-2025-04-14": {
"max_tokens": 32768,
"max_input_tokens": 1047576,
"max_output_tokens": 32768,
"input_cost_per_token": 0.4e-6,
"output_cost_per_token": 1.6e-6,
"input_cost_per_token_batches": 0.2e-6,
"output_cost_per_token_batches": 0.8e-6,
"cache_read_input_token_cost": 0.1e-6,
"litellm_provider": "azure",
"mode": "chat",
"supported_endpoints": ["/v1/chat/completions", "/v1/batch", "/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_native_streaming": true,
"supports_web_search": true,
"search_context_cost_per_query": {
"search_context_size_low": 25e-3,
"search_context_size_medium": 27.5e-3,
"search_context_size_high": 30e-3
}
},
"azure/gpt-4.1-nano": {
"max_tokens": 32768,
"max_input_tokens": 1047576,
"max_output_tokens": 32768,
"input_cost_per_token": 0.1e-6,
"output_cost_per_token": 0.4e-6,
"input_cost_per_token_batches": 0.05e-6,
"output_cost_per_token_batches": 0.2e-6,
"cache_read_input_token_cost": 0.025e-6,
"litellm_provider": "azure",
"mode": "chat",
"supported_endpoints": ["/v1/chat/completions", "/v1/batch", "/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_native_streaming": true
},
"azure/gpt-4.1-nano-2025-04-14": {
"max_tokens": 32768,
"max_input_tokens": 1047576,
"max_output_tokens": 32768,
"input_cost_per_token": 0.1e-6,
"output_cost_per_token": 0.4e-6,
"input_cost_per_token_batches": 0.05e-6,
"output_cost_per_token_batches": 0.2e-6,
"cache_read_input_token_cost": 0.025e-6,
"litellm_provider": "azure",
"mode": "chat",
"supported_endpoints": ["/v1/chat/completions", "/v1/batch", "/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_native_streaming": true
},
"azure/o3": {
"max_tokens": 100000,
"max_input_tokens": 200000,
"max_output_tokens": 100000,
"input_cost_per_token": 1e-5,
"output_cost_per_token": 4e-5,
"cache_read_input_token_cost": 2.5e-6,
"litellm_provider": "azure",
"mode": "chat",
"supported_endpoints": ["/v1/chat/completions", "/v1/batch", "/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_parallel_function_calling": false,
"supports_vision": true,
"supports_prompt_caching": true,
"supports_response_schema": true,
"supports_reasoning": true,
"supports_tool_choice": true
},
"azure/o3-2025-04-16": {
"max_tokens": 100000,
"max_input_tokens": 200000,
"max_output_tokens": 100000,
"input_cost_per_token": 1e-5,
"output_cost_per_token": 4e-5,
"cache_read_input_token_cost": 2.5e-6,
"litellm_provider": "azure",
"mode": "chat",
"supported_endpoints": ["/v1/chat/completions", "/v1/batch", "/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_parallel_function_calling": false,
"supports_vision": true,
"supports_prompt_caching": true,
"supports_response_schema": true,
"supports_reasoning": true,
"supports_tool_choice": true
},
"azure/o4-mini": {
"max_tokens": 100000,
"max_input_tokens": 200000,
"max_output_tokens": 100000,
"input_cost_per_token": 1.1e-6,
"output_cost_per_token": 4.4e-6,
"cache_read_input_token_cost": 2.75e-7,
"litellm_provider": "azure",
"mode": "chat",
"supported_endpoints": ["/v1/chat/completions", "/v1/batch", "/v1/responses"],
"supported_modalities": ["text", "image"],
"supported_output_modalities": ["text"],
"supports_function_calling": true,
"supports_parallel_function_calling": false,
"supports_vision": true,
"supports_prompt_caching": true,
"supports_response_schema": true,
"supports_reasoning": true,
"supports_tool_choice": true
},
"azure/gpt-4o-mini-realtime-preview-2024-12-17": {
"max_tokens": 4096,
"max_input_tokens": 128000,
@ -5178,9 +5409,10 @@
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_audio_token": 0.0000001,
"input_cost_per_token": 0.00000015,
"output_cost_per_token": 0.00000060,
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"litellm_provider": "gemini",
"mode": "chat",
"rpm": 10,
@ -5188,9 +5420,39 @@
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_audio_output": false,
"supports_tool_choice": true,
"supported_endpoints": ["/v1/chat/completions", "/v1/completions"],
"supported_modalities": ["text", "image", "audio", "video"],
"supported_output_modalities": ["text"],
"source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview"
},
"gemini-2.5-flash-preview-04-17": {
"max_tokens": 65536,
"max_input_tokens": 1048576,
"max_output_tokens": 65536,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_reasoning": true,
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_response_schema": true,
"supports_audio_output": false,
"supports_tool_choice": true,
"supported_endpoints": ["/v1/chat/completions", "/v1/completions", "/v1/batch"],
"supported_modalities": ["text", "image", "audio", "video"],
"supported_output_modalities": ["text"],
"source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview"
@ -5269,6 +5531,35 @@
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash",
"supports_tool_choice": true
},
"gemini-2.5-pro-preview-03-25": {
"max_tokens": 65536,
"max_input_tokens": 1048576,
"max_output_tokens": 65536,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_audio_token": 0.00000125,
"input_cost_per_token": 0.00000125,
"input_cost_per_token_above_200k_tokens": 0.0000025,
"output_cost_per_token": 0.00001,
"output_cost_per_token_above_200k_tokens": 0.000015,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_reasoning": true,
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_response_schema": true,
"supports_audio_output": false,
"supports_tool_choice": true,
"supported_endpoints": ["/v1/chat/completions", "/v1/completions", "/v1/batch"],
"supported_modalities": ["text", "image", "audio", "video"],
"supported_output_modalities": ["text"],
"source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview"
},
"gemini/gemini-2.0-pro-exp-02-05": {
"max_tokens": 8192,
"max_input_tokens": 2097152,

@ -0,0 +1 @@
Subproject commit bf0485467c343957ba5c217db777f407b2e65453

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[461],{8672:function(e,t,n){Promise.resolve().then(n.bind(n,12011))},12011:function(e,t,n){"use strict";n.r(t),n.d(t,{default:function(){return S}});var s=n(57437),o=n(2265),a=n(99376),c=n(20831),i=n(94789),l=n(12514),r=n(49804),u=n(67101),m=n(84264),d=n(49566),h=n(96761),x=n(84566),p=n(19250),f=n(14474),k=n(13634),g=n(73002),j=n(3914);function S(){let[e]=k.Z.useForm(),t=(0,a.useSearchParams)();(0,j.e)("token");let n=t.get("invitation_id"),[S,w]=(0,o.useState)(null),[Z,_]=(0,o.useState)(""),[N,b]=(0,o.useState)(""),[T,y]=(0,o.useState)(null),[E,v]=(0,o.useState)(""),[C,U]=(0,o.useState)("");return(0,o.useEffect)(()=>{n&&(0,p.W_)(n).then(e=>{let t=e.login_url;console.log("login_url:",t),v(t);let n=e.token,s=(0,f.o)(n);U(n),console.log("decoded:",s),w(s.key),console.log("decoded user email:",s.user_email),b(s.user_email),y(s.user_id)})},[n]),(0,s.jsx)("div",{className:"mx-auto w-full max-w-md mt-10",children:(0,s.jsxs)(l.Z,{children:[(0,s.jsx)(h.Z,{className:"text-sm mb-5 text-center",children:"\uD83D\uDE85 LiteLLM"}),(0,s.jsx)(h.Z,{className:"text-xl",children:"Sign up"}),(0,s.jsx)(m.Z,{children:"Claim your user account to login to Admin UI."}),(0,s.jsx)(i.Z,{className:"mt-4",title:"SSO",icon:x.GH$,color:"sky",children:(0,s.jsxs)(u.Z,{numItems:2,className:"flex justify-between items-center",children:[(0,s.jsx)(r.Z,{children:"SSO is under the Enterprise Tier."}),(0,s.jsx)(r.Z,{children:(0,s.jsx)(c.Z,{variant:"primary",className:"mb-2",children:(0,s.jsx)("a",{href:"https://forms.gle/W3U4PZpJGFHWtHyA9",target:"_blank",children:"Get Free Trial"})})})]})}),(0,s.jsxs)(k.Z,{className:"mt-10 mb-5 mx-auto",layout:"vertical",onFinish:e=>{console.log("in handle submit. 
accessToken:",S,"token:",C,"formValues:",e),S&&C&&(e.user_email=N,T&&n&&(0,p.m_)(S,n,T,e.password).then(e=>{let t="/ui/";t+="?login=success",document.cookie="token="+C,console.log("redirecting to:",t),window.location.href=t}))},children:[(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(k.Z.Item,{label:"Email Address",name:"user_email",children:(0,s.jsx)(d.Z,{type:"email",disabled:!0,value:N,defaultValue:N,className:"max-w-md"})}),(0,s.jsx)(k.Z.Item,{label:"Password",name:"password",rules:[{required:!0,message:"password required to sign up"}],help:"Create a password for your account",children:(0,s.jsx)(d.Z,{placeholder:"",type:"password",className:"max-w-md"})})]}),(0,s.jsx)("div",{className:"mt-10",children:(0,s.jsx)(g.ZP,{htmlType:"submit",children:"Sign Up"})})]})]})})}},3914:function(e,t,n){"use strict";function s(){let e=window.location.hostname,t=["Lax","Strict","None"];["/","/ui"].forEach(n=>{document.cookie="token=; expires=Thu, 01 Jan 1970 00:00:00 UTC; path=".concat(n,";"),document.cookie="token=; expires=Thu, 01 Jan 1970 00:00:00 UTC; path=".concat(n,"; domain=").concat(e,";"),t.forEach(t=>{let s="None"===t?" Secure;":"";document.cookie="token=; expires=Thu, 01 Jan 1970 00:00:00 UTC; path=".concat(n,"; SameSite=").concat(t,";").concat(s),document.cookie="token=; expires=Thu, 01 Jan 1970 00:00:00 UTC; path=".concat(n,"; domain=").concat(e,"; SameSite=").concat(t,";").concat(s)})}),console.log("After clearing cookies:",document.cookie)}function o(e){let t=document.cookie.split("; ").find(t=>t.startsWith(e+"="));return t?t.split("=")[1]:null}n.d(t,{b:function(){return s},e:function(){return o}})}},function(e){e.O(0,[665,42,899,250,971,117,744],function(){return e(e.s=8672)}),_N_E=e.O()}]);

View file

@ -1 +0,0 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[461],{8672:function(e,t,n){Promise.resolve().then(n.bind(n,12011))},12011:function(e,t,n){"use strict";n.r(t),n.d(t,{default:function(){return S}});var s=n(57437),o=n(2265),a=n(99376),i=n(20831),c=n(94789),l=n(12514),r=n(49804),u=n(67101),d=n(84264),m=n(49566),h=n(96761),x=n(84566),p=n(19250),f=n(14474),k=n(13634),j=n(73002),g=n(3914);function S(){let[e]=k.Z.useForm(),t=(0,a.useSearchParams)();(0,g.e)("token");let n=t.get("invitation_id"),[S,_]=(0,o.useState)(null),[w,Z]=(0,o.useState)(""),[N,b]=(0,o.useState)(""),[T,v]=(0,o.useState)(null),[y,E]=(0,o.useState)(""),[C,U]=(0,o.useState)("");return(0,o.useEffect)(()=>{n&&(0,p.W_)(n).then(e=>{let t=e.login_url;console.log("login_url:",t),E(t);let n=e.token,s=(0,f.o)(n);U(n),console.log("decoded:",s),_(s.key),console.log("decoded user email:",s.user_email),b(s.user_email),v(s.user_id)})},[n]),(0,s.jsx)("div",{className:"mx-auto w-full max-w-md mt-10",children:(0,s.jsxs)(l.Z,{children:[(0,s.jsx)(h.Z,{className:"text-sm mb-5 text-center",children:"\uD83D\uDE85 LiteLLM"}),(0,s.jsx)(h.Z,{className:"text-xl",children:"Sign up"}),(0,s.jsx)(d.Z,{children:"Claim your user account to login to Admin UI."}),(0,s.jsx)(c.Z,{className:"mt-4",title:"SSO",icon:x.GH$,color:"sky",children:(0,s.jsxs)(u.Z,{numItems:2,className:"flex justify-between items-center",children:[(0,s.jsx)(r.Z,{children:"SSO is under the Enterprise Tier."}),(0,s.jsx)(r.Z,{children:(0,s.jsx)(i.Z,{variant:"primary",className:"mb-2",children:(0,s.jsx)("a",{href:"https://forms.gle/W3U4PZpJGFHWtHyA9",target:"_blank",children:"Get Free Trial"})})})]})}),(0,s.jsxs)(k.Z,{className:"mt-10 mb-5 mx-auto",layout:"vertical",onFinish:e=>{console.log("in handle submit. 
accessToken:",S,"token:",C,"formValues:",e),S&&C&&(e.user_email=N,T&&n&&(0,p.m_)(S,n,T,e.password).then(e=>{var t;let n="/ui/";n+="?userID="+((null===(t=e.data)||void 0===t?void 0:t.user_id)||e.user_id),document.cookie="token="+C,console.log("redirecting to:",n),window.location.href=n}))},children:[(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(k.Z.Item,{label:"Email Address",name:"user_email",children:(0,s.jsx)(m.Z,{type:"email",disabled:!0,value:N,defaultValue:N,className:"max-w-md"})}),(0,s.jsx)(k.Z.Item,{label:"Password",name:"password",rules:[{required:!0,message:"password required to sign up"}],help:"Create a password for your account",children:(0,s.jsx)(m.Z,{placeholder:"",type:"password",className:"max-w-md"})})]}),(0,s.jsx)("div",{className:"mt-10",children:(0,s.jsx)(j.ZP,{htmlType:"submit",children:"Sign Up"})})]})]})})}},3914:function(e,t,n){"use strict";function s(){let e=window.location.hostname,t=["Lax","Strict","None"];["/","/ui"].forEach(n=>{document.cookie="token=; expires=Thu, 01 Jan 1970 00:00:00 UTC; path=".concat(n,";"),document.cookie="token=; expires=Thu, 01 Jan 1970 00:00:00 UTC; path=".concat(n,"; domain=").concat(e,";"),t.forEach(t=>{let s="None"===t?" Secure;":"";document.cookie="token=; expires=Thu, 01 Jan 1970 00:00:00 UTC; path=".concat(n,"; SameSite=").concat(t,";").concat(s),document.cookie="token=; expires=Thu, 01 Jan 1970 00:00:00 UTC; path=".concat(n,"; domain=").concat(e,"; SameSite=").concat(t,";").concat(s)})}),console.log("After clearing cookies:",document.cookie)}function o(e){let t=document.cookie.split("; ").find(t=>t.startsWith(e+"="));return t?t.split("=")[1]:null}n.d(t,{b:function(){return s},e:function(){return o}})}},function(e){e.O(0,[665,42,899,250,971,117,744],function(){return e(e.s=8672)}),_N_E=e.O()}]);

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-75a5453f51d60261.js"/><script src="/ui/_next/static/chunks/fd9d1056-524b80e1a6b8bb06.js" async=""></script><script src="/ui/_next/static/chunks/117-87ec698bfca6820e.js" async=""></script><script src="/ui/_next/static/chunks/main-app-4f7318ae681a6d94.js" async=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-42372ed130431b0a.js" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-75a5453f51d60261.js" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"style\"]\n3:HL[\"/ui/_next/static/css/be22292d8ac48764.css\",\"style\"]\n"])</script><script>self.__next_f.push([1,"4:I[12846,[],\"\"]\n6:I[19107,[],\"ClientPageRoot\"]\n7:I[94226,[\"665\",\"static/chunks/3014691f-0b72c78cfebbd712.js\",\"990\",\"static/chunks/13b76428-ebdf3012af0e4489.js\",\"42\",\"static/chunks/42-59f99bfbf676f282.js\",\"261\",\"static/chunks/261-57d48f76eec1e568.js\",\"899\",\"static/chunks/899-9af4feaf6f21839c.js\",\"875\",\"static/chunks/875-85b7d9e9afef48d5.js\",\"250\",\"static/chunks/250-7b7f46d48724f856.js\",\"699\",\"static/chunks/699-99a8a36b70ac90c1.js\",\"931\",\"static/chunks/app/page-1e545df8fad65452.js\"],\"default\",1]\n8:I[4707,[],\"\"]\n9:I[36423,[],\"\"]\nb:I[61060,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"0:[\"$\",\"$L4\",null,{\"buildId\":\"u3E41CAVE1NTuNPVcBvV
a\",\"assetPrefix\":\"/ui\",\"urlParts\":[\"\",\"\"],\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[[\"$L5\",[\"$\",\"$L6\",null,{\"props\":{\"params\":{},\"searchParams\":{}},\"Component\":\"$7\"}],null],null],null]},[[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}],[\"$\",\"link\",\"1\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/be22292d8ac48764.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}]],[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_cf7686\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 
0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[]}]}]}]],null],null],\"couldBeIntercepted\":false,\"initialHead\":[null,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-75a5453f51d60261.js"/><script src="/ui/_next/static/chunks/fd9d1056-524b80e1a6b8bb06.js" async=""></script><script src="/ui/_next/static/chunks/117-87ec698bfca6820e.js" async=""></script><script src="/ui/_next/static/chunks/main-app-4f7318ae681a6d94.js" async=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-42372ed130431b0a.js" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-75a5453f51d60261.js" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"style\"]\n3:HL[\"/ui/_next/static/css/3da1b0cfa7d4e161.css\",\"style\"]\n"])</script><script>self.__next_f.push([1,"4:I[12846,[],\"\"]\n6:I[19107,[],\"ClientPageRoot\"]\n7:I[25762,[\"665\",\"static/chunks/3014691f-0b72c78cfebbd712.js\",\"990\",\"static/chunks/13b76428-ebdf3012af0e4489.js\",\"42\",\"static/chunks/42-59f99bfbf676f282.js\",\"261\",\"static/chunks/261-57d48f76eec1e568.js\",\"899\",\"static/chunks/899-9af4feaf6f21839c.js\",\"860\",\"static/chunks/860-c1d8f124df444312.js\",\"250\",\"static/chunks/250-a927a558002d8fb9.js\",\"699\",\"static/chunks/699-99a8a36b70ac90c1.js\",\"931\",\"static/chunks/app/page-8f2fcc2af91a32fd.js\"],\"default\",1]\n8:I[4707,[],\"\"]\n9:I[36423,[],\"\"]\nb:I[61060,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"0:[\"$\",\"$L4\",null,{\"buildId\":\"FPIQgzUY81b7nl8zNun4
_\",\"assetPrefix\":\"/ui\",\"urlParts\":[\"\",\"\"],\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[[\"$L5\",[\"$\",\"$L6\",null,{\"props\":{\"params\":{},\"searchParams\":{}},\"Component\":\"$7\"}],null],null],null]},[[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}],[\"$\",\"link\",\"1\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/3da1b0cfa7d4e161.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}]],[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_cf7686\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 
0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[]}]}]}]],null],null],\"couldBeIntercepted\":false,\"initialHead\":[null,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script></body></html>

View file

@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[94226,["665","static/chunks/3014691f-0b72c78cfebbd712.js","990","static/chunks/13b76428-ebdf3012af0e4489.js","42","static/chunks/42-59f99bfbf676f282.js","261","static/chunks/261-57d48f76eec1e568.js","899","static/chunks/899-9af4feaf6f21839c.js","875","static/chunks/875-85b7d9e9afef48d5.js","250","static/chunks/250-7b7f46d48724f856.js","699","static/chunks/699-99a8a36b70ac90c1.js","931","static/chunks/app/page-1e545df8fad65452.js"],"default",1]
3:I[25762,["665","static/chunks/3014691f-0b72c78cfebbd712.js","990","static/chunks/13b76428-ebdf3012af0e4489.js","42","static/chunks/42-59f99bfbf676f282.js","261","static/chunks/261-57d48f76eec1e568.js","899","static/chunks/899-9af4feaf6f21839c.js","860","static/chunks/860-c1d8f124df444312.js","250","static/chunks/250-a927a558002d8fb9.js","699","static/chunks/699-99a8a36b70ac90c1.js","931","static/chunks/app/page-8f2fcc2af91a32fd.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["u3E41CAVE1NTuNPVcBvVa",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/be22292d8ac48764.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["FPIQgzUY81b7nl8zNun4_",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/3da1b0cfa7d4e161.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[52829,["42","static/chunks/42-59f99bfbf676f282.js","261","static/chunks/261-57d48f76eec1e568.js","250","static/chunks/250-7b7f46d48724f856.js","699","static/chunks/699-99a8a36b70ac90c1.js","418","static/chunks/app/model_hub/page-cde2fb783e81a6c1.js"],"default",1]
3:I[52829,["42","static/chunks/42-59f99bfbf676f282.js","261","static/chunks/261-57d48f76eec1e568.js","250","static/chunks/250-a927a558002d8fb9.js","699","static/chunks/699-99a8a36b70ac90c1.js","418","static/chunks/app/model_hub/page-cde2fb783e81a6c1.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["u3E41CAVE1NTuNPVcBvVa",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/be22292d8ac48764.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 
20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["FPIQgzUY81b7nl8zNun4_",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/3da1b0cfa7d4e161.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 
20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[12011,["665","static/chunks/3014691f-0b72c78cfebbd712.js","42","static/chunks/42-59f99bfbf676f282.js","899","static/chunks/899-9af4feaf6f21839c.js","250","static/chunks/250-7b7f46d48724f856.js","461","static/chunks/app/onboarding/page-82b2525e758a7201.js"],"default",1]
3:I[12011,["665","static/chunks/3014691f-0b72c78cfebbd712.js","42","static/chunks/42-59f99bfbf676f282.js","899","static/chunks/899-9af4feaf6f21839c.js","250","static/chunks/250-a927a558002d8fb9.js","461","static/chunks/app/onboarding/page-4f4c436bd23d48a0.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["u3E41CAVE1NTuNPVcBvVa",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/be22292d8ac48764.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid 
rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["FPIQgzUY81b7nl8zNun4_",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/3da1b0cfa7d4e161.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid 
rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -26,12 +26,14 @@ model_list:
model: azure/gpt-4.1
api_key: os.environ/AZURE_API_KEY_REALTIME
api_base: https://krris-m2f9a9i7-eastus2.openai.azure.com/
- model_name: "xai/*"
litellm_params:
model: xai/*
api_key: os.environ/XAI_API_KEY
litellm_settings:
num_retries: 0
callbacks: ["prometheus"]
callbacks: ["datadog_llm_observability"]
check_provider_endpoint: true
files_settings:

View file

@ -287,6 +287,7 @@ class LiteLLMRoutes(enum.Enum):
"/v1/models",
# token counter
"/utils/token_counter",
"/utils/transform_request",
# rerank
"/rerank",
"/v1/rerank",
@ -462,6 +463,7 @@ class LiteLLMRoutes(enum.Enum):
"/team/member_delete",
"/team/permissions_list",
"/team/permissions_update",
"/team/daily/activity",
"/model/new",
"/model/update",
"/model/delete",
@ -650,9 +652,9 @@ class GenerateRequestBase(LiteLLMPydanticObjectBase):
allowed_cache_controls: Optional[list] = []
config: Optional[dict] = {}
permissions: Optional[dict] = {}
model_max_budget: Optional[dict] = (
{}
) # {"gpt-4": 5.0, "gpt-3.5-turbo": 5.0}, defaults to {}
model_max_budget: Optional[
dict
] = {} # {"gpt-4": 5.0, "gpt-3.5-turbo": 5.0}, defaults to {}
model_config = ConfigDict(protected_namespaces=())
model_rpm_limit: Optional[dict] = None
@ -685,6 +687,8 @@ class GenerateKeyResponse(KeyRequestBase):
token: Optional[str] = None
created_by: Optional[str] = None
updated_by: Optional[str] = None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
@model_validator(mode="before")
@classmethod
@ -911,12 +915,12 @@ class NewCustomerRequest(BudgetNewRequest):
alias: Optional[str] = None # human-friendly alias
blocked: bool = False # allow/disallow requests for this end-user
budget_id: Optional[str] = None # give either a budget_id or max_budget
allowed_model_region: Optional[AllowedModelRegion] = (
None # require all user requests to use models in this specific region
)
default_model: Optional[str] = (
None # if no equivalent model in allowed region - default all requests to this model
)
allowed_model_region: Optional[
AllowedModelRegion
] = None # require all user requests to use models in this specific region
default_model: Optional[
str
] = None # if no equivalent model in allowed region - default all requests to this model
@model_validator(mode="before")
@classmethod
@ -938,12 +942,12 @@ class UpdateCustomerRequest(LiteLLMPydanticObjectBase):
blocked: bool = False # allow/disallow requests for this end-user
max_budget: Optional[float] = None
budget_id: Optional[str] = None # give either a budget_id or max_budget
allowed_model_region: Optional[AllowedModelRegion] = (
None # require all user requests to use models in this specific region
)
default_model: Optional[str] = (
None # if no equivalent model in allowed region - default all requests to this model
)
allowed_model_region: Optional[
AllowedModelRegion
] = None # require all user requests to use models in this specific region
default_model: Optional[
str
] = None # if no equivalent model in allowed region - default all requests to this model
class DeleteCustomerRequest(LiteLLMPydanticObjectBase):
@ -1079,9 +1083,9 @@ class BlockKeyRequest(LiteLLMPydanticObjectBase):
class AddTeamCallback(LiteLLMPydanticObjectBase):
callback_name: str
callback_type: Optional[Literal["success", "failure", "success_and_failure"]] = (
"success_and_failure"
)
callback_type: Optional[
Literal["success", "failure", "success_and_failure"]
] = "success_and_failure"
callback_vars: Dict[str, str]
@model_validator(mode="before")
@ -1339,9 +1343,9 @@ class ConfigList(LiteLLMPydanticObjectBase):
stored_in_db: Optional[bool]
field_default_value: Any
premium_field: bool = False
nested_fields: Optional[List[FieldDetail]] = (
None # For nested dictionary or Pydantic fields
)
nested_fields: Optional[
List[FieldDetail]
] = None # For nested dictionary or Pydantic fields
class ConfigGeneralSettings(LiteLLMPydanticObjectBase):
@ -1609,9 +1613,9 @@ class LiteLLM_OrganizationMembershipTable(LiteLLMPydanticObjectBase):
budget_id: Optional[str] = None
created_at: datetime
updated_at: datetime
user: Optional[Any] = (
None # You might want to replace 'Any' with a more specific type if available
)
user: Optional[
Any
] = None # You might want to replace 'Any' with a more specific type if available
litellm_budget_table: Optional[LiteLLM_BudgetTable] = None
model_config = ConfigDict(protected_namespaces=())
@ -2359,9 +2363,9 @@ class TeamModelDeleteRequest(BaseModel):
# Organization Member Requests
class OrganizationMemberAddRequest(OrgMemberAddRequest):
organization_id: str
max_budget_in_organization: Optional[float] = (
None # Users max budget within the organization
)
max_budget_in_organization: Optional[
float
] = None # Users max budget within the organization
class OrganizationMemberDeleteRequest(MemberDeleteRequest):
@ -2550,9 +2554,9 @@ class ProviderBudgetResponse(LiteLLMPydanticObjectBase):
Maps provider names to their budget configs.
"""
providers: Dict[str, ProviderBudgetResponseObject] = (
{}
) # Dictionary mapping provider names to their budget configurations
providers: Dict[
str, ProviderBudgetResponseObject
] = {} # Dictionary mapping provider names to their budget configurations
class ProxyStateVariables(TypedDict):
@ -2680,9 +2684,9 @@ class LiteLLM_JWTAuth(LiteLLMPydanticObjectBase):
enforce_rbac: bool = False
roles_jwt_field: Optional[str] = None # v2 on role mappings
role_mappings: Optional[List[RoleMapping]] = None
object_id_jwt_field: Optional[str] = (
None # can be either user / team, inferred from the role mapping
)
object_id_jwt_field: Optional[
str
] = None # can be either user / team, inferred from the role mapping
scope_mappings: Optional[List[ScopeMapping]] = None
enforce_scope_based_access: bool = False
enforce_team_based_model_access: bool = False

View file

@ -88,7 +88,7 @@ async def common_checks(
9. Check if request body is safe
10. [OPTIONAL] Organization checks - is user_object.organization_id is set, run these checks
"""
_model = request_body.get("model", None)
_model: Optional[str] = cast(Optional[str], request_body.get("model", None))
# 1. If team is blocked
if team_object is not None and team_object.blocked is True:
@ -112,7 +112,7 @@ async def common_checks(
)
## 2.1 If user can call model (if personal key)
if team_object is None and user_object is not None:
if _model and team_object is None and user_object is not None:
await can_user_call_model(
model=_model,
llm_router=llm_router,
@ -644,6 +644,7 @@ async def get_user_object(
proxy_logging_obj: Optional[ProxyLogging] = None,
sso_user_id: Optional[str] = None,
user_email: Optional[str] = None,
check_db_only: Optional[bool] = None,
) -> Optional[LiteLLM_UserTable]:
"""
- Check if user id in proxy User Table
@ -655,12 +656,13 @@ async def get_user_object(
return None
# check if in cache
cached_user_obj = await user_api_key_cache.async_get_cache(key=user_id)
if cached_user_obj is not None:
if isinstance(cached_user_obj, dict):
return LiteLLM_UserTable(**cached_user_obj)
elif isinstance(cached_user_obj, LiteLLM_UserTable):
return cached_user_obj
if not check_db_only:
cached_user_obj = await user_api_key_cache.async_get_cache(key=user_id)
if cached_user_obj is not None:
if isinstance(cached_user_obj, dict):
return LiteLLM_UserTable(**cached_user_obj)
elif isinstance(cached_user_obj, LiteLLM_UserTable):
return cached_user_obj
# else, check db
if prisma_client is None:
raise Exception("No db connected")

View file

@ -199,9 +199,13 @@ class _ProxyDBLogger(CustomLogger):
except Exception as e:
error_msg = f"Error in tracking cost callback - {str(e)}\n Traceback:{traceback.format_exc()}"
model = kwargs.get("model", "")
metadata = kwargs.get("litellm_params", {}).get("metadata", {})
metadata = get_litellm_metadata_from_kwargs(kwargs=kwargs)
litellm_metadata = kwargs.get("litellm_params", {}).get(
"litellm_metadata", {}
)
old_metadata = kwargs.get("litellm_params", {}).get("metadata", {})
call_type = kwargs.get("call_type", "")
error_msg += f"\n Args to _PROXY_track_cost_callback\n model: {model}\n metadata: {metadata}\n call_type: {call_type}\n"
error_msg += f"\n Args to _PROXY_track_cost_callback\n model: {model}\n chosen_metadata: {metadata}\n litellm_metadata: {litellm_metadata}\n old_metadata: {old_metadata}\n call_type: {call_type}\n"
asyncio.create_task(
proxy_logging_obj.failed_tracking_alert(
error_message=error_msg,

View file

@ -433,14 +433,13 @@ class LiteLLMProxyRequestSetup:
) -> Optional[List[str]]:
tags = None
if llm_router and llm_router.enable_tag_filtering is True:
# Check request headers for tags
if "x-litellm-tags" in headers:
if isinstance(headers["x-litellm-tags"], str):
_tags = headers["x-litellm-tags"].split(",")
tags = [tag.strip() for tag in _tags]
elif isinstance(headers["x-litellm-tags"], list):
tags = headers["x-litellm-tags"]
# Check request headers for tags
if "x-litellm-tags" in headers:
if isinstance(headers["x-litellm-tags"], str):
_tags = headers["x-litellm-tags"].split(",")
tags = [tag.strip() for tag in _tags]
elif isinstance(headers["x-litellm-tags"], list):
tags = headers["x-litellm-tags"]
# Check request body for tags
if "tags" in data and isinstance(data["tags"], list):
tags = data["tags"]

View file

@ -1,5 +1,5 @@
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Set, Union
from fastapi import HTTPException, status
@ -39,6 +39,7 @@ def update_breakdown_metrics(
provider_metadata: Dict[str, Dict[str, Any]],
api_key_metadata: Dict[str, Dict[str, Any]],
entity_id_field: Optional[str] = None,
entity_metadata_field: Optional[Dict[str, dict]] = None,
) -> BreakdownMetrics:
"""Updates breakdown metrics for a single record using the existing update_metrics function"""
@ -74,7 +75,8 @@ def update_breakdown_metrics(
metadata=KeyMetadata(
key_alias=api_key_metadata.get(record.api_key, {}).get(
"key_alias", None
)
),
team_id=api_key_metadata.get(record.api_key, {}).get("team_id", None),
), # Add any api_key-specific metadata here
)
breakdown.api_keys[record.api_key].metrics = update_metrics(
@ -87,7 +89,10 @@ def update_breakdown_metrics(
if entity_value:
if entity_value not in breakdown.entities:
breakdown.entities[entity_value] = MetricWithMetadata(
metrics=SpendMetrics(), metadata={}
metrics=SpendMetrics(),
metadata=entity_metadata_field.get(entity_value, {})
if entity_metadata_field
else {},
)
breakdown.entities[entity_value].metrics = update_metrics(
breakdown.entities[entity_value].metrics, record
@ -96,17 +101,32 @@ def update_breakdown_metrics(
return breakdown
async def get_api_key_metadata(
    prisma_client: PrismaClient,
    api_keys: Set[str],
) -> Dict[str, Dict[str, Any]]:
    """Fetch display metadata for a set of API key tokens in a single query.

    Args:
        prisma_client: Client whose ``db.litellm_verificationtoken`` table is
            queried.
        api_keys: Token values to look up.

    Returns:
        A mapping of ``token -> {"key_alias": ..., "team_id": ...}`` with one
        entry per record returned by the query. Tokens for which the query
        returns no record do not appear in the mapping. An empty ``api_keys``
        yields ``{}`` without querying the database.
    """
    # Nothing to look up: skip the round-trip instead of sending an empty
    # ``IN ()`` filter to the database.
    if not api_keys:
        return {}

    key_records = await prisma_client.db.litellm_verificationtoken.find_many(
        where={"token": {"in": list(api_keys)}}
    )
    return {
        record.token: {"key_alias": record.key_alias, "team_id": record.team_id}
        for record in key_records
    }
async def get_daily_activity(
prisma_client: Optional[PrismaClient],
table_name: str,
entity_id_field: str,
entity_id: Optional[Union[str, List[str]]],
entity_metadata_field: Optional[Dict[str, dict]],
start_date: Optional[str],
end_date: Optional[str],
model: Optional[str],
api_key: Optional[str],
page: int,
page_size: int,
exclude_entity_ids: Optional[List[str]] = None,
) -> SpendAnalyticsPaginatedResponse:
"""Common function to get daily activity for any entity type."""
if prisma_client is None:
@ -134,11 +154,15 @@ async def get_daily_activity(
where_conditions["model"] = model
if api_key:
where_conditions["api_key"] = api_key
if entity_id:
if entity_id is not None:
if isinstance(entity_id, list):
where_conditions[entity_id_field] = {"in": entity_id}
else:
where_conditions[entity_id_field] = entity_id
if exclude_entity_ids:
where_conditions.setdefault(entity_id_field, {})["not"] = {
"in": exclude_entity_ids
}
# Get total count for pagination
total_count = await getattr(prisma_client.db, table_name).count(
@ -166,12 +190,7 @@ async def get_daily_activity(
model_metadata: Dict[str, Dict[str, Any]] = {}
provider_metadata: Dict[str, Dict[str, Any]] = {}
if api_keys:
key_records = await prisma_client.db.litellm_verificationtoken.find_many(
where={"token": {"in": list(api_keys)}}
)
api_key_metadata.update(
{k.token: {"key_alias": k.key_alias} for k in key_records}
)
api_key_metadata = await get_api_key_metadata(prisma_client, api_keys)
# Process results
results = []
@ -198,6 +217,7 @@ async def get_daily_activity(
provider_metadata,
api_key_metadata,
entity_id_field=entity_id_field,
entity_metadata_field=entity_metadata_field,
)
# Update total metrics

View file

@ -4,11 +4,19 @@ from litellm.proxy._types import (
GenerateKeyRequest,
LiteLLM_ManagementEndpoint_MetadataFields_Premium,
LiteLLM_TeamTable,
LitellmUserRoles,
UserAPIKeyAuth,
)
from litellm.proxy.utils import _premium_user_check
def _user_has_admin_view(user_api_key_dict: UserAPIKeyAuth) -> bool:
    """Return True when the caller's role is proxy admin or view-only proxy admin."""
    admin_view_roles = (
        LitellmUserRoles.PROXY_ADMIN,
        LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY,
    )
    return any(user_api_key_dict.user_role == role for role in admin_view_roles)
def _is_user_team_admin(
user_api_key_dict: UserAPIKeyAuth, team_obj: LiteLLM_TeamTable
) -> bool:

View file

@ -25,6 +25,8 @@ from litellm._logging import verbose_proxy_logger
from litellm.litellm_core_utils.duration_parser import duration_in_seconds
from litellm.proxy._types import *
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
from litellm.proxy.management_endpoints.common_daily_activity import get_daily_activity
from litellm.proxy.management_endpoints.common_utils import _user_has_admin_view
from litellm.proxy.management_endpoints.key_management_endpoints import (
generate_key_helper_fn,
prepare_metadata_fields,
@ -34,8 +36,6 @@ from litellm.proxy.management_helpers.utils import management_endpoint_wrapper
from litellm.proxy.utils import handle_exception_on_proxy
from litellm.types.proxy.management_endpoints.common_daily_activity import (
BreakdownMetrics,
DailySpendData,
DailySpendMetadata,
KeyMetadata,
KeyMetricWithMetadata,
LiteLLM_DailyUserSpend,
@ -43,6 +43,9 @@ from litellm.types.proxy.management_endpoints.common_daily_activity import (
SpendAnalyticsPaginatedResponse,
SpendMetrics,
)
from litellm.types.proxy.management_endpoints.internal_user_endpoints import (
UserListResponse,
)
router = APIRouter()
@ -899,15 +902,47 @@ async def get_user_key_counts(
return result
@router.get(
"/user/get_users",
tags=["Internal User management"],
dependencies=[Depends(user_api_key_auth)],
)
def _validate_sort_params(
sort_by: Optional[str], sort_order: str
) -> Optional[Dict[str, str]]:
order_by: Dict[str, str] = {}
if sort_by is None:
return None
# Validate sort_by is a valid column
valid_columns = [
"user_id",
"user_email",
"created_at",
"spend",
"user_alias",
"user_role",
]
if sort_by not in valid_columns:
raise HTTPException(
status_code=400,
detail={
"error": f"Invalid sort column. Must be one of: {', '.join(valid_columns)}"
},
)
# Validate sort_order
if sort_order.lower() not in ["asc", "desc"]:
raise HTTPException(
status_code=400,
detail={"error": "Invalid sort order. Must be 'asc' or 'desc'"},
)
order_by[sort_by] = sort_order.lower()
return order_by
@router.get(
"/user/list",
tags=["Internal User management"],
dependencies=[Depends(user_api_key_auth)],
response_model=UserListResponse,
)
async def get_users(
role: Optional[str] = fastapi.Query(
@ -916,15 +951,29 @@ async def get_users(
user_ids: Optional[str] = fastapi.Query(
default=None, description="Get list of users by user_ids"
),
sso_user_ids: Optional[str] = fastapi.Query(
default=None, description="Get list of users by sso_user_id"
),
user_email: Optional[str] = fastapi.Query(
default=None, description="Filter users by partial email match"
),
team: Optional[str] = fastapi.Query(
default=None, description="Filter users by team id"
),
page: int = fastapi.Query(default=1, ge=1, description="Page number"),
page_size: int = fastapi.Query(
default=25, ge=1, le=100, description="Number of items per page"
),
sort_by: Optional[str] = fastapi.Query(
default=None,
description="Column to sort by (e.g. 'user_id', 'user_email', 'created_at', 'spend')",
),
sort_order: str = fastapi.Query(
default="asc", description="Sort order ('asc' or 'desc')"
),
):
"""
Get a paginated list of users, optionally filtered by role.
Used by the UI to populate the user lists.
Get a paginated list of users with filtering and sorting options.
Parameters:
role: Optional[str]
@ -935,17 +984,20 @@ async def get_users(
- internal_user_viewer
user_ids: Optional[str]
Get list of users by user_ids. Comma separated list of user_ids.
sso_ids: Optional[str]
Get list of users by sso_ids. Comma separated list of sso_ids.
user_email: Optional[str]
Filter users by partial email match
team: Optional[str]
Filter users by team id. Will match if user has this team in their teams array.
page: int
The page number to return
page_size: int
The number of items per page
Currently - admin-only endpoint.
Example curl:
```
http://0.0.0.0:4000/user/list?user_ids=default_user_id,693c1a4a-1cc0-4c7c-afe8-b5d2c8d52e17
```
sort_by: Optional[str]
Column to sort by (e.g. 'user_id', 'user_email', 'created_at', 'spend')
sort_order: Optional[str]
Sort order ('asc' or 'desc')
"""
from litellm.proxy.proxy_server import prisma_client
@ -958,35 +1010,57 @@ async def get_users(
# Calculate skip and take for pagination
skip = (page - 1) * page_size
# Prepare the query conditions
# Build where conditions based on provided parameters
where_conditions: Dict[str, Any] = {}
if role:
where_conditions["user_role"] = {
"contains": role,
"mode": "insensitive", # Case-insensitive search
}
where_conditions["user_role"] = role # Exact match instead of contains
if user_ids and isinstance(user_ids, str):
user_id_list = [uid.strip() for uid in user_ids.split(",") if uid.strip()]
where_conditions["user_id"] = {
"in": user_id_list, # Now passing a list of strings as required by Prisma
"in": user_id_list,
}
users: Optional[
List[LiteLLM_UserTable]
] = await prisma_client.db.litellm_usertable.find_many(
if user_email is not None and isinstance(user_email, str):
where_conditions["user_email"] = {
"contains": user_email,
"mode": "insensitive", # Case-insensitive search
}
if team is not None and isinstance(team, str):
where_conditions["teams"] = {
"has": team # Array contains for string arrays in Prisma
}
if sso_user_ids is not None and isinstance(sso_user_ids, str):
sso_id_list = [sid.strip() for sid in sso_user_ids.split(",") if sid.strip()]
where_conditions["sso_user_id"] = {
"in": sso_id_list,
}
## Filter any none fastapi.Query params - e.g. where_conditions: {'user_email': {'contains': Query(None), 'mode': 'insensitive'}, 'teams': {'has': Query(None)}}
where_conditions = {k: v for k, v in where_conditions.items() if v is not None}
# Build order_by conditions
order_by: Optional[Dict[str, str]] = (
_validate_sort_params(sort_by, sort_order)
if sort_by is not None and isinstance(sort_by, str)
else None
)
users = await prisma_client.db.litellm_usertable.find_many(
where=where_conditions,
skip=skip,
take=page_size,
order={"created_at": "desc"},
order=order_by
if order_by
else {"created_at": "desc"}, # Default to created_at desc if no sort specified
)
# Get total count of user rows
total_count = await prisma_client.db.litellm_usertable.count(
where=where_conditions # type: ignore
)
total_count = await prisma_client.db.litellm_usertable.count(where=where_conditions)
# Get key count for each user
if users is not None:
@ -1009,7 +1083,7 @@ async def get_users(
LiteLLM_UserTableWithKeyCount(
**user.model_dump(), key_count=user_key_counts.get(user.user_id, 0)
)
) # Return full key object
)
else:
user_list = []
@ -1382,136 +1456,22 @@ async def get_user_daily_activity(
)
try:
# Build filter conditions
where_conditions: Dict[str, Any] = {
"date": {
"gte": start_date,
"lte": end_date,
}
}
entity_id: Optional[str] = None
if not _user_has_admin_view(user_api_key_dict):
entity_id = user_api_key_dict.user_id
if model:
where_conditions["model"] = model
if api_key:
where_conditions["api_key"] = api_key
if (
user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN
and user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY
):
where_conditions[
"user_id"
] = user_api_key_dict.user_id # only allow access to own data
# Get total count for pagination
total_count = await prisma_client.db.litellm_dailyuserspend.count(
where=where_conditions
)
# Fetch paginated results
daily_spend_data = await prisma_client.db.litellm_dailyuserspend.find_many(
where=where_conditions,
order=[
{"date": "desc"},
],
skip=(page - 1) * page_size,
take=page_size,
)
daily_spend_data_pydantic_list = [
LiteLLM_DailyUserSpend(**record.model_dump()) for record in daily_spend_data
]
# Get all unique API keys from the spend data
api_keys = set()
for record in daily_spend_data_pydantic_list:
if record.api_key:
api_keys.add(record.api_key)
# Fetch key aliases in bulk
api_key_metadata: Dict[str, Dict[str, Any]] = {}
model_metadata: Dict[str, Dict[str, Any]] = {}
provider_metadata: Dict[str, Dict[str, Any]] = {}
if api_keys:
key_records = await prisma_client.db.litellm_verificationtoken.find_many(
where={"token": {"in": list(api_keys)}}
)
api_key_metadata.update(
{k.token: {"key_alias": k.key_alias} for k in key_records}
)
# Process results
results = []
total_metrics = SpendMetrics()
# Group data by date and other dimensions
grouped_data: Dict[str, Dict[str, Any]] = {}
for record in daily_spend_data_pydantic_list:
date_str = record.date
if date_str not in grouped_data:
grouped_data[date_str] = {
"metrics": SpendMetrics(),
"breakdown": BreakdownMetrics(),
}
# Update metrics
grouped_data[date_str]["metrics"] = update_metrics(
grouped_data[date_str]["metrics"], record
)
# Update breakdowns
grouped_data[date_str]["breakdown"] = update_breakdown_metrics(
grouped_data[date_str]["breakdown"],
record,
model_metadata,
provider_metadata,
api_key_metadata,
)
# Update total metrics
total_metrics.spend += record.spend
total_metrics.prompt_tokens += record.prompt_tokens
total_metrics.completion_tokens += record.completion_tokens
total_metrics.total_tokens += (
record.prompt_tokens + record.completion_tokens
)
total_metrics.cache_read_input_tokens += record.cache_read_input_tokens
total_metrics.cache_creation_input_tokens += (
record.cache_creation_input_tokens
)
total_metrics.api_requests += record.api_requests
total_metrics.successful_requests += record.successful_requests
total_metrics.failed_requests += record.failed_requests
# Convert grouped data to response format
for date_str, data in grouped_data.items():
results.append(
DailySpendData(
date=datetime.strptime(date_str, "%Y-%m-%d").date(),
metrics=data["metrics"],
breakdown=data["breakdown"],
)
)
# Sort results by date
results.sort(key=lambda x: x.date, reverse=True)
return SpendAnalyticsPaginatedResponse(
results=results,
metadata=DailySpendMetadata(
total_spend=total_metrics.spend,
total_prompt_tokens=total_metrics.prompt_tokens,
total_completion_tokens=total_metrics.completion_tokens,
total_tokens=total_metrics.total_tokens,
total_api_requests=total_metrics.api_requests,
total_successful_requests=total_metrics.successful_requests,
total_failed_requests=total_metrics.failed_requests,
total_cache_read_input_tokens=total_metrics.cache_read_input_tokens,
total_cache_creation_input_tokens=total_metrics.cache_creation_input_tokens,
page=page,
total_pages=-(-total_count // page_size), # Ceiling division
has_more=(page * page_size) < total_count,
),
return await get_daily_activity(
prisma_client=prisma_client,
table_name="litellm_dailyuserspend",
entity_id_field="user_id",
entity_id=entity_id,
entity_metadata_field=None,
start_date=start_date,
end_date=end_date,
model=model,
api_key=api_key,
page=page,
page_size=page_size,
)
except Exception as e:

View file

@ -577,12 +577,16 @@ async def generate_key_fn( # noqa: PLR0915
request_type="key", **data_json, table_name="key"
)
response["soft_budget"] = (
data.soft_budget
) # include the user-input soft budget in the response
response[
"soft_budget"
] = data.soft_budget # include the user-input soft budget in the response
response = GenerateKeyResponse(**response)
response.token = (
response.token_id
) # remap token to use the hash, and leave the key in the `key` field [TODO]: clean up generate_key_helper_fn to do this
asyncio.create_task(
KeyManagementEventHooks.async_key_generated_hook(
data=data,
@ -1343,10 +1347,13 @@ async def generate_key_helper_fn( # noqa: PLR0915
create_key_response = await prisma_client.insert_data(
data=key_data, table_name="key"
)
key_data["token_id"] = getattr(create_key_response, "token", None)
key_data["litellm_budget_table"] = getattr(
create_key_response, "litellm_budget_table", None
)
key_data["created_at"] = getattr(create_key_response, "created_at", None)
key_data["updated_at"] = getattr(create_key_response, "updated_at", None)
except Exception as e:
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.generate_key_helper_fn(): Exception occured - {}".format(
@ -1470,10 +1477,10 @@ async def delete_verification_tokens(
try:
if prisma_client:
tokens = [_hash_token_if_needed(token=key) for key in tokens]
_keys_being_deleted: List[LiteLLM_VerificationToken] = (
await prisma_client.db.litellm_verificationtoken.find_many(
where={"token": {"in": tokens}}
)
_keys_being_deleted: List[
LiteLLM_VerificationToken
] = await prisma_client.db.litellm_verificationtoken.find_many(
where={"token": {"in": tokens}}
)
# Assuming 'db' is your Prisma Client instance
@ -1575,9 +1582,9 @@ async def _rotate_master_key(
from litellm.proxy.proxy_server import proxy_config
try:
models: Optional[List] = (
await prisma_client.db.litellm_proxymodeltable.find_many()
)
models: Optional[
List
] = await prisma_client.db.litellm_proxymodeltable.find_many()
except Exception:
models = None
# 2. process model table
@ -1864,11 +1871,11 @@ async def validate_key_list_check(
param="user_id",
code=status.HTTP_403_FORBIDDEN,
)
complete_user_info_db_obj: Optional[BaseModel] = (
await prisma_client.db.litellm_usertable.find_unique(
where={"user_id": user_api_key_dict.user_id},
include={"organization_memberships": True},
)
complete_user_info_db_obj: Optional[
BaseModel
] = await prisma_client.db.litellm_usertable.find_unique(
where={"user_id": user_api_key_dict.user_id},
include={"organization_memberships": True},
)
if complete_user_info_db_obj is None:
@ -1929,10 +1936,10 @@ async def get_admin_team_ids(
if complete_user_info is None:
return []
# Get all teams that user is an admin of
teams: Optional[List[BaseModel]] = (
await prisma_client.db.litellm_teamtable.find_many(
where={"team_id": {"in": complete_user_info.teams}}
)
teams: Optional[
List[BaseModel]
] = await prisma_client.db.litellm_teamtable.find_many(
where={"team_id": {"in": complete_user_info.teams}}
)
if teams is None:
return []

View file

@ -12,7 +12,7 @@ All /tag management endpoints
import datetime
import json
from typing import Dict, Optional
from typing import Dict, List, Optional
from fastapi import APIRouter, Depends, HTTPException
@ -25,6 +25,7 @@ from litellm.proxy.management_endpoints.common_daily_activity import (
get_daily_activity,
)
from litellm.types.tag_management import (
LiteLLM_DailyTagSpendTable,
TagConfig,
TagDeleteRequest,
TagInfoRequest,
@ -301,6 +302,7 @@ async def info_tag(
"/tag/list",
tags=["tag management"],
dependencies=[Depends(user_api_key_auth)],
response_model=List[TagConfig],
)
async def list_tags(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
@ -314,9 +316,33 @@ async def list_tags(
raise HTTPException(status_code=500, detail="Database not connected")
try:
## QUERY STORED TAGS ##
tags_config = await _get_tags_config(prisma_client)
list_of_tags = list(tags_config.values())
return list_of_tags
## QUERY DYNAMIC TAGS ##
dynamic_tags = await prisma_client.db.litellm_dailytagspend.find_many(
distinct=["tag"],
)
dynamic_tags_list = [
LiteLLM_DailyTagSpendTable(**dynamic_tag.model_dump())
for dynamic_tag in dynamic_tags
]
dynamic_tag_config = [
TagConfig(
name=tag.tag,
description="This is just a spend tag that was passed dynamically in a request. It does not control any LLM models.",
models=None,
created_at=tag.created_at.isoformat(),
updated_at=tag.updated_at.isoformat(),
)
for tag in dynamic_tags_list
if tag.tag not in tags_config
]
return list_of_tags + dynamic_tag_config
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@ -400,6 +426,7 @@ async def get_tag_daily_activity(
table_name="litellm_dailytagspend",
entity_id_field="tag",
entity_id=tag_list,
entity_metadata_field=None,
start_date=start_date,
end_date=end_date,
model=model,

View file

@ -56,11 +56,13 @@ from litellm.proxy._types import (
from litellm.proxy.auth.auth_checks import (
allowed_route_check_inside_route,
get_team_object,
get_user_object,
)
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
from litellm.proxy.management_endpoints.common_utils import (
_is_user_team_admin,
_set_object_metadata_field,
_user_has_admin_view,
)
from litellm.proxy.management_endpoints.tag_management_endpoints import (
get_daily_activity,
@ -2091,7 +2093,6 @@ async def update_team_member_permissions(
"/team/daily/activity",
response_model=SpendAnalyticsPaginatedResponse,
tags=["team management"],
dependencies=[Depends(user_api_key_auth)],
)
async def get_team_daily_activity(
team_ids: Optional[str] = None,
@ -2101,6 +2102,8 @@ async def get_team_daily_activity(
api_key: Optional[str] = None,
page: int = 1,
page_size: int = 10,
exclude_team_ids: Optional[str] = None,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Get daily activity for specific teams or all teams.
@ -2113,20 +2116,82 @@ async def get_team_daily_activity(
api_key (Optional[str]): Filter by API key.
page (int): Page number for pagination.
page_size (int): Number of items per page.
exclude_team_ids (Optional[str]): Comma-separated list of team IDs to exclude.
Returns:
SpendAnalyticsPaginatedResponse: Paginated response containing daily activity data.
"""
from litellm.proxy.proxy_server import prisma_client
from litellm.proxy.proxy_server import (
prisma_client,
proxy_logging_obj,
user_api_key_cache,
)
if prisma_client is None:
raise HTTPException(
status_code=500,
detail={"error": CommonProxyErrors.db_not_connected_error.value},
)
# Convert comma-separated team IDs string to list if provided
team_ids_list = team_ids.split(",") if team_ids else None
exclude_team_ids_list: Optional[List[str]] = None
if exclude_team_ids:
exclude_team_ids_list = (
exclude_team_ids.split(",") if exclude_team_ids else None
)
if not _user_has_admin_view(user_api_key_dict):
user_info = await get_user_object(
user_id=user_api_key_dict.user_id,
prisma_client=prisma_client,
user_id_upsert=False,
user_api_key_cache=user_api_key_cache,
parent_otel_span=user_api_key_dict.parent_otel_span,
proxy_logging_obj=proxy_logging_obj,
check_db_only=True,
)
if user_info is None:
raise HTTPException(
status_code=404,
detail={
"error": "User= {} not found".format(user_api_key_dict.user_id)
},
)
if team_ids_list is None:
team_ids_list = user_info.teams
else:
# check if all team_ids are in user_info.teams
for team_id in team_ids_list:
if team_id not in user_info.teams:
raise HTTPException(
status_code=404,
detail={
"error": "User does not belong to Team= {}. Call `/user/info` to see user's teams".format(
team_id
)
},
)
## Fetch team aliases
where_condition = {}
if team_ids_list:
where_condition["team_id"] = {"in": list(team_ids_list)}
team_aliases = await prisma_client.db.litellm_teamtable.find_many(
where=where_condition
)
team_alias_metadata = {
t.team_id: {"team_alias": t.team_alias} for t in team_aliases
}
return await get_daily_activity(
prisma_client=prisma_client,
table_name="litellm_dailyteamspend",
entity_id_field="team_id",
entity_id=team_ids_list,
entity_metadata_field=team_alias_metadata,
exclude_entity_ids=exclude_team_ids_list,
start_date=start_date,
end_date=end_date,
model=model,

View file

@ -553,7 +553,7 @@ async def auth_callback(request: Request): # noqa: PLR0915
algorithm="HS256",
)
if user_id is not None and isinstance(user_id, str):
litellm_dashboard_ui += "?userID=" + user_id
litellm_dashboard_ui += "?login=success"
redirect_response = RedirectResponse(url=litellm_dashboard_ui, status_code=303)
redirect_response.set_cookie(key="token", value=jwt_token, secure=True)
return redirect_response
@ -592,9 +592,9 @@ async def insert_sso_user(
if user_defined_values.get("max_budget") is None:
user_defined_values["max_budget"] = litellm.max_internal_user_budget
if user_defined_values.get("budget_duration") is None:
user_defined_values["budget_duration"] = (
litellm.internal_user_budget_duration
)
user_defined_values[
"budget_duration"
] = litellm.internal_user_budget_duration
if user_defined_values["user_role"] is None:
user_defined_values["user_role"] = LitellmUserRoles.INTERNAL_USER_VIEW_ONLY
@ -787,9 +787,9 @@ class SSOAuthenticationHandler:
if state:
redirect_params["state"] = state
elif "okta" in generic_authorization_endpoint:
redirect_params["state"] = (
uuid.uuid4().hex
) # set state param for okta - required
redirect_params[
"state"
] = uuid.uuid4().hex # set state param for okta - required
return await generic_sso.get_login_redirect(**redirect_params) # type: ignore
raise ValueError(
"Unknown SSO provider. Please setup SSO with client IDs https://docs.litellm.ai/docs/proxy/admin_ui_sso"
@ -1023,7 +1023,7 @@ class MicrosoftSSOHandler:
original_msft_result = (
await microsoft_sso.verify_and_process(
request=request,
convert_response=False,
convert_response=False, # type: ignore
)
or {}
)
@ -1034,9 +1034,9 @@ class MicrosoftSSOHandler:
# if user is trying to get the raw sso response for debugging, return the raw sso response
if return_raw_sso_response:
original_msft_result[MicrosoftSSOHandler.GRAPH_API_RESPONSE_KEY] = (
user_team_ids
)
original_msft_result[
MicrosoftSSOHandler.GRAPH_API_RESPONSE_KEY
] = user_team_ids
return original_msft_result or {}
result = MicrosoftSSOHandler.openid_from_response(
@ -1086,12 +1086,13 @@ class MicrosoftSSOHandler:
service_principal_group_ids: Optional[List[str]] = []
service_principal_teams: Optional[List[MicrosoftServicePrincipalTeam]] = []
if service_principal_id:
service_principal_group_ids, service_principal_teams = (
await MicrosoftSSOHandler.get_group_ids_from_service_principal(
service_principal_id=service_principal_id,
async_client=async_client,
access_token=access_token,
)
(
service_principal_group_ids,
service_principal_teams,
) = await MicrosoftSSOHandler.get_group_ids_from_service_principal(
service_principal_id=service_principal_id,
async_client=async_client,
access_token=access_token,
)
verbose_proxy_logger.debug(
f"Service principal group IDs: {service_principal_group_ids}"
@ -1103,9 +1104,9 @@ class MicrosoftSSOHandler:
# Fetch user membership from Microsoft Graph API
all_group_ids = []
next_link: Optional[str] = (
MicrosoftSSOHandler.graph_api_user_groups_endpoint
)
next_link: Optional[
str
] = MicrosoftSSOHandler.graph_api_user_groups_endpoint
auth_headers = {"Authorization": f"Bearer {access_token}"}
page_count = 0
@ -1304,7 +1305,7 @@ class GoogleSSOHandler:
return (
await google_sso.verify_and_process(
request=request,
convert_response=False,
convert_response=False, # type: ignore
)
or {}
)

Some files were not shown because too many files have changed in this diff Show more