Compare commits

..

No commits in common. "main" and "v1.66.3-nightly" have entirely different histories.

427 changed files with 5249 additions and 18004 deletions

View file

@ -20,8 +20,6 @@ REPLICATE_API_TOKEN = ""
ANTHROPIC_API_KEY = "" ANTHROPIC_API_KEY = ""
# Infisical # Infisical
INFISICAL_TOKEN = "" INFISICAL_TOKEN = ""
# INFINITY
INFINITY_API_KEY = ""
# Development Configs # Development Configs
LITELLM_MASTER_KEY = "sk-1234" LITELLM_MASTER_KEY = "sk-1234"

3
.gitignore vendored
View file

@ -86,6 +86,3 @@ litellm/proxy/db/migrations/0_init/migration.sql
litellm/proxy/db/migrations/* litellm/proxy/db/migrations/*
litellm/proxy/migrations/*config.yaml litellm/proxy/migrations/*config.yaml
litellm/proxy/migrations/* litellm/proxy/migrations/*
config.yaml
tests/litellm/litellm_core_utils/llm_cost_calc/log.txt
tests/test_custom_dir/*

View file

@ -32,7 +32,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
@ -110,7 +110,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "histogram_quantile(0.99, sum(rate(litellm_self_latency_bucket{self=\"self\"}[1m])) by (le))", "expr": "histogram_quantile(0.99, sum(rate(litellm_self_latency_bucket{self=\"self\"}[1m])) by (le))",
@ -125,7 +125,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
@ -216,7 +216,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "sum(increase(litellm_spend_metric_total[30d])) by (hashed_api_key)", "expr": "sum(increase(litellm_spend_metric_total[30d])) by (hashed_api_key)",
@ -232,7 +232,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
@ -309,7 +309,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "sum by (model) (increase(litellm_requests_metric_total[5m]))", "expr": "sum by (model) (increase(litellm_requests_metric_total[5m]))",
@ -324,7 +324,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
@ -375,7 +375,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "sum(increase(litellm_llm_api_failed_requests_metric_total[1h]))", "expr": "sum(increase(litellm_llm_api_failed_requests_metric_total[1h]))",
@ -390,7 +390,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
@ -468,7 +468,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "sum(increase(litellm_spend_metric_total[30d])) by (model)", "expr": "sum(increase(litellm_spend_metric_total[30d])) by (model)",
@ -483,7 +483,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
@ -560,7 +560,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "rMzWaBvIk"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "sum(increase(litellm_total_tokens_total[5m])) by (model)", "expr": "sum(increase(litellm_total_tokens_total[5m])) by (model)",
@ -579,27 +579,7 @@
"style": "dark", "style": "dark",
"tags": [], "tags": [],
"templating": { "templating": {
"list": [ "list": []
{
"current": {
"selected": false,
"text": "prometheus",
"value": "edx8memhpd9tsa"
},
"hide": 0,
"includeAll": false,
"label": "datasource",
"multi": false,
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
}
]
}, },
"time": { "time": {
"from": "now-1h", "from": "now-1h",

View file

@ -37,7 +37,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"description": "Total requests per second made to proxy - success + failure ", "description": "Total requests per second made to proxy - success + failure ",
"fieldConfig": { "fieldConfig": {
@ -119,7 +119,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
@ -138,7 +138,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"description": "Failures per second by Exception Class", "description": "Failures per second by Exception Class",
"fieldConfig": { "fieldConfig": {
@ -220,7 +220,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
@ -239,7 +239,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"description": "Average Response latency (seconds)", "description": "Average Response latency (seconds)",
"fieldConfig": { "fieldConfig": {
@ -346,7 +346,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
@ -361,7 +361,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "histogram_quantile(0.5, sum(rate(litellm_request_total_latency_metric_bucket[2m])) by (le))", "expr": "histogram_quantile(0.5, sum(rate(litellm_request_total_latency_metric_bucket[2m])) by (le))",
@ -391,7 +391,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"description": "x-ratelimit-remaining-requests returning from LLM APIs", "description": "x-ratelimit-remaining-requests returning from LLM APIs",
"fieldConfig": { "fieldConfig": {
@ -473,7 +473,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "topk(5, sort(litellm_remaining_requests))", "expr": "topk(5, sort(litellm_remaining_requests))",
@ -488,7 +488,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"description": "x-ratelimit-remaining-tokens from LLM API ", "description": "x-ratelimit-remaining-tokens from LLM API ",
"fieldConfig": { "fieldConfig": {
@ -570,7 +570,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "topk(5, sort(litellm_remaining_tokens))", "expr": "topk(5, sort(litellm_remaining_tokens))",
@ -598,7 +598,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"description": "Requests per second by Key Alias (keys are LiteLLM Virtual Keys). If key is None - means no Alias Set ", "description": "Requests per second by Key Alias (keys are LiteLLM Virtual Keys). If key is None - means no Alias Set ",
"fieldConfig": { "fieldConfig": {
@ -679,7 +679,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "sum(rate(litellm_proxy_total_requests_metric_total[2m])) by (api_key_alias)\n", "expr": "sum(rate(litellm_proxy_total_requests_metric_total[2m])) by (api_key_alias)\n",
@ -694,7 +694,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"description": "Requests per second by Team Alias. If team is None - means no team alias Set ", "description": "Requests per second by Team Alias. If team is None - means no team alias Set ",
"fieldConfig": { "fieldConfig": {
@ -775,7 +775,7 @@
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "${DS_PROMETHEUS}" "uid": "bdiyc60dco54we"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "sum(rate(litellm_proxy_total_requests_metric_total[2m])) by (team_alias)\n", "expr": "sum(rate(litellm_proxy_total_requests_metric_total[2m])) by (team_alias)\n",
@ -792,27 +792,7 @@
"schemaVersion": 40, "schemaVersion": 40,
"tags": [], "tags": [],
"templating": { "templating": {
"list": [ "list": []
{
"current": {
"selected": false,
"text": "prometheus",
"value": "edx8memhpd9tsb"
},
"hide": 0,
"includeAll": false,
"label": "datasource",
"multi": false,
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
}
]
}, },
"time": { "time": {
"from": "now-6h", "from": "now-6h",

View file

@ -16,7 +16,6 @@ spec:
{{- toYaml . | nindent 8 }} {{- toYaml . | nindent 8 }}
{{- end }} {{- end }}
spec: spec:
serviceAccountName: {{ include "litellm.serviceAccountName" . }}
containers: containers:
- name: prisma-migrations - name: prisma-migrations
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "main-%s" .Chart.AppVersion) }}" image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "main-%s" .Chart.AppVersion) }}"

View file

@ -1,15 +1,13 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# /audio/transcriptions # Speech to Text
Use this to loadbalance across Azure + OpenAI. Use this to loadbalance across Azure + OpenAI.
## Quick Start ## Quick Start
### LiteLLM Python SDK ```python
```python showLineNumbers
from litellm import transcription from litellm import transcription
import os import os
@ -22,7 +20,7 @@ response = transcription(model="whisper", file=audio_file)
print(f"response: {response}") print(f"response: {response}")
``` ```
### LiteLLM Proxy ## Proxy Usage
### Add model to config ### Add model to config
@ -30,7 +28,7 @@ print(f"response: {response}")
<Tabs> <Tabs>
<TabItem value="openai" label="OpenAI"> <TabItem value="openai" label="OpenAI">
```yaml showLineNumbers ```yaml
model_list: model_list:
- model_name: whisper - model_name: whisper
litellm_params: litellm_params:
@ -45,7 +43,7 @@ general_settings:
</TabItem> </TabItem>
<TabItem value="openai+azure" label="OpenAI + Azure"> <TabItem value="openai+azure" label="OpenAI + Azure">
```yaml showLineNumbers ```yaml
model_list: model_list:
- model_name: whisper - model_name: whisper
litellm_params: litellm_params:
@ -90,9 +88,9 @@ curl --location 'http://0.0.0.0:8000/v1/audio/transcriptions' \
``` ```
</TabItem> </TabItem>
<TabItem value="openai" label="OpenAI Python SDK"> <TabItem value="openai" label="OpenAI">
```python showLineNumbers ```python
from openai import OpenAI from openai import OpenAI
client = openai.OpenAI( client = openai.OpenAI(
api_key="sk-1234", api_key="sk-1234",

View file

@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem';
# Using Audio Models # Using Audio Models
How to send / receive audio to a `/chat/completions` endpoint How to send / receieve audio to a `/chat/completions` endpoint
## Audio Output from a model ## Audio Output from a model

View file

@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem';
# Using PDF Input # Using PDF Input
How to send / receive pdf's (other document types) to a `/chat/completions` endpoint How to send / receieve pdf's (other document types) to a `/chat/completions` endpoint
Works for: Works for:
- Vertex AI models (Gemini + Anthropic) - Vertex AI models (Gemini + Anthropic)

View file

@ -194,7 +194,7 @@ Expected Response
## Explicitly specify image type ## Explicitly specify image type
If you have images without a mime-type, or if litellm is incorrectly inferring the mime type of your image (e.g. calling `gs://` url's with vertex ai), you can set this explicitly via the `format` param. If you have images without a mime-type, or if litellm is incorrectly inferring the mime type of your image (e.g. calling `gs://` url's with vertex ai), you can set this explicity via the `format` param.
```python ```python
"image_url": { "image_url": {

View file

@ -1,15 +1,8 @@
# Images
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Image Generations
## Quick Start ## Quick Start
### LiteLLM Python SDK ```python
```python showLineNumbers
from litellm import image_generation from litellm import image_generation
import os import os
@ -21,23 +14,24 @@ response = image_generation(prompt="A cute baby sea otter", model="dall-e-3")
print(f"response: {response}") print(f"response: {response}")
``` ```
### LiteLLM Proxy ## Proxy Usage
### Setup config.yaml ### Setup config.yaml
```yaml showLineNumbers ```yaml
model_list: model_list:
- model_name: gpt-image-1 ### RECEIVED MODEL NAME ### - model_name: dall-e-2 ### RECEIVED MODEL NAME ###
litellm_params: # all params accepted by litellm.image_generation() litellm_params: # all params accepted by litellm.image_generation()
model: azure/gpt-image-1 ### MODEL NAME sent to `litellm.image_generation()` ### model: azure/dall-e-2 ### MODEL NAME sent to `litellm.image_generation()` ###
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/ api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU") api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
``` ```
### Start proxy ### Start proxy
```bash showLineNumbers ```bash
litellm --config /path/to/config.yaml litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000 # RUNNING on http://0.0.0.0:4000
@ -53,7 +47,7 @@ curl -X POST 'http://0.0.0.0:4000/v1/images/generations' \
-H 'Content-Type: application/json' \ -H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \ -H 'Authorization: Bearer sk-1234' \
-D '{ -D '{
"model": "gpt-image-1", "model": "dall-e-2",
"prompt": "A cute baby sea otter", "prompt": "A cute baby sea otter",
"n": 1, "n": 1,
"size": "1024x1024" "size": "1024x1024"
@ -63,7 +57,7 @@ curl -X POST 'http://0.0.0.0:4000/v1/images/generations' \
</TabItem> </TabItem>
<TabItem value="openai" label="OpenAI"> <TabItem value="openai" label="OpenAI">
```python showLineNumbers ```python
from openai import OpenAI from openai import OpenAI
client = openai.OpenAI( client = openai.OpenAI(
api_key="sk-1234", api_key="sk-1234",
@ -110,19 +104,15 @@ Any non-openai params, will be treated as provider-specific params, and sent in
litellm_logging_obj=None, litellm_logging_obj=None,
custom_llm_provider=None, custom_llm_provider=None,
- `model`: *string (optional)* The model to use for image generation. Defaults to openai/gpt-image-1 - `model`: *string (optional)* The model to use for image generation. Defaults to openai/dall-e-2
- `n`: *int (optional)* The number of images to generate. Must be between 1 and 10. For dall-e-3, only n=1 is supported. - `n`: *int (optional)* The number of images to generate. Must be between 1 and 10. For dall-e-3, only n=1 is supported.
- `quality`: *string (optional)* The quality of the image that will be generated. - `quality`: *string (optional)* The quality of the image that will be generated. hd creates images with finer details and greater consistency across the image. This param is only supported for dall-e-3.
* `auto` (default value) will automatically select the best quality for the given model.
* `high`, `medium` and `low` are supported for `gpt-image-1`.
* `hd` and `standard` are supported for `dall-e-3`.
* `standard` is the only option for `dall-e-2`.
- `response_format`: *string (optional)* The format in which the generated images are returned. Must be one of url or b64_json. - `response_format`: *string (optional)* The format in which the generated images are returned. Must be one of url or b64_json.
- `size`: *string (optional)* The size of the generated images. Must be one of `1024x1024`, `1536x1024` (landscape), `1024x1536` (portrait), or `auto` (default value) for `gpt-image-1`, one of `256x256`, `512x512`, or `1024x1024` for `dall-e-2`, and one of `1024x1024`, `1792x1024`, or `1024x1792` for `dall-e-3`. - `size`: *string (optional)* The size of the generated images. Must be one of 256x256, 512x512, or 1024x1024 for dall-e-2. Must be one of 1024x1024, 1792x1024, or 1024x1792 for dall-e-3 models.
- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes). - `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
@ -158,14 +148,13 @@ Any non-openai params, will be treated as provider-specific params, and sent in
from litellm import image_generation from litellm import image_generation
import os import os
os.environ['OPENAI_API_KEY'] = "" os.environ['OPENAI_API_KEY'] = ""
response = image_generation(model='gpt-image-1', prompt="cute baby otter") response = image_generation(model='dall-e-2', prompt="cute baby otter")
``` ```
| Model Name | Function Call | Required OS Variables | | Model Name | Function Call | Required OS Variables |
|----------------------|---------------------------------------------|--------------------------------------| |----------------------|---------------------------------------------|--------------------------------------|
| gpt-image-1 | `image_generation(model='gpt-image-1', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
| dall-e-3 | `image_generation(model='dall-e-3', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
| dall-e-2 | `image_generation(model='dall-e-2', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` | | dall-e-2 | `image_generation(model='dall-e-2', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
| dall-e-3 | `image_generation(model='dall-e-3', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
## Azure OpenAI Image Generation Models ## Azure OpenAI Image Generation Models
@ -193,9 +182,8 @@ print(response)
| Model Name | Function Call | | Model Name | Function Call |
|----------------------|---------------------------------------------| |----------------------|---------------------------------------------|
| gpt-image-1 | `image_generation(model="azure/<your deployment name>", prompt="cute baby otter")` |
| dall-e-3 | `image_generation(model="azure/<your deployment name>", prompt="cute baby otter")` |
| dall-e-2 | `image_generation(model="azure/<your deployment name>", prompt="cute baby otter")` | | dall-e-2 | `image_generation(model="azure/<your deployment name>", prompt="cute baby otter")` |
| dall-e-3 | `image_generation(model="azure/<your deployment name>", prompt="cute baby otter")` |
## OpenAI Compatible Image Generation Models ## OpenAI Compatible Image Generation Models

View file

@ -1,83 +0,0 @@
# 🖇️ AgentOps - LLM Observability Platform
:::tip
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[AgentOps](https://docs.agentops.ai) is an observability platform that enables tracing and monitoring of LLM calls, providing detailed insights into your AI operations.
## Using AgentOps with LiteLLM
LiteLLM provides `success_callbacks` and `failure_callbacks`, allowing you to easily integrate AgentOps for comprehensive tracing and monitoring of your LLM operations.
### Integration
Use just a few lines of code to instantly trace your responses **across all providers** with AgentOps:
Get your AgentOps API Keys from https://app.agentops.ai/
```python
import litellm
# Configure LiteLLM to use AgentOps
litellm.success_callback = ["agentops"]
# Make your LLM calls as usual
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello, how are you?"}],
)
```
Complete Code:
```python
import os
from litellm import completion
# Set env variables
os.environ["OPENAI_API_KEY"] = "your-openai-key"
os.environ["AGENTOPS_API_KEY"] = "your-agentops-api-key"
# Configure LiteLLM to use AgentOps
litellm.success_callback = ["agentops"]
# OpenAI call
response = completion(
model="gpt-4",
messages=[{"role": "user", "content": "Hi 👋 - I'm OpenAI"}],
)
print(response)
```
### Configuration Options
The AgentOps integration can be configured through environment variables:
- `AGENTOPS_API_KEY` (str, optional): Your AgentOps API key
- `AGENTOPS_ENVIRONMENT` (str, optional): Deployment environment (defaults to "production")
- `AGENTOPS_SERVICE_NAME` (str, optional): Service name for tracing (defaults to "agentops")
### Advanced Usage
You can configure additional settings through environment variables:
```python
import os
# Configure AgentOps settings
os.environ["AGENTOPS_API_KEY"] = "your-agentops-api-key"
os.environ["AGENTOPS_ENVIRONMENT"] = "staging"
os.environ["AGENTOPS_SERVICE_NAME"] = "my-service"
# Enable AgentOps tracing
litellm.success_callback = ["agentops"]
```
### Support
For issues or questions, please refer to:
- [AgentOps Documentation](https://docs.agentops.ai)
- [LiteLLM Documentation](https://docs.litellm.ai)

View file

@ -53,7 +53,7 @@ response = completion(
## Additional information in metadata ## Additional information in metadata
You can send any additional information to Greenscale by using the `metadata` field in completion and `greenscale_` prefix. This can be useful for sending metadata about the request, such as the project and application name, customer_id, environment, or any other information you want to track usage. `greenscale_project` and `greenscale_application` are required fields. You can send any additional information to Greenscale by using the `metadata` field in completion and `greenscale_` prefix. This can be useful for sending metadata about the request, such as the project and application name, customer_id, enviornment, or any other information you want to track usage. `greenscale_project` and `greenscale_application` are required fields.
```python ```python
#openai call with additional metadata #openai call with additional metadata

View file

@ -185,7 +185,7 @@ curl --location --request POST 'http://0.0.0.0:4000/chat/completions' \
* `trace_release` - Release for the trace, defaults to `None` * `trace_release` - Release for the trace, defaults to `None`
* `trace_metadata` - Metadata for the trace, defaults to `None` * `trace_metadata` - Metadata for the trace, defaults to `None`
* `trace_user_id` - User identifier for the trace, defaults to completion argument `user` * `trace_user_id` - User identifier for the trace, defaults to completion argument `user`
* `tags` - Tags for the trace, defaults to `None` * `tags` - Tags for the trace, defeaults to `None`
##### Updatable Parameters on Continuation ##### Updatable Parameters on Continuation

View file

@ -4,7 +4,7 @@ Pass-through endpoints for Cohere - call provider-specific endpoint, in native f
| Feature | Supported | Notes | | Feature | Supported | Notes |
|-------|-------|-------| |-------|-------|-------|
| Cost Tracking | ✅ | Supported for `/v1/chat`, and `/v2/chat` | | Cost Tracking | ✅ | works across all integrations |
| Logging | ✅ | works across all integrations | | Logging | ✅ | works across all integrations |
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) | | End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | | | Streaming | ✅ | |

View file

@ -1,217 +0,0 @@
# Mistral
Pass-through endpoints for Mistral - call provider-specific endpoint, in native format (no translation).
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ❌ | Not supported |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | |
Just replace `https://api.mistral.ai/v1` with `LITELLM_PROXY_BASE_URL/mistral` 🚀
#### **Example Usage**
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "image_url",
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
}
}'
```
Supports **ALL** Mistral Endpoints (including streaming).
## Quick Start
Let's call the Mistral [`/chat/completions` endpoint](https://docs.mistral.ai/api/#tag/chat/operation/chat_completion_v1_chat_completions_post)
1. Add MISTRAL_API_KEY to your environment
```bash
export MISTRAL_API_KEY="sk-1234"
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the Mistral `/ocr` endpoint
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "image_url",
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
}
}'
```
## Examples
Anything after `http://0.0.0.0:4000/mistral` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://api.mistral.ai/v1` | `http://0.0.0.0:4000/mistral` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `bearer $MISTRAL_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: OCR endpoint**
#### LiteLLM Proxy Call
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_API_KEY' \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "image_url",
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
}
}'
```
#### Direct Mistral API Call
```bash
curl https://api.mistral.ai/v1/ocr \
-H "Content-Type: application/json" \
-H "Authorization: Bearer ${MISTRAL_API_KEY}" \
-d '{
"model": "mistral-ocr-latest",
"document": {
"type": "document_url",
"document_url": "https://arxiv.org/pdf/2201.04234"
},
"include_image_base64": true
}'
```
### **Example 2: Chat API**
#### LiteLLM Proxy Call
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "mistral-large-latest",
}'
```
#### Direct Mistral API Call
```bash
curl -L -X POST 'https://api.mistral.ai/v1/chat/completions' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "mistral-large-latest",
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Mistral API key, but still letting them use Mistral endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export MISTRAL_API_BASE=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
--data '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "qwen2.5-7b-instruct",
}'
```

View file

@ -13,15 +13,6 @@ Pass-through endpoints for Vertex AI - call provider-specific endpoint, in nativ
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) | | End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | | | Streaming | ✅ | |
## Supported Endpoints
LiteLLM supports 2 vertex ai passthrough routes:
1. `/vertex_ai` → routes to `https://{vertex_location}-aiplatform.googleapis.com/`
2. `/vertex_ai/discovery` → routes to [`https://discoveryengine.googleapis.com`](https://discoveryengine.googleapis.com/)
## How to use
Just replace `https://REGION-aiplatform.googleapis.com` with `LITELLM_PROXY_BASE_URL/vertex_ai` Just replace `https://REGION-aiplatform.googleapis.com` with `LITELLM_PROXY_BASE_URL/vertex_ai`
LiteLLM supports 3 flows for calling Vertex AI endpoints via pass-through: LiteLLM supports 3 flows for calling Vertex AI endpoints via pass-through:
@ -222,7 +213,7 @@ curl http://localhost:4000/vertex-ai/v1/projects/${PROJECT_ID}/locations/us-cent
LiteLLM Proxy Server supports two methods of authentication to Vertex AI: LiteLLM Proxy Server supports two methods of authentication to Vertex AI:
1. Pass Vertex Credentials client side to proxy server 1. Pass Vertex Credetials client side to proxy server
2. Set Vertex AI credentials on proxy server 2. Set Vertex AI credentials on proxy server

View file

@ -1,185 +0,0 @@
# VLLM
Pass-through endpoints for VLLM - call provider-specific endpoint, in native format (no translation).
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ❌ | Not supported |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | |
Just replace `https://my-vllm-server.com` with `LITELLM_PROXY_BASE_URL/vllm` 🚀
#### **Example Usage**
```bash
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
```
Supports **ALL** VLLM Endpoints (including streaming).
## Quick Start
Let's call the VLLM [`/metrics` endpoint](https://vllm.readthedocs.io/en/latest/api_reference/api_reference.html)
1. Add HOSTED VLLM API BASE to your environment
```bash
export HOSTED_VLLM_API_BASE="https://my-vllm-server.com"
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the VLLM `/metrics` endpoint
```bash
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
```
## Examples
Anything after `http://0.0.0.0:4000/vllm` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://my-vllm-server.com` | `http://0.0.0.0:4000/vllm` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `bearer $VLLM_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: Metrics endpoint**
#### LiteLLM Proxy Call
```bash
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
```
#### Direct VLLM API Call
```bash
curl -L -X GET 'https://my-vllm-server.com/metrics' \
-H 'Content-Type: application/json' \
```
### **Example 2: Chat API**
#### LiteLLM Proxy Call
```bash
curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "qwen2.5-7b-instruct",
}'
```
#### Direct VLLM API Call
```bash
curl -L -X POST 'https://my-vllm-server.com/chat/completions' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "qwen2.5-7b-instruct",
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Cohere API key, but still letting them use Cohere endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export HOSTED_VLLM_API_BASE=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
--data '{
"messages": [
{
"role": "user",
"content": "I am going to Paris, what should I see?"
}
],
"max_tokens": 2048,
"temperature": 0.8,
"top_p": 0.1,
"model": "qwen2.5-7b-instruct",
}'
```

View file

@ -1095,7 +1095,7 @@ response = completion(
print(response.choices[0]) print(response.choices[0])
``` ```
</TabItem> </TabItem>
<TabItem value="proxy" label="PROXY"> <TabItem value="proxy" label="PROXY">
1. Add model to config 1. Add model to config

View file

@ -483,7 +483,7 @@ response.stream_to_file(speech_file_path)
This is a walkthrough on how to use Azure Active Directory Tokens - Microsoft Entra ID to make `litellm.completion()` calls This is a walkthrough on how to use Azure Active Directory Tokens - Microsoft Entra ID to make `litellm.completion()` calls
Step 1 - Download Azure CLI Step 1 - Download Azure CLI
Installation instructions: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli Installation instructions: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli
```shell ```shell
brew update && brew install azure-cli brew update && brew install azure-cli
``` ```
@ -1002,125 +1002,8 @@ Expected Response:
``` ```
## **Azure Responses API**
| Property | Details |
|-------|-------|
| Description | Azure OpenAI Responses API |
| `custom_llm_provider` on LiteLLM | `azure/` |
| Supported Operations | `/v1/responses`|
| Azure OpenAI Responses API | [Azure OpenAI Responses API ↗](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/responses?tabs=python-secure) |
| Cost Tracking, Logging Support | ✅ LiteLLM will log, track cost for Responses API Requests |
| Supported OpenAI Params | ✅ All OpenAI params are supported, [See here](https://github.com/BerriAI/litellm/blob/0717369ae6969882d149933da48eeb8ab0e691bd/litellm/llms/openai/responses/transformation.py#L23) |
## Usage
## Create a model response
<Tabs>
<TabItem value="litellm-sdk" label="LiteLLM SDK">
#### Non-streaming
```python showLineNumbers title="Azure Responses API"
import litellm
# Non-streaming response
response = litellm.responses(
model="azure/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100,
api_key=os.getenv("AZURE_RESPONSES_OPENAI_API_KEY"),
api_base="https://litellm8397336933.openai.azure.com/",
api_version="2023-03-15-preview",
)
print(response)
```
#### Streaming
```python showLineNumbers title="Azure Responses API"
import litellm
# Streaming response
response = litellm.responses(
model="azure/o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True,
api_key=os.getenv("AZURE_RESPONSES_OPENAI_API_KEY"),
api_base="https://litellm8397336933.openai.azure.com/",
api_version="2023-03-15-preview",
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="Azure Responses API"
model_list:
- model_name: o1-pro
litellm_params:
model: azure/o1-pro
api_key: os.environ/AZURE_RESPONSES_OPENAI_API_KEY
api_base: https://litellm8397336933.openai.azure.com/
api_version: 2023-03-15-preview
```
Start your LiteLLM proxy:
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
Then use the OpenAI SDK pointed to your proxy:
#### Non-streaming
```python showLineNumbers
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="o1-pro",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
</Tabs>

View file

@ -13,9 +13,7 @@ os.environ["COHERE_API_KEY"] = ""
## Usage ## Usage
### LiteLLM Python SDK ```python
```python showLineNumbers
from litellm import completion from litellm import completion
## set ENV variables ## set ENV variables
@ -28,9 +26,9 @@ response = completion(
) )
``` ```
#### Streaming ## Usage - Streaming
```python showLineNumbers ```python
from litellm import completion from litellm import completion
## set ENV variables ## set ENV variables
@ -48,90 +46,15 @@ for chunk in response:
``` ```
## Usage with LiteLLM Proxy
Here's how to call Cohere with the LiteLLM Proxy Server
### 1. Save key in your environment
```bash
export COHERE_API_KEY="your-api-key"
```
### 2. Start the proxy
Define the cohere models you want to use in the config.yaml
```yaml showLineNumbers
model_list:
- model_name: command-a-03-2025
litellm_params:
model: command-a-03-2025
api_key: "os.environ/COHERE_API_KEY"
```
```bash
litellm --config /path/to/config.yaml
```
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell showLineNumbers
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer <your-litellm-api-key>' \
--data ' {
"model": "command-a-03-2025",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python showLineNumbers
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy
response = client.chat.completions.create(model="command-a-03-2025", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
</Tabs>
## Supported Models ## Supported Models
| Model Name | Function Call | | Model Name | Function Call |
|------------|----------------| |------------|----------------|
| command-a-03-2025 | `litellm.completion('command-a-03-2025', messages)` | | command-r-plus-08-2024 | `completion('command-r-plus-08-2024', messages)` |
| command-r-plus-08-2024 | `litellm.completion('command-r-plus-08-2024', messages)` | | command-r-08-2024 | `completion('command-r-08-2024', messages)` |
| command-r-08-2024 | `litellm.completion('command-r-08-2024', messages)` | | command-r-plus | `completion('command-r-plus', messages)` |
| command-r-plus | `litellm.completion('command-r-plus', messages)` | | command-r | `completion('command-r', messages)` |
| command-r | `litellm.completion('command-r', messages)` | | command-light | `completion('command-light', messages)` |
| command-light | `litellm.completion('command-light', messages)` | | command-nightly | `completion('command-nightly', messages)` |
| command-nightly | `litellm.completion('command-nightly', messages)` |
## Embedding ## Embedding

View file

@ -39,164 +39,14 @@ response = completion(
- temperature - temperature
- top_p - top_p
- max_tokens - max_tokens
- max_completion_tokens
- stream - stream
- tools - tools
- tool_choice - tool_choice
- functions
- response_format - response_format
- n - n
- stop - stop
- logprobs
- frequency_penalty
- modalities
- reasoning_content
**Anthropic Params**
- thinking (used to set max budget tokens across anthropic/gemini models)
[**See Updated List**](https://github.com/BerriAI/litellm/blob/main/litellm/llms/gemini/chat/transformation.py#L70)
## Usage - Thinking / `reasoning_content`
LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362)
**Mapping**
| reasoning_effort | thinking |
| ---------------- | -------- |
| "low" | "budget_tokens": 1024 |
| "medium" | "budget_tokens": 2048 |
| "high" | "budget_tokens": 4096 |
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
resp = completion(
model="gemini/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
reasoning_effort="low",
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
- model_name: gemini-2.5-flash
litellm_params:
model: gemini/gemini-2.5-flash-preview-04-17
api_key: os.environ/GEMINI_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "gemini-2.5-flash",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"reasoning_effort": "low"
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='claude-3-7-sonnet-20250219',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
),
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
### Pass `thinking` to Gemini models
You can also pass the `thinking` parameter to Gemini models.
This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget).
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = litellm.completion(
model="gemini/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "gemini/gemini-2.5-flash-preview-04-17",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
[**See Updated List**](https://github.com/BerriAI/litellm/blob/1c747f3ad372399c5b95cc5696b06a5fbe53186b/litellm/llms/vertex_httpx.py#L122)
## Passing Gemini Specific Params ## Passing Gemini Specific Params
### Response schema ### Response schema
@ -655,7 +505,7 @@ import os
os.environ["GEMINI_API_KEY"] = ".." os.environ["GEMINI_API_KEY"] = ".."
tools = [{"googleSearch": {}}] # 👈 ADD GOOGLE SEARCH tools = [{"googleSearchRetrieval": {}}] # 👈 ADD GOOGLE SEARCH
response = completion( response = completion(
model="gemini/gemini-2.0-flash", model="gemini/gemini-2.0-flash",
@ -691,7 +541,7 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-d '{ -d '{
"model": "gemini-2.0-flash", "model": "gemini-2.0-flash",
"messages": [{"role": "user", "content": "What is the weather in San Francisco?"}], "messages": [{"role": "user", "content": "What is the weather in San Francisco?"}],
"tools": [{"googleSearch": {}}] "tools": [{"googleSearchRetrieval": {}}]
} }
' '
``` ```

View file

@ -4,16 +4,17 @@ import TabItem from '@theme/TabItem';
# Infinity # Infinity
| Property | Details | | Property | Details |
| ------------------------- | ---------------------------------------------------------------------------------------------------------- | |-------|-------|
| Description | Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip| | Description | Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip|
| Provider Route on LiteLLM | `infinity/` | | Provider Route on LiteLLM | `infinity/` |
| Supported Operations | `/rerank`, `/embeddings` | | Supported Operations | `/rerank` |
| Link to Provider Doc | [Infinity ↗](https://github.com/michaelfeil/infinity) | | Link to Provider Doc | [Infinity ↗](https://github.com/michaelfeil/infinity) |
## **Usage - LiteLLM Python SDK** ## **Usage - LiteLLM Python SDK**
```python ```python
from litellm import rerank, embedding from litellm import rerank
import os import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080" os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
@ -38,8 +39,8 @@ model_list:
- model_name: custom-infinity-rerank - model_name: custom-infinity-rerank
litellm_params: litellm_params:
model: infinity/rerank model: infinity/rerank
api_base: https://localhost:8080
api_key: os.environ/INFINITY_API_KEY api_key: os.environ/INFINITY_API_KEY
api_base: https://localhost:8080
``` ```
Start litellm Start litellm
@ -50,9 +51,7 @@ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000 # RUNNING on http://0.0.0.0:4000
``` ```
## Test request: Test request
### Rerank
```bash ```bash
curl http://0.0.0.0:4000/rerank \ curl http://0.0.0.0:4000/rerank \
@ -71,10 +70,11 @@ curl http://0.0.0.0:4000/rerank \
}' }'
``` ```
#### Supported Cohere Rerank API Params
## Supported Cohere Rerank API Params
| Param | Type | Description | | Param | Type | Description |
| ------------------ | ----------- | ----------------------------------------------- | |-------|-------|-------|
| `query` | `str` | The query to rerank the documents against | | `query` | `str` | The query to rerank the documents against |
| `documents` | `list[str]` | The documents to rerank | | `documents` | `list[str]` | The documents to rerank |
| `top_n` | `int` | The number of documents to return | | `top_n` | `int` | The number of documents to return |
@ -138,7 +138,6 @@ response = rerank(
raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM
) )
``` ```
</TabItem> </TabItem>
<TabItem value="proxy" label="PROXY"> <TabItem value="proxy" label="PROXY">
@ -180,121 +179,6 @@ curl http://0.0.0.0:4000/rerank \
"raw_scores": True # 👈 PROVIDER-SPECIFIC PARAM "raw_scores": True # 👈 PROVIDER-SPECIFIC PARAM
}' }'
``` ```
</TabItem> </TabItem>
</Tabs> </Tabs>
## Embeddings
LiteLLM provides an OpenAI api compatible `/embeddings` endpoint for embedding calls.
**Setup**
Add this to your litellm proxy config.yaml
```yaml
model_list:
- model_name: custom-infinity-embedding
litellm_params:
model: infinity/provider/custom-embedding-v1
api_base: http://localhost:8080
api_key: os.environ/INFINITY_API_KEY
```
### Test request:
```bash
curl http://0.0.0.0:4000/embeddings \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-embedding",
"input": ["hello"]
}'
```
#### Supported Embedding API Params
| Param | Type | Description |
| ----------------- | ----------- | ----------------------------------------------------------- |
| `model` | `str` | The embedding model to use |
| `input` | `list[str]` | The text inputs to generate embeddings for |
| `encoding_format` | `str` | The format to return embeddings in (e.g. "float", "base64") |
| `modality` | `str` | The type of input (e.g. "text", "image", "audio") |
### Usage - Basic Examples
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import embedding
import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
response = embedding(
model="infinity/bge-small",
input=["good morning from litellm"]
)
print(response.data[0]['embedding'])
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/embeddings \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-embedding",
"input": ["hello"]
}'
```
</TabItem>
</Tabs>
### Usage - OpenAI Client
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from openai import OpenAI
client = OpenAI(
api_key="<LITELLM_MASTER_KEY>",
base_url="<LITELLM_URL>"
)
response = client.embeddings.create(
model="bge-small",
input=["The food was delicious and the waiter..."],
encoding_format="float"
)
print(response.data[0].embedding)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/embeddings \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "bge-small",
"input": ["The food was delicious and the waiter..."],
"encoding_format": "float"
}'
```
</TabItem>
</Tabs>

View file

@ -163,12 +163,6 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
| Model Name | Function Call | | Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------| |-----------------------|-----------------------------------------------------------------|
| gpt-4.1 | `response = completion(model="gpt-4.1", messages=messages)` |
| gpt-4.1-mini | `response = completion(model="gpt-4.1-mini", messages=messages)` |
| gpt-4.1-nano | `response = completion(model="gpt-4.1-nano", messages=messages)` |
| o4-mini | `response = completion(model="o4-mini", messages=messages)` |
| o3-mini | `response = completion(model="o3-mini", messages=messages)` |
| o3 | `response = completion(model="o3", messages=messages)` |
| o1-mini | `response = completion(model="o1-mini", messages=messages)` | | o1-mini | `response = completion(model="o1-mini", messages=messages)` |
| o1-preview | `response = completion(model="o1-preview", messages=messages)` | | o1-preview | `response = completion(model="o1-preview", messages=messages)` |
| gpt-4o-mini | `response = completion(model="gpt-4o-mini", messages=messages)` | | gpt-4o-mini | `response = completion(model="gpt-4o-mini", messages=messages)` |

View file

@ -364,7 +364,7 @@ from litellm import completion
## SETUP ENVIRONMENT ## SETUP ENVIRONMENT
# !gcloud auth application-default login - run this to add vertex credentials to your env # !gcloud auth application-default login - run this to add vertex credentials to your env
tools = [{"googleSearch": {}}] # 👈 ADD GOOGLE SEARCH tools = [{"googleSearchRetrieval": {}}] # 👈 ADD GOOGLE SEARCH
resp = litellm.completion( resp = litellm.completion(
model="vertex_ai/gemini-1.0-pro-001", model="vertex_ai/gemini-1.0-pro-001",
@ -391,7 +391,7 @@ client = OpenAI(
response = client.chat.completions.create( response = client.chat.completions.create(
model="gemini-pro", model="gemini-pro",
messages=[{"role": "user", "content": "Who won the world cup?"}], messages=[{"role": "user", "content": "Who won the world cup?"}],
tools=[{"googleSearch": {}}], tools=[{"googleSearchRetrieval": {}}],
) )
print(response) print(response)
@ -410,7 +410,7 @@ curl http://localhost:4000/v1/chat/completions \
], ],
"tools": [ "tools": [
{ {
"googleSearch": {} "googleSearchRetrieval": {}
} }
] ]
}' }'
@ -529,7 +529,7 @@ from litellm import completion
# !gcloud auth application-default login - run this to add vertex credentials to your env # !gcloud auth application-default login - run this to add vertex credentials to your env
tools = [{"googleSearch": {"disable_attributon": False}}] # 👈 ADD GOOGLE SEARCH tools = [{"googleSearchRetrieval": {"disable_attributon": False}}] # 👈 ADD GOOGLE SEARCH
resp = litellm.completion( resp = litellm.completion(
model="vertex_ai/gemini-1.0-pro-001", model="vertex_ai/gemini-1.0-pro-001",
@ -542,157 +542,9 @@ print(resp)
``` ```
### **Thinking / `reasoning_content`**
LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362)
**Mapping**
| reasoning_effort | thinking |
| ---------------- | -------- |
| "low" | "budget_tokens": 1024 |
| "medium" | "budget_tokens": 2048 |
| "high" | "budget_tokens": 4096 |
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
# !gcloud auth application-default login - run this to add vertex credentials to your env
resp = completion(
model="vertex_ai/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
reasoning_effort="low",
vertex_project="project-id",
vertex_location="us-central1"
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
- model_name: gemini-2.5-flash
litellm_params:
model: vertex_ai/gemini-2.5-flash-preview-04-17
vertex_credentials: {"project_id": "project-id", "location": "us-central1", "project_key": "project-key"}
vertex_project: "project-id"
vertex_location: "us-central1"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "gemini-2.5-flash",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"reasoning_effort": "low"
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='claude-3-7-sonnet-20250219',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
),
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
#### Pass `thinking` to Gemini models
You can also pass the `thinking` parameter to Gemini models.
This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget).
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
# !gcloud auth application-default login - run this to add vertex credentials to your env
response = litellm.completion(
model="vertex_ai/gemini-2.5-flash-preview-04-17",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
vertex_project="project-id",
vertex_location="us-central1"
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "vertex_ai/gemini-2.5-flash-preview-04-17",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
### **Context Caching** ### **Context Caching**
Use Vertex AI context caching is supported by calling provider api directly. (Unified Endpoint support coming soon.). Use Vertex AI context caching is supported by calling provider api directly. (Unified Endpoint support coming soon.).
[**Go straight to provider**](../pass_through/vertex_ai.md#context-caching) [**Go straight to provider**](../pass_through/vertex_ai.md#context-caching)
@ -910,7 +762,7 @@ export VERTEXAI_PROJECT="my-test-project" # ONLY use if model project is differe
## Specifying Safety Settings ## Specifying Safety Settings
In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simply pass the `safety_settings` argument to `completion` or `acompletion`. For example: In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simply pass the `safety_settings` argument to `completion` or `acompletion`. For example:
### Set per model/request ### Set per model/request
@ -2050,7 +1902,7 @@ response = completion(
print(response.choices[0]) print(response.choices[0])
``` ```
</TabItem> </TabItem>
<TabItem value="proxy" label="PROXY"> <TabItem value="proxy" label="PROXY">
1. Add model to config 1. Add model to config

View file

@ -161,120 +161,6 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020) Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)
<Tabs>
<TabItem value="files_message" label="(Unified) Files Message">
Use this to send a video url to VLLM + Gemini in the same format, using OpenAI's `files` message type.
There are two ways to send a video url to VLLM:
1. Pass the video url directly
```
{"type": "file", "file": {"file_id": video_url}},
```
2. Pass the video data as base64
```
{"type": "file", "file": {"file_data": f"data:video/mp4;base64,{video_data_base64}"}}
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Summarize the following video"
},
{
"type": "file",
"file": {
"file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
}
}
]
}
]
# call vllm
os.environ["HOSTED_VLLM_API_BASE"] = "https://hosted-vllm-api.co"
os.environ["HOSTED_VLLM_API_KEY"] = "" # [optional], if your VLLM server requires an API key
response = completion(
model="hosted_vllm/qwen", # pass the vllm model name
messages=messages,
)
# call gemini
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
response = completion(
model="gemini/gemini-1.5-flash", # pass the gemini model name
messages=messages,
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: my-model
litellm_params:
model: hosted_vllm/qwen # add hosted_vllm/ prefix to route as OpenAI provider
api_base: https://hosted-vllm-api.co # add api base for OpenAI compatible provider
- model_name: my-gemini-model
litellm_params:
model: gemini/gemini-1.5-flash # add gemini/ prefix to route as Google AI Studio provider
api_key: os.environ/GEMINI_API_KEY
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl -X POST http://0.0.0.0:4000/chat/completions \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "my-model",
"messages": [
{"role": "user", "content":
[
{"type": "text", "text": "Summarize the following video"},
{"type": "file", "file": {"file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
]
}
]
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="video_url" label="(VLLM-specific) Video Message">
Use this to send a video url to VLLM in its native message format (`video_url`).
There are two ways to send a video url to VLLM: There are two ways to send a video url to VLLM:
1. Pass the video url directly 1. Pass the video url directly
@ -363,10 +249,6 @@ curl -X POST http://0.0.0.0:4000/chat/completions \
</Tabs> </Tabs>
</TabItem>
</Tabs>
## (Deprecated) for `vllm pip package` ## (Deprecated) for `vllm pip package`
### Using - `litellm.completion` ### Using - `litellm.completion`

View file

@ -243,12 +243,12 @@ We allow you to pass a local image or a an http/https url of your image
Set `UI_LOGO_PATH` on your env. We recommend using a hosted image, it's a lot easier to set up and configure / debug Set `UI_LOGO_PATH` on your env. We recommend using a hosted image, it's a lot easier to set up and configure / debug
Example setting Hosted image Example setting Hosted image
```shell ```shell
UI_LOGO_PATH="https://litellm-logo-aws-marketplace.s3.us-west-2.amazonaws.com/berriai-logo-github.png" UI_LOGO_PATH="https://litellm-logo-aws-marketplace.s3.us-west-2.amazonaws.com/berriai-logo-github.png"
``` ```
Example setting a local image (on your container) Example setting a local image (on your container)
```shell ```shell
UI_LOGO_PATH="ui_images/logo.jpg" UI_LOGO_PATH="ui_images/logo.jpg"
``` ```

View file

@ -213,7 +213,7 @@ model_list:
general_settings: general_settings:
master_key: sk-1234 master_key: sk-1234
alerting: ["slack"] alerting: ["slack"]
alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting
alert_to_webhook_url: { alert_to_webhook_url: {
"llm_exceptions": "https://hooks.slack.com/services/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", "llm_exceptions": "https://hooks.slack.com/services/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",
"llm_too_slow": "https://hooks.slack.com/services/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", "llm_too_slow": "https://hooks.slack.com/services/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",
@ -247,7 +247,7 @@ model_list:
general_settings: general_settings:
master_key: sk-1234 master_key: sk-1234
alerting: ["slack"] alerting: ["slack"]
alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting
alert_to_webhook_url: { alert_to_webhook_url: {
"llm_exceptions": ["os.environ/SLACK_WEBHOOK_URL", "os.environ/SLACK_WEBHOOK_URL_2"], "llm_exceptions": ["os.environ/SLACK_WEBHOOK_URL", "os.environ/SLACK_WEBHOOK_URL_2"],
"llm_too_slow": ["https://webhook.site/7843a980-a494-4967-80fb-d502dbc16886", "https://webhook.site/28cfb179-f4fb-4408-8129-729ff55cf213"], "llm_too_slow": ["https://webhook.site/7843a980-a494-4967-80fb-d502dbc16886", "https://webhook.site/28cfb179-f4fb-4408-8129-729ff55cf213"],
@ -425,7 +425,7 @@ curl -X GET --location 'http://0.0.0.0:4000/health/services?service=webhook' \
- `projected_exceeded_date` *str or null*: The date when the budget is projected to be exceeded, returned when 'soft_budget' is set for key (optional). - `projected_exceeded_date` *str or null*: The date when the budget is projected to be exceeded, returned when 'soft_budget' is set for key (optional).
- `projected_spend` *float or null*: The projected spend amount, returned when 'soft_budget' is set for key (optional). - `projected_spend` *float or null*: The projected spend amount, returned when 'soft_budget' is set for key (optional).
- `event` *Literal["budget_crossed", "threshold_crossed", "projected_limit_exceeded"]*: The type of event that triggered the webhook. Possible values are: - `event` *Literal["budget_crossed", "threshold_crossed", "projected_limit_exceeded"]*: The type of event that triggered the webhook. Possible values are:
* "spend_tracked": Emitted whenever spend is tracked for a customer id. * "spend_tracked": Emitted whenever spend is tracked for a customer id.
* "budget_crossed": Indicates that the spend has exceeded the max budget. * "budget_crossed": Indicates that the spend has exceeded the max budget.
* "threshold_crossed": Indicates that spend has crossed a threshold (currently sent when 85% and 95% of budget is reached). * "threshold_crossed": Indicates that spend has crossed a threshold (currently sent when 85% and 95% of budget is reached).
* "projected_limit_exceeded": For "key" only - Indicates that the projected spend is expected to exceed the soft budget threshold. * "projected_limit_exceeded": For "key" only - Indicates that the projected spend is expected to exceed the soft budget threshold.
@ -480,7 +480,7 @@ LLM-related Alerts
| `cooldown_deployment` | Alerts when a deployment is put into cooldown | ✅ | | `cooldown_deployment` | Alerts when a deployment is put into cooldown | ✅ |
| `new_model_added` | Notifications when a new model is added to litellm proxy through /model/new| ✅ | | `new_model_added` | Notifications when a new model is added to litellm proxy through /model/new| ✅ |
| `outage_alerts` | Alerts when a specific LLM deployment is facing an outage | ✅ | | `outage_alerts` | Alerts when a specific LLM deployment is facing an outage | ✅ |
| `region_outage_alerts` | Alerts when a specific LLM region is facing an outage. Example us-east-1 | ✅ | | `region_outage_alerts` | Alerts when a specific LLM region is facing an outage. Example us-east-1 | ✅ |
Budget and Spend Alerts Budget and Spend Alerts

View file

@ -299,9 +299,6 @@ router_settings:
|------|-------------| |------|-------------|
| ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token for requesting ID in GitHub Actions | ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token for requesting ID in GitHub Actions
| ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions | ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions
| AGENTOPS_ENVIRONMENT | Environment for AgentOps logging integration
| AGENTOPS_API_KEY | API Key for AgentOps logging integration
| AGENTOPS_SERVICE_NAME | Service Name for AgentOps logging integration
| AISPEND_ACCOUNT_ID | Account ID for AI Spend | AISPEND_ACCOUNT_ID | Account ID for AI Spend
| AISPEND_API_KEY | API Key for AI Spend | AISPEND_API_KEY | API Key for AI Spend
| ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access | ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access
@ -442,7 +439,6 @@ router_settings:
| LITELLM_EMAIL | Email associated with LiteLLM account | LITELLM_EMAIL | Email associated with LiteLLM account
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM | LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM | LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM
| LITELLM_MIGRATION_DIR | Custom migrations directory for prisma migrations, used for baselining db in read-only file systems.
| LITELLM_HOSTED_UI | URL of the hosted UI for LiteLLM | LITELLM_HOSTED_UI | URL of the hosted UI for LiteLLM
| LITELLM_LICENSE | License key for LiteLLM usage | LITELLM_LICENSE | License key for LiteLLM usage
| LITELLM_LOCAL_MODEL_COST_MAP | Local configuration for model cost mapping in LiteLLM | LITELLM_LOCAL_MODEL_COST_MAP | Local configuration for model cost mapping in LiteLLM

View file

@ -56,7 +56,7 @@ model_list:
model: azure/<your_deployment_name> model: azure/<your_deployment_name>
api_key: os.environ/AZURE_API_KEY api_key: os.environ/AZURE_API_KEY
api_base: os.environ/AZURE_API_BASE api_base: os.environ/AZURE_API_BASE
api_version: os.environ/AZURE_API_VERSION api_version: os.environ/AZURE_API_VERSION
model_info: model_info:
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token

View file

@ -19,7 +19,7 @@ LiteLLM writes `UPDATE` and `UPSERT` queries to the DB. When using 10+ instances
### Stage 1. Each instance writes updates to redis ### Stage 1. Each instance writes updates to redis
Each instance will accumulate the spend updates for a key, user, team, etc and write the updates to a redis queue. Each instance will accumulate the spend updates for a key, user, team, etc and write the updates to a redis queue.
<Image img={require('../../img/deadlock_fix_1.png')} style={{ width: '900px', height: 'auto' }} /> <Image img={require('../../img/deadlock_fix_1.png')} style={{ width: '900px', height: 'auto' }} />
<p style={{textAlign: 'left', color: '#666'}}> <p style={{textAlign: 'left', color: '#666'}}>

View file

@ -22,7 +22,7 @@ echo 'LITELLM_MASTER_KEY="sk-1234"' > .env
# Add the litellm salt key - you cannot change this after adding a model # Add the litellm salt key - you cannot change this after adding a model
# It is used to encrypt / decrypt your LLM API Key credentials # It is used to encrypt / decrypt your LLM API Key credentials
# We recommend - https://1password.com/password-generator/ # We recommend - https://1password.com/password-generator/
# password generator to get a random hash for litellm salt key # password generator to get a random hash for litellm salt key
echo 'LITELLM_SALT_KEY="sk-1234"' >> .env echo 'LITELLM_SALT_KEY="sk-1234"' >> .env
@ -125,7 +125,7 @@ CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"]
### Build from litellm `pip` package ### Build from litellm `pip` package
Follow these instructions to build a docker container from the litellm pip package. If your company has a strict requirement around security / building images you can follow these steps. Follow these instructions to build a docker container from the litellm pip package. If your company has a strict requirement around security / building images you can follow these steps.
Dockerfile Dockerfile
@ -999,7 +999,7 @@ services:
- "4000:4000" # Map the container port to the host, change the host port if necessary - "4000:4000" # Map the container port to the host, change the host port if necessary
volumes: volumes:
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file - ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
# You can change the port or number of workers as per your requirements or pass any new supported CLI argument. Make sure the port passed here matches with the container port defined above in `ports` value # You can change the port or number of workers as per your requirements or pass any new supported CLI argument. Make sure the port passed here matches with the container port defined above in `ports` value
command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ] command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ]
# ...rest of your docker-compose config if any # ...rest of your docker-compose config if any

View file

@ -691,7 +691,7 @@ curl --request POST \
<TabItem value="admin_only_routes" label="Test `admin_only_routes`"> <TabItem value="admin_only_routes" label="Test `admin_only_routes`">
**Successful Request** **Successful Request**
```shell ```shell
curl --location 'http://0.0.0.0:4000/key/generate' \ curl --location 'http://0.0.0.0:4000/key/generate' \
@ -729,7 +729,7 @@ curl --location 'http://0.0.0.0:4000/key/generate' \
<TabItem value="allowed_routes" label="Test `allowed_routes`"> <TabItem value="allowed_routes" label="Test `allowed_routes`">
**Successful Request** **Successful Request**
```shell ```shell
curl http://localhost:4000/chat/completions \ curl http://localhost:4000/chat/completions \

View file

@ -164,7 +164,7 @@ curl -i http://localhost:4000/v1/chat/completions \
**Expected response** **Expected response**
Your response headers will include `x-litellm-applied-guardrails` with the guardrail applied Your response headers will include `x-litellm-applied-guardrails` with the guardrail applied
``` ```
x-litellm-applied-guardrails: aporia-pre-guard x-litellm-applied-guardrails: aporia-pre-guard

View file

@ -277,7 +277,7 @@ Found under `kwargs["standard_logging_object"]`. This is a standard payload, log
## Langfuse ## Langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successful LLM calls to langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successful LLM calls to langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment
**Step 1** Install langfuse **Step 1** Install langfuse
@ -535,8 +535,8 @@ print(response)
Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields
| LiteLLM specific field | Description | Example Value | | LiteLLM specific field | Description | Example Value |
|---------------------------|-----------------------------------------------------------------------------------------|------------------------------------------------| |------------------------|-------------------------------------------------------|------------------------------------------------|
| `cache_hit` | Indicates whether a cache hit occurred (True) or not (False) | `true`, `false` | | `cache_hit` | Indicates whether a cache hit occurred (True) or not (False) | `true`, `false` |
| `cache_key` | The Cache key used for this request | `d2b758c****`| | `cache_key` | The Cache key used for this request | `d2b758c****`|
| `proxy_base_url` | The base URL for the proxy server, the value of env var `PROXY_BASE_URL` on your server | `https://proxy.example.com`| | `proxy_base_url` | The base URL for the proxy server, the value of env var `PROXY_BASE_URL` on your server | `https://proxy.example.com`|
| `user_api_key_alias` | An alias for the LiteLLM Virtual Key.| `prod-app1` | | `user_api_key_alias` | An alias for the LiteLLM Virtual Key.| `prod-app1` |
@ -1190,7 +1190,7 @@ We will use the `--config` to set
- `litellm.success_callback = ["s3"]` - `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to s3 Bucket This will log all successful LLM calls to s3 Bucket
**Step 1** Set AWS Credentials in .env **Step 1** Set AWS Credentials in .env
@ -1279,7 +1279,7 @@ Log LLM Logs to [Azure Data Lake Storage](https://learn.microsoft.com/en-us/azur
| Property | Details | | Property | Details |
|----------|---------| |----------|---------|
| Description | Log LLM Input/Output to Azure Blob Storage (Bucket) | | Description | Log LLM Input/Output to Azure Blob Storage (Bucket) |
| Azure Docs on Data Lake Storage | [Azure Data Lake Storage](https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) | | Azure Docs on Data Lake Storage | [Azure Data Lake Storage](https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) |
@ -1360,7 +1360,7 @@ LiteLLM Supports logging to the following Datadog Integrations:
<Tabs> <Tabs>
<TabItem value="datadog" label="Datadog Logs"> <TabItem value="datadog" label="Datadog Logs">
We will use the `--config` to set `litellm.callbacks = ["datadog"]` this will log all successful LLM calls to DataDog We will use the `--config` to set `litellm.callbacks = ["datadog"]` this will log all successful LLM calls to DataDog
**Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` **Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
@ -1636,7 +1636,7 @@ class MyCustomHandler(CustomLogger):
litellm_params = kwargs.get("litellm_params", {}) litellm_params = kwargs.get("litellm_params", {})
metadata = litellm_params.get("metadata", {}) # headers passed to LiteLLM proxy, can be found here metadata = litellm_params.get("metadata", {}) # headers passed to LiteLLM proxy, can be found here
# Access Exceptions & Traceback # Access Exceptions & Traceback
exception_event = kwargs.get("exception", None) exception_event = kwargs.get("exception", None)
traceback_event = kwargs.get("traceback_exception", None) traceback_event = kwargs.get("traceback_exception", None)
@ -2205,7 +2205,7 @@ We will use the `--config` to set
- `litellm.success_callback = ["dynamodb"]` - `litellm.success_callback = ["dynamodb"]`
- `litellm.dynamodb_table_name = "your-table-name"` - `litellm.dynamodb_table_name = "your-table-name"`
This will log all successful LLM calls to DynamoDB This will log all successful LLM calls to DynamoDB
**Step 1** Set AWS Credentials in .env **Step 1** Set AWS Credentials in .env
@ -2370,7 +2370,7 @@ litellm --test
[Athina](https://athina.ai/) allows you to log LLM Input/Output for monitoring, analytics, and observability. [Athina](https://athina.ai/) allows you to log LLM Input/Output for monitoring, analytics, and observability.
We will use the `--config` to set `litellm.success_callback = ["athina"]` this will log all successful LLM calls to athina We will use the `--config` to set `litellm.success_callback = ["athina"]` this will log all successful LLM calls to athina
**Step 1** Set Athina API key **Step 1** Set Athina API key

View file

@ -1,108 +0,0 @@
# Model Discovery
Use this to give users an accurate list of models available behind provider endpoint, when calling `/v1/models` for wildcard models.
## Supported Models
- Fireworks AI
- OpenAI
- Gemini
- LiteLLM Proxy
- Topaz
- Anthropic
- XAI
- VLLM
- Vertex AI
### Usage
**1. Setup config.yaml**
```yaml
model_list:
- model_name: xai/*
litellm_params:
model: xai/*
api_key: os.environ/XAI_API_KEY
litellm_settings:
check_provider_endpoint: true # 👈 Enable checking provider endpoint for wildcard models
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
**3. Call `/v1/models`**
```bash
curl -X GET "http://localhost:4000/v1/models" -H "Authorization: Bearer $LITELLM_KEY"
```
Expected response
```json
{
"data": [
{
"id": "xai/grok-2-1212",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-2-vision-1212",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-fast-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-mini-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-3-mini-fast-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-vision-beta",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
},
{
"id": "xai/grok-2-image-1212",
"object": "model",
"created": 1677610602,
"owned_by": "openai"
}
],
"object": "list"
}
```

View file

@ -61,7 +61,7 @@ CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
## 3. Use Redis 'port','host', 'password'. NOT 'redis_url' ## 3. Use Redis 'port','host', 'password'. NOT 'redis_url'
If you decide to use Redis, DO NOT use 'redis_url'. We recommend using redis port, host, and password params. If you decide to use Redis, DO NOT use 'redis_url'. We recommend using redis port, host, and password params.
`redis_url`is 80 RPS slower `redis_url`is 80 RPS slower
@ -169,7 +169,7 @@ If you plan on using the DB, set a salt key for encrypting/decrypting variables
Do not change this after adding a model. It is used to encrypt / decrypt your LLM API Key credentials Do not change this after adding a model. It is used to encrypt / decrypt your LLM API Key credentials
We recommend - https://1password.com/password-generator/ password generator to get a random hash for litellm salt key. We recommend - https://1password.com/password-generator/ password generator to get a random hash for litellm salt key.
```bash ```bash
export LITELLM_SALT_KEY="sk-1234" export LITELLM_SALT_KEY="sk-1234"

View file

@ -3,7 +3,7 @@
Set temporary budget increase for a LiteLLM Virtual Key. Use this if you get asked to increase the budget for a key temporarily. Set temporary budget increase for a LiteLLM Virtual Key. Use this if you get asked to increase the budget for a key temporarily.
| Hierarchy | Supported | | Hierarchy | Supported |
|-----------|-----------| |-----------|-----------|
| LiteLLM Virtual Key | ✅ | | LiteLLM Virtual Key | ✅ |
| User | ❌ | | User | ❌ |

View file

@ -4,7 +4,7 @@ import TabItem from '@theme/TabItem';
# Adding LLM Credentials # Adding LLM Credentials
You can add LLM provider credentials on the UI. Once you add credentials you can reuse them when adding new models You can add LLM provider credentials on the UI. Once you add credentials you can re-use them when adding new models
## Add a credential + model ## Add a credential + model

View file

@ -3,7 +3,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Getting Started with UI Logs # UI Logs Page
View Spend, Token Usage, Key, Team Name for Each Request to LiteLLM View Spend, Token Usage, Key, Team Name for Each Request to LiteLLM
@ -52,3 +52,4 @@ If you do not want to store spend logs in DB, you can opt out with this setting
general_settings: general_settings:
disable_spend_logs: True # Disable writing spend logs to DB disable_spend_logs: True # Disable writing spend logs to DB
``` ```

View file

@ -1,320 +0,0 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Session Logs
Group requests into sessions. This allows you to group related requests together.
<Image img={require('../../img/ui_session_logs.png')}/>
## Usage
### `/chat/completions`
To group multiple requests into a single session, pass the same `litellm_trace_id` in the metadata for each request. Here's how to do it:
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
**Request 1**
Create a new session with a unique ID and make the first request. The session ID will be used to track all related requests.
```python showLineNumbers
import openai
import uuid
# Create a session ID
session_id = str(uuid.uuid4())
client = openai.OpenAI(
api_key="<your litellm api key>",
base_url="http://0.0.0.0:4000"
)
# First request in session
response1 = client.chat.completions.create(
model="gpt-4o",
messages=[
{
"role": "user",
"content": "Write a short story about a robot"
}
],
extra_body={
"metadata": {
"litellm_trace_id": session_id # Pass the session ID
}
}
)
```
**Request 2**
Make another request using the same session ID to link it with the previous request. This allows tracking related requests together.
```python showLineNumbers
# Second request using same session ID
response2 = client.chat.completions.create(
model="gpt-4o",
messages=[
{
"role": "user",
"content": "Now write a poem about that robot"
}
],
extra_body={
"metadata": {
"litellm_trace_id": session_id # Reuse the same session ID
}
}
)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
**Request 1**
Initialize a new session with a unique ID and create a chat model instance for making requests. The session ID is embedded in the model's configuration.
```python showLineNumbers
from langchain.chat_models import ChatOpenAI
import uuid
# Create a session ID
session_id = str(uuid.uuid4())
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
api_key="<your litellm api key>",
model="gpt-4o",
extra_body={
"metadata": {
"litellm_trace_id": session_id # Pass the session ID
}
}
)
# First request in session
response1 = chat.invoke("Write a short story about a robot")
```
**Request 2**
Use the same chat model instance to make another request, automatically maintaining the session context through the previously configured session ID.
```python showLineNumbers
# Second request using same chat object and session ID
response2 = chat.invoke("Now write a poem about that robot")
```
</TabItem>
<TabItem value="curl" label="Curl">
**Request 1**
Generate a new session ID and make the initial API call. The session ID in the metadata will be used to track this conversation.
```bash showLineNumbers
# Create a session ID
SESSION_ID=$(uuidgen)
# Store your API key
API_KEY="<your litellm api key>"
# First request in session
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header "Authorization: Bearer $API_KEY" \
--data '{
"model": "gpt-4o",
"messages": [
{
"role": "user",
"content": "Write a short story about a robot"
}
],
"metadata": {
"litellm_trace_id": "'$SESSION_ID'"
}
}'
```
**Request 2**
Make a follow-up request using the same session ID to maintain conversation context and tracking.
```bash showLineNumbers
# Second request using same session ID
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header "Authorization: Bearer $API_KEY" \
--data '{
"model": "gpt-4o",
"messages": [
{
"role": "user",
"content": "Now write a poem about that robot"
}
],
"metadata": {
"litellm_trace_id": "'$SESSION_ID'"
}
}'
```
</TabItem>
<TabItem value="litellm" label="LiteLLM Python SDK">
**Request 1**
Start a new session by creating a unique ID and making the initial request. This session ID will be used to group related requests together.
```python showLineNumbers
import litellm
import uuid
# Create a session ID
session_id = str(uuid.uuid4())
# First request in session
response1 = litellm.completion(
model="gpt-4o",
messages=[{"role": "user", "content": "Write a short story about a robot"}],
api_base="http://0.0.0.0:4000",
api_key="<your litellm api key>",
metadata={
"litellm_trace_id": session_id # Pass the session ID
}
)
```
**Request 2**
Continue the conversation by making another request with the same session ID, linking it to the previous interaction.
```python showLineNumbers
# Second request using same session ID
response2 = litellm.completion(
model="gpt-4o",
messages=[{"role": "user", "content": "Now write a poem about that robot"}],
api_base="http://0.0.0.0:4000",
api_key="<your litellm api key>",
metadata={
"litellm_trace_id": session_id # Reuse the same session ID
}
)
```
</TabItem>
</Tabs>
### `/responses`
For the `/responses` endpoint, use `previous_response_id` to group requests into a session. The `previous_response_id` is returned in the response of each request.
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
**Request 1**
Make the initial request and store the response ID for linking follow-up requests.
```python showLineNumbers
from openai import OpenAI
client = OpenAI(
api_key="<your litellm api key>",
base_url="http://0.0.0.0:4000"
)
# First request in session
response1 = client.responses.create(
model="anthropic/claude-3-sonnet-20240229-v1:0",
input="Write a short story about a robot"
)
# Store the response ID for the next request
response_id = response1.id
```
**Request 2**
Make a follow-up request using the previous response ID to maintain the conversation context.
```python showLineNumbers
# Second request using previous response ID
response2 = client.responses.create(
model="anthropic/claude-3-sonnet-20240229-v1:0",
input="Now write a poem about that robot",
previous_response_id=response_id # Link to previous request
)
```
</TabItem>
<TabItem value="curl" label="Curl">
**Request 1**
Make the initial request. The response will include an ID that can be used to link follow-up requests.
```bash showLineNumbers
# Store your API key
API_KEY="<your litellm api key>"
# First request in session
curl http://localhost:4000/v1/responses \
--header 'Content-Type: application/json' \
--header "Authorization: Bearer $API_KEY" \
--data '{
"model": "anthropic/claude-3-sonnet-20240229-v1:0",
"input": "Write a short story about a robot"
}'
# Response will include an 'id' field that you'll use in the next request
```
**Request 2**
Make a follow-up request using the previous response ID to maintain the conversation context.
```bash showLineNumbers
# Second request using previous response ID
curl http://localhost:4000/v1/responses \
--header 'Content-Type: application/json' \
--header "Authorization: Bearer $API_KEY" \
--data '{
"model": "anthropic/claude-3-sonnet-20240229-v1:0",
"input": "Now write a poem about that robot",
"previous_response_id": "resp_abc123..." # Replace with actual response ID from previous request
}'
```
</TabItem>
<TabItem value="litellm" label="LiteLLM Python SDK">
**Request 1**
Make the initial request and store the response ID for linking follow-up requests.
```python showLineNumbers
import litellm
# First request in session
response1 = litellm.responses(
model="anthropic/claude-3-sonnet-20240229-v1:0",
input="Write a short story about a robot",
api_base="http://0.0.0.0:4000",
api_key="<your litellm api key>"
)
# Store the response ID for the next request
response_id = response1.id
```
**Request 2**
Make a follow-up request using the previous response ID to maintain the conversation context.
```python showLineNumbers
# Second request using previous response ID
response2 = litellm.responses(
model="anthropic/claude-3-sonnet-20240229-v1:0",
input="Now write a poem about that robot",
api_base="http://0.0.0.0:4000",
api_key="<your litellm api key>",
previous_response_id=response_id # Link to previous request
)
```
</TabItem>
</Tabs>

View file

@ -23,7 +23,7 @@ Requirements:
- ** Set on config.yaml** set your master key under `general_settings:master_key`, example below - ** Set on config.yaml** set your master key under `general_settings:master_key`, example below
- ** Set env variable** set `LITELLM_MASTER_KEY` - ** Set env variable** set `LITELLM_MASTER_KEY`
(the proxy Dockerfile checks if the `DATABASE_URL` is set and then initializes the DB connection) (the proxy Dockerfile checks if the `DATABASE_URL` is set and then initializes the DB connection)
```shell ```shell
export DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> export DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>
@ -333,7 +333,7 @@ curl http://localhost:4000/v1/chat/completions \
**Expected Response** **Expected Response**
Expect to see a successful response from the litellm proxy since the key passed in `X-Litellm-Key` is valid Expect to see a successful response from the litellm proxy since the key passed in `X-Litellm-Key` is valid
```shell ```shell
{"id":"chatcmpl-f9b2b79a7c30477ab93cd0e717d1773e","choices":[{"finish_reason":"stop","index":0,"message":{"content":"\n\nHello there, how may I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1677652288,"model":"gpt-3.5-turbo-0125","object":"chat.completion","system_fingerprint":"fp_44709d6fcb","usage":{"completion_tokens":12,"prompt_tokens":9,"total_tokens":21} {"id":"chatcmpl-f9b2b79a7c30477ab93cd0e717d1773e","choices":[{"finish_reason":"stop","index":0,"message":{"content":"\n\nHello there, how may I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1677652288,"model":"gpt-3.5-turbo-0125","object":"chat.completion","system_fingerprint":"fp_44709d6fcb","usage":{"completion_tokens":12,"prompt_tokens":9,"total_tokens":21}
``` ```

View file

@ -16,8 +16,6 @@ Supported Providers:
- Vertex AI (Anthropic) (`vertexai/`) - Vertex AI (Anthropic) (`vertexai/`)
- OpenRouter (`openrouter/`) - OpenRouter (`openrouter/`)
- XAI (`xai/`) - XAI (`xai/`)
- Google AI Studio (`google/`)
- Vertex AI (`vertex_ai/`)
LiteLLM will standardize the `reasoning_content` in the response and `thinking_blocks` in the assistant message. LiteLLM will standardize the `reasoning_content` in the response and `thinking_blocks` in the assistant message.
@ -25,7 +23,7 @@ LiteLLM will standardize the `reasoning_content` in the response and `thinking_b
"message": { "message": {
... ...
"reasoning_content": "The capital of France is Paris.", "reasoning_content": "The capital of France is Paris.",
"thinking_blocks": [ # only returned for Anthropic models "thinking_blocks": [
{ {
"type": "thinking", "type": "thinking",
"thinking": "The capital of France is Paris.", "thinking": "The capital of France is Paris.",

View file

@ -14,22 +14,22 @@ LiteLLM provides a BETA endpoint in the spec of [OpenAI's `/responses` API](http
| Fallbacks | ✅ | Works between supported models | | Fallbacks | ✅ | Works between supported models |
| Loadbalancing | ✅ | Works between supported models | | Loadbalancing | ✅ | Works between supported models |
| Supported LiteLLM Versions | 1.63.8+ | | | Supported LiteLLM Versions | 1.63.8+ | |
| Supported LLM providers | **All LiteLLM supported providers** | `openai`, `anthropic`, `bedrock`, `vertex_ai`, `gemini`, `azure`, `azure_ai` etc. | | Supported LLM providers | `openai` | |
## Usage ## Usage
### LiteLLM Python SDK ## Create a model response
<Tabs> <Tabs>
<TabItem value="openai" label="OpenAI"> <TabItem value="litellm-sdk" label="LiteLLM SDK">
#### Non-streaming #### Non-streaming
```python showLineNumbers title="OpenAI Non-streaming Response" ```python
import litellm import litellm
# Non-streaming response # Non-streaming response
response = litellm.responses( response = litellm.responses(
model="openai/o1-pro", model="o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.", input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100 max_output_tokens=100
) )
@ -38,12 +38,12 @@ print(response)
``` ```
#### Streaming #### Streaming
```python showLineNumbers title="OpenAI Streaming Response" ```python
import litellm import litellm
# Streaming response # Streaming response
response = litellm.responses( response = litellm.responses(
model="openai/o1-pro", model="o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.", input="Tell me a three sentence bedtime story about a unicorn.",
stream=True stream=True
) )
@ -53,204 +53,28 @@ for event in response:
``` ```
</TabItem> </TabItem>
<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
<TabItem value="anthropic" label="Anthropic">
#### Non-streaming
```python showLineNumbers title="Anthropic Non-streaming Response"
import litellm
import os
# Set API key
os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key"
# Non-streaming response
response = litellm.responses(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python showLineNumbers title="Anthropic Streaming Response"
import litellm
import os
# Set API key
os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key"
# Streaming response
response = litellm.responses(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="vertex" label="Vertex AI">
#### Non-streaming
```python showLineNumbers title="Vertex AI Non-streaming Response"
import litellm
import os
# Set credentials - Vertex AI uses application default credentials
# Run 'gcloud auth application-default login' to authenticate
os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
# Non-streaming response
response = litellm.responses(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python showLineNumbers title="Vertex AI Streaming Response"
import litellm
import os
# Set credentials - Vertex AI uses application default credentials
# Run 'gcloud auth application-default login' to authenticate
os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
# Streaming response
response = litellm.responses(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="bedrock" label="AWS Bedrock">
#### Non-streaming
```python showLineNumbers title="AWS Bedrock Non-streaming Response"
import litellm
import os
# Set AWS credentials
os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id"
os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key"
os.environ["AWS_REGION_NAME"] = "us-west-2" # or your AWS region
# Non-streaming response
response = litellm.responses(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python showLineNumbers title="AWS Bedrock Streaming Response"
import litellm
import os
# Set AWS credentials
os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id"
os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key"
os.environ["AWS_REGION_NAME"] = "us-west-2" # or your AWS region
# Streaming response
response = litellm.responses(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="gemini" label="Google AI Studio">
#### Non-streaming
```python showLineNumbers title="Google AI Studio Non-streaming Response"
import litellm
import os
# Set API key for Google AI Studio
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
# Non-streaming response
response = litellm.responses(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn.",
max_output_tokens=100
)
print(response)
```
#### Streaming
```python showLineNumbers title="Google AI Studio Streaming Response"
import litellm
import os
# Set API key for Google AI Studio
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
# Streaming response
response = litellm.responses(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
</Tabs>
### LiteLLM Proxy with OpenAI SDK
First, set up and start your LiteLLM proxy server.
```bash title="Start LiteLLM Proxy Server"
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
<Tabs>
<TabItem value="openai" label="OpenAI">
First, add this to your litellm proxy config.yaml: First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="OpenAI Proxy Configuration" ```yaml
model_list: model_list:
- model_name: openai/o1-pro - model_name: o1-pro
litellm_params: litellm_params:
model: openai/o1-pro model: openai/o1-pro
api_key: os.environ/OPENAI_API_KEY api_key: os.environ/OPENAI_API_KEY
``` ```
Start your LiteLLM proxy:
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
Then use the OpenAI SDK pointed to your proxy:
#### Non-streaming #### Non-streaming
```python showLineNumbers title="OpenAI Proxy Non-streaming Response" ```python
from openai import OpenAI from openai import OpenAI
# Initialize client with your proxy URL # Initialize client with your proxy URL
@ -261,7 +85,7 @@ client = OpenAI(
# Non-streaming response # Non-streaming response
response = client.responses.create( response = client.responses.create(
model="openai/o1-pro", model="o1-pro",
input="Tell me a three sentence bedtime story about a unicorn." input="Tell me a three sentence bedtime story about a unicorn."
) )
@ -269,7 +93,7 @@ print(response)
``` ```
#### Streaming #### Streaming
```python showLineNumbers title="OpenAI Proxy Streaming Response" ```python
from openai import OpenAI from openai import OpenAI
# Initialize client with your proxy URL # Initialize client with your proxy URL
@ -280,222 +104,7 @@ client = OpenAI(
# Streaming response # Streaming response
response = client.responses.create( response = client.responses.create(
model="openai/o1-pro", model="o1-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="anthropic" label="Anthropic">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="Anthropic Proxy Configuration"
model_list:
- model_name: anthropic/claude-3-5-sonnet-20240620
litellm_params:
model: anthropic/claude-3-5-sonnet-20240620
api_key: os.environ/ANTHROPIC_API_KEY
```
#### Non-streaming
```python showLineNumbers title="Anthropic Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="Anthropic Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="anthropic/claude-3-5-sonnet-20240620",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="vertex" label="Vertex AI">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="Vertex AI Proxy Configuration"
model_list:
- model_name: vertex_ai/gemini-1.5-pro
litellm_params:
model: vertex_ai/gemini-1.5-pro
vertex_project: your-gcp-project-id
vertex_location: us-central1
```
#### Non-streaming
```python showLineNumbers title="Vertex AI Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="Vertex AI Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="vertex_ai/gemini-1.5-pro",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="bedrock" label="AWS Bedrock">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="AWS Bedrock Proxy Configuration"
model_list:
- model_name: bedrock/anthropic.claude-3-sonnet-20240229-v1:0
litellm_params:
model: bedrock/anthropic.claude-3-sonnet-20240229-v1:0
aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
aws_region_name: us-west-2
```
#### Non-streaming
```python showLineNumbers title="AWS Bedrock Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="AWS Bedrock Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
input="Tell me a three sentence bedtime story about a unicorn.",
stream=True
)
for event in response:
print(event)
```
</TabItem>
<TabItem value="gemini" label="Google AI Studio">
First, add this to your litellm proxy config.yaml:
```yaml showLineNumbers title="Google AI Studio Proxy Configuration"
model_list:
- model_name: gemini/gemini-1.5-flash
litellm_params:
model: gemini/gemini-1.5-flash
api_key: os.environ/GEMINI_API_KEY
```
#### Non-streaming
```python showLineNumbers title="Google AI Studio Proxy Non-streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Non-streaming response
response = client.responses.create(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn."
)
print(response)
```
#### Streaming
```python showLineNumbers title="Google AI Studio Proxy Streaming Response"
from openai import OpenAI
# Initialize client with your proxy URL
client = OpenAI(
base_url="http://localhost:4000", # Your proxy URL
api_key="your-api-key" # Your proxy API key
)
# Streaming response
response = client.responses.create(
model="gemini/gemini-1.5-flash",
input="Tell me a three sentence bedtime story about a unicorn.", input="Tell me a three sentence bedtime story about a unicorn.",
stream=True stream=True
) )
@ -506,128 +115,3 @@ for event in response:
</TabItem> </TabItem>
</Tabs> </Tabs>
## Supported Responses API Parameters
| Provider | Supported Parameters |
|----------|---------------------|
| `openai` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) |
| `azure` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) |
| `anthropic` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `bedrock` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `gemini` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `vertex_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| `azure_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| All other llm api providers | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
## Load Balancing with Routing Affinity
When using the Responses API with multiple deployments of the same model (e.g., multiple Azure OpenAI endpoints), LiteLLM provides routing affinity for conversations. This ensures that follow-up requests using a `previous_response_id` are routed to the same deployment that generated the original response.
#### Example Usage
<Tabs>
<TabItem value="python-sdk" label="Python SDK">
```python showLineNumbers title="Python SDK with Routing Affinity"
import litellm
# Set up router with multiple deployments of the same model
router = litellm.Router(
model_list=[
{
"model_name": "azure-gpt4-turbo",
"litellm_params": {
"model": "azure/gpt-4-turbo",
"api_key": "your-api-key-1",
"api_version": "2024-06-01",
"api_base": "https://endpoint1.openai.azure.com",
},
},
{
"model_name": "azure-gpt4-turbo",
"litellm_params": {
"model": "azure/gpt-4-turbo",
"api_key": "your-api-key-2",
"api_version": "2024-06-01",
"api_base": "https://endpoint2.openai.azure.com",
},
},
],
optional_pre_call_checks=["responses_api_deployment_check"],
)
# Initial request
response = await router.aresponses(
model="azure-gpt4-turbo",
input="Hello, who are you?",
truncation="auto",
)
# Store the response ID
response_id = response.id
# Follow-up request - will be automatically routed to the same deployment
follow_up = await router.aresponses(
model="azure-gpt4-turbo",
input="Tell me more about yourself",
truncation="auto",
previous_response_id=response_id # This ensures routing to the same deployment
)
```
</TabItem>
<TabItem value="proxy-server" label="Proxy Server">
#### 1. Setup routing affinity on proxy config.yaml
To enable routing affinity for Responses API in your LiteLLM proxy, set `optional_pre_call_checks: ["responses_api_deployment_check"]` in your proxy config.yaml.
```yaml showLineNumbers title="config.yaml with Responses API Routing Affinity"
model_list:
- model_name: azure-gpt4-turbo
litellm_params:
model: azure/gpt-4-turbo
api_key: your-api-key-1
api_version: 2024-06-01
api_base: https://endpoint1.openai.azure.com
- model_name: azure-gpt4-turbo
litellm_params:
model: azure/gpt-4-turbo
api_key: your-api-key-2
api_version: 2024-06-01
api_base: https://endpoint2.openai.azure.com
router_settings:
optional_pre_call_checks: ["responses_api_deployment_check"]
```
#### 2. Use the OpenAI Python SDK to make requests to LiteLLM Proxy
```python showLineNumbers title="OpenAI Client with Proxy Server"
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:4000",
api_key="your-api-key"
)
# Initial request
response = client.responses.create(
model="azure-gpt4-turbo",
input="Hello, who are you?"
)
response_id = response.id
# Follow-up request - will be automatically routed to the same deployment
follow_up = client.responses.create(
model="azure-gpt4-turbo",
input="Tell me more about yourself",
previous_response_id=response_id # This ensures routing to the same deployment
)
```
</TabItem>
</Tabs>

View file

@ -994,16 +994,16 @@ litellm --health
## Logging Proxy Input/Output - OpenTelemetry ## Logging Proxy Input/Output - OpenTelemetry
### Step 1 Start OpenTelemetry Collector Docker Container ### Step 1 Start OpenTelemetry Collector Docker Container
This container sends logs to your selected destination This container sends logs to your selected destination
#### Install OpenTelemetry Collector Docker Image #### Install OpenTelemetry Collector Docker Image
```shell ```shell
docker pull otel/opentelemetry-collector:0.90.0 docker pull otel/opentelemetry-collector:0.90.0
docker run -p 127.0.0.1:4317:4317 -p 127.0.0.1:55679:55679 otel/opentelemetry-collector:0.90.0 docker run -p 127.0.0.1:4317:4317 -p 127.0.0.1:55679:55679 otel/opentelemetry-collector:0.90.0
``` ```
#### Set Destination paths on OpenTelemetry Collector #### Set Destination paths on OpenTelemetry Collector
Here's the OpenTelemetry yaml config to use with Elastic Search Here's the OpenTelemetry yaml config to use with Elastic Search
```yaml ```yaml
@ -1077,7 +1077,7 @@ general_settings:
LiteLLM will read the `OTEL_ENDPOINT` environment variable to send data to your OTEL collector LiteLLM will read the `OTEL_ENDPOINT` environment variable to send data to your OTEL collector
```python ```python
os.environ['OTEL_ENDPOINT'] # defaults to 127.0.0.1:4317 if not provided os.environ['OTEL_ENDPOINT'] # defaults to 127.0.0.1:4317 if not provided
``` ```
#### Start LiteLLM Proxy #### Start LiteLLM Proxy
@ -1101,8 +1101,8 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
``` ```
#### Test & View Logs on OpenTelemetry Collector #### Test & View Logs on OpenTelemetry Collector
On successful logging you should be able to see this log on your `OpenTelemetry Collector` Docker Container On successful logging you should be able to see this log on your `OpenTelemetry Collector` Docker Container
```shell ```shell
Events: Events:
SpanEvent #0 SpanEvent #0
@ -1149,7 +1149,7 @@ Here's the log view on Elastic Search. You can see the request `input`, `output`
<Image img={require('../img/elastic_otel.png')} /> <Image img={require('../img/elastic_otel.png')} />
## Logging Proxy Input/Output - Langfuse ## Logging Proxy Input/Output - Langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successful LLM calls to langfuse We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successful LLM calls to langfuse
**Step 1** Install langfuse **Step 1** Install langfuse

View file

@ -1,8 +1,4 @@
import Image from '@theme/IdealImage'; # Text to Speech
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# /audio/speech
## **LiteLLM Python SDK Usage** ## **LiteLLM Python SDK Usage**
### Quick Start ### Quick Start

View file

@ -117,7 +117,7 @@ response = completion("command-nightly", messages)
""" """
# questions/logs you want to run the LLM on # questions/logs you want to run the LLM on
questions = [ questions = [
"what is litellm?", "what is litellm?",
"why should I use LiteLLM", "why should I use LiteLLM",

View file

@ -30,7 +30,7 @@ def inference(message, history):
yield partial_message yield partial_message
except Exception as e: except Exception as e:
print("Exception encountered:", str(e)) print("Exception encountered:", str(e))
yield f"An Error occurred please 'Clear' the error and try your question again" yield f"An Error occurred please 'Clear' the error and try your question again"
``` ```
### Define Chat Interface ### Define Chat Interface

View file

@ -1,146 +0,0 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Using LiteLLM with OpenAI Codex
This guide walks you through connecting OpenAI Codex to LiteLLM. Using LiteLLM with Codex allows teams to:
- Access 100+ LLMs through the Codex interface
- Use powerful models like Gemini through a familiar interface
- Track spend and usage with LiteLLM's built-in analytics
- Control model access with virtual keys
<Image img={require('../../img/litellm_codex.gif')} />
## Quickstart
:::info
Requires LiteLLM v1.66.3.dev5 and higher
:::
Make sure to set up LiteLLM with the [LiteLLM Getting Started Guide](../proxy/docker_quick_start.md).
## 1. Install OpenAI Codex
Install the OpenAI Codex CLI tool globally using npm:
<Tabs>
<TabItem value="npm" label="npm">
```bash showLineNumbers
npm i -g @openai/codex
```
</TabItem>
<TabItem value="yarn" label="yarn">
```bash showLineNumbers
yarn global add @openai/codex
```
</TabItem>
</Tabs>
## 2. Start LiteLLM Proxy
<Tabs>
<TabItem value="docker" label="Docker">
```bash showLineNumbers
docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
-p 4000:4000 \
ghcr.io/berriai/litellm:main-latest \
--config /app/config.yaml
```
</TabItem>
<TabItem value="pip" label="LiteLLM CLI">
```bash showLineNumbers
litellm --config /path/to/config.yaml
```
</TabItem>
</Tabs>
LiteLLM should now be running on [http://localhost:4000](http://localhost:4000)
## 3. Configure LiteLLM for Model Routing
Ensure your LiteLLM Proxy is properly configured to route to your desired models. Create a `litellm_config.yaml` file with the following content:
```yaml showLineNumbers
model_list:
- model_name: o3-mini
litellm_params:
model: openai/o3-mini
api_key: os.environ/OPENAI_API_KEY
- model_name: claude-3-7-sonnet-latest
litellm_params:
model: anthropic/claude-3-7-sonnet-latest
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: gemini-2.0-flash
litellm_params:
model: gemini/gemini-2.0-flash
api_key: os.environ/GEMINI_API_KEY
litellm_settings:
drop_params: true
```
This configuration enables routing to specific OpenAI, Anthropic, and Gemini models with explicit names.
## 4. Configure Codex to Use LiteLLM Proxy
Set the required environment variables to point Codex to your LiteLLM Proxy:
```bash
# Point to your LiteLLM Proxy server
export OPENAI_BASE_URL=http://0.0.0.0:4000
# Use your LiteLLM API key (if you've set up authentication)
export OPENAI_API_KEY="sk-1234"
```
## 5. Run Codex with Gemini
With everything configured, you can now run Codex with Gemini:
```bash showLineNumbers
codex --model gemini-2.0-flash --full-auto
```
<Image img={require('../../img/litellm_codex.gif')} />
The `--full-auto` flag allows Codex to automatically generate code without additional prompting.
## 6. Advanced Options
### Using Different Models
You can use any model configured in your LiteLLM proxy:
```bash
# Use Claude models
codex --model claude-3-7-sonnet-latest
# Use Google AI Studio Gemini models
codex --model gemini/gemini-2.0-flash
```
## Troubleshooting
- If you encounter connection issues, ensure your LiteLLM Proxy is running and accessible at the specified URL
- Verify your LiteLLM API key is valid if you're using authentication
- Check that your model routing configuration is correct
- For model-specific errors, ensure the model is properly configured in your LiteLLM setup
## Additional Resources
- [LiteLLM Docker Quick Start Guide](../proxy/docker_quick_start.md)
- [OpenAI Codex GitHub Repository](https://github.com/openai/codex)
- [LiteLLM Virtual Keys and Authentication](../proxy/virtual_keys.md)

View file

@ -1,74 +0,0 @@
import Image from '@theme/IdealImage';
# SCIM with LiteLLM
Enables identity providers (Okta, Azure AD, OneLogin, etc.) to automate user and team (group) provisioning, updates, and deprovisioning on LiteLLM.
This tutorial will walk you through the steps to connect your IDP to LiteLLM SCIM Endpoints.
### Supported SSO Providers for SCIM
Below is a list of supported SSO providers for connecting to LiteLLM SCIM Endpoints.
- Microsoft Entra ID (Azure AD)
- Okta
- Google Workspace
- OneLogin
- Keycloak
- Auth0
## 1. Get your SCIM Tenant URL and Bearer Token
On LiteLLM, navigate to the Settings > Admin Settings > SCIM. On this page you will create a SCIM Token, this allows your IDP to authenticate to litellm `/scim` endpoints.
<Image img={require('../../img/scim_2.png')} style={{ width: '800px', height: 'auto' }} />
## 2. Connect your IDP to LiteLLM SCIM Endpoints
On your IDP provider, navigate to your SSO application and select `Provisioning` > `New provisioning configuration`.
On this page, paste in your litellm scim tenant url and bearer token.
Once this is pasted in, click on `Test Connection` to ensure your IDP can authenticate to the LiteLLM SCIM endpoints.
<Image img={require('../../img/scim_4.png')} style={{ width: '800px', height: 'auto' }} />
## 3. Test SCIM Connection
### 3.1 Assign the group to your LiteLLM Enterprise App
On your IDP Portal, navigate to `Enterprise Applications` > Select your litellm app
<Image img={require('../../img/msft_enterprise_app.png')} style={{ width: '800px', height: 'auto' }} />
<br />
<br />
Once you've selected your litellm app, click on `Users and Groups` > `Add user/group`
<Image img={require('../../img/msft_enterprise_assign_group.png')} style={{ width: '800px', height: 'auto' }} />
<br />
Now select the group you created in step 1.1. And add it to the LiteLLM Enterprise App. At this point we have added `Production LLM Evals Group` to the LiteLLM Enterprise App. The next step is having LiteLLM automatically create the `Production LLM Evals Group` on the LiteLLM DB when a new user signs in.
<Image img={require('../../img/msft_enterprise_select_group.png')} style={{ width: '800px', height: 'auto' }} />
### 3.2 Sign in to LiteLLM UI via SSO
Sign into the LiteLLM UI via SSO. You should be redirected to the Entra ID SSO page. This SSO sign in flow will trigger LiteLLM to fetch the latest Groups and Members from Azure Entra ID.
<Image img={require('../../img/msft_sso_sign_in.png')} style={{ width: '800px', height: 'auto' }} />
### 3.3 Check the new team on LiteLLM UI
On the LiteLLM UI, Navigate to `Teams`, You should see the new team `Production LLM Evals Group` auto-created on LiteLLM.
<Image img={require('../../img/msft_auto_team.png')} style={{ width: '900px', height: 'auto' }} />

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 207 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 268 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 999 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 235 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 244 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 173 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 380 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 231 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 261 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 413 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 274 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 807 KiB

View file

@ -1,153 +0,0 @@
---
title: v1.67.0-stable - SCIM Integration
slug: v1.67.0-stable
date: 2025-04-19T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: ["sso", "unified_file_id", "cost_tracking", "security"]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
## Key Highlights
- **SCIM Integration**: Enables identity providers (Okta, Azure AD, OneLogin, etc.) to automate user and team (group) provisioning, updates, and deprovisioning
- **Team and Tag based usage tracking**: You can now see usage and spend by team and tag at 1M+ spend logs.
- **Unified Responses API**: Support for calling Anthropic, Gemini, Groq, etc. via OpenAI's new Responses API.
Let's dive in.
## SCIM Integration
<Image img={require('../../img/scim_integration.png')}/>
This release adds SCIM support to LiteLLM. This allows your SSO provider (Okta, Azure AD, etc) to automatically create/delete users, teams, and memberships on LiteLLM. This means that when you remove a team on your SSO provider, your SSO provider will automatically delete the corresponding team on LiteLLM.
[Read more](../../docs/tutorials/scim_litellm)
## Team and Tag based usage tracking
<Image img={require('../../img/release_notes/new_team_usage_highlight.jpg')}/>
This release improves team and tag based usage tracking at 1m+ spend logs, making it easy to monitor your LLM API Spend in production. This covers:
- View **daily spend** by teams + tags
- View **usage / spend by key**, within teams
- View **spend by multiple tags**
- Allow **internal users** to view spend of teams they're a member of
[Read more](#management-endpoints--ui)
## Unified Responses API
This release allows you to call Azure OpenAI, Anthropic, AWS Bedrock, and Google Vertex AI models via the POST /v1/responses endpoint on LiteLLM. This means you can now use popular tools like [OpenAI Codex](https://docs.litellm.ai/docs/tutorials/openai_codex) with your own models.
<Image img={require('../../img/release_notes/unified_responses_api_rn.png')}/>
[Read more](https://docs.litellm.ai/docs/response_api)
## New Models / Updated Models
- **OpenAI**
1. gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o3, o3-mini, o4-mini pricing - [Get Started](../../docs/providers/openai#usage), [PR](https://github.com/BerriAI/litellm/pull/9990)
2. o4 - correctly map o4 to openai o_series model
- **Azure AI**
1. Phi-4 output cost per token fix - [PR](https://github.com/BerriAI/litellm/pull/9880)
2. Responses API support [Get Started](../../docs/providers/azure#azure-responses-api),[PR](https://github.com/BerriAI/litellm/pull/10116)
- **Anthropic**
1. redacted message thinking support - [Get Started](../../docs/providers/anthropic#usage---thinking--reasoning_content),[PR](https://github.com/BerriAI/litellm/pull/10129)
- **Cohere**
1. `/v2/chat` Passthrough endpoint support w/ cost tracking - [Get Started](../../docs/pass_through/cohere), [PR](https://github.com/BerriAI/litellm/pull/9997)
- **Azure**
1. Support azure tenant_id/client_id env vars - [Get Started](../../docs/providers/azure#entra-id---use-tenant_id-client_id-client_secret), [PR](https://github.com/BerriAI/litellm/pull/9993)
2. Fix response_format check for 2025+ api versions - [PR](https://github.com/BerriAI/litellm/pull/9993)
3. Add gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o3, o3-mini, o4-mini pricing
- **VLLM**
1. Files - Support 'file' message type for VLLM video url's - [Get Started](../../docs/providers/vllm#send-video-url-to-vllm), [PR](https://github.com/BerriAI/litellm/pull/10129)
2. Passthrough - new `/vllm/` passthrough endpoint support [Get Started](../../docs/pass_through/vllm), [PR](https://github.com/BerriAI/litellm/pull/10002)
- **Mistral**
1. new `/mistral` passthrough endpoint support [Get Started](../../docs/pass_through/mistral), [PR](https://github.com/BerriAI/litellm/pull/10002)
- **AWS**
1. New mapped bedrock regions - [PR](https://github.com/BerriAI/litellm/pull/9430)
- **VertexAI / Google AI Studio**
1. Gemini - Response format - Retain schema field ordering for google gemini and vertex by specifying propertyOrdering - [Get Started](../../docs/providers/vertex#json-schema), [PR](https://github.com/BerriAI/litellm/pull/9828)
2. Gemini-2.5-flash - return reasoning content [Google AI Studio](../../docs/providers/gemini#usage---thinking--reasoning_content), [Vertex AI](../../docs/providers/vertex#thinking--reasoning_content)
3. Gemini-2.5-flash - pricing + model information [PR](https://github.com/BerriAI/litellm/pull/10125)
4. Passthrough - new `/vertex_ai/discovery` route - enables calling AgentBuilder API routes [Get Started](../../docs/pass_through/vertex_ai#supported-api-endpoints), [PR](https://github.com/BerriAI/litellm/pull/10084)
- **Fireworks AI**
1. return tool calling responses in `tool_calls` field (fireworks incorrectly returns this as a json str in content) [PR](https://github.com/BerriAI/litellm/pull/10130)
- **Triton**
1. Remove fixed remove bad_words / stop words from `/generate` call - [Get Started](../../docs/providers/triton-inference-server#triton-generate---chat-completion), [PR](https://github.com/BerriAI/litellm/pull/10163)
- **Other**
1. Support for all litellm providers on Responses API (works with Codex) - [Get Started](../../docs/tutorials/openai_codex), [PR](https://github.com/BerriAI/litellm/pull/10132)
2. Fix combining multiple tool calls in streaming response - [Get Started](../../docs/completion/stream#helper-function), [PR](https://github.com/BerriAI/litellm/pull/10040)
## Spend Tracking Improvements
- **Cost Control** - inject cache control points in prompt for cost reduction [Get Started](../../docs/tutorials/prompt_caching), [PR](https://github.com/BerriAI/litellm/pull/10000)
- **Spend Tags** - spend tags in headers - support x-litellm-tags even if tag based routing not enabled [Get Started](../../docs/proxy/request_headers#litellm-headers), [PR](https://github.com/BerriAI/litellm/pull/10000)
- **Gemini-2.5-flash** - support cost calculation for reasoning tokens [PR](https://github.com/BerriAI/litellm/pull/10141)
## Management Endpoints / UI
- **Users**
1. Show created_at and updated_at on users page - [PR](https://github.com/BerriAI/litellm/pull/10033)
- **Virtual Keys**
1. Filter by key alias - https://github.com/BerriAI/litellm/pull/10085
- **Usage Tab**
1. Team based usage
- New `LiteLLM_DailyTeamSpend` Table for aggregate team based usage logging - [PR](https://github.com/BerriAI/litellm/pull/10039)
- New Team based usage dashboard + new `/team/daily/activity` API - [PR](https://github.com/BerriAI/litellm/pull/10081)
- Return team alias on /team/daily/activity API - [PR](https://github.com/BerriAI/litellm/pull/10157)
- allow internal user view spend for teams they belong to - [PR](https://github.com/BerriAI/litellm/pull/10157)
- allow viewing top keys by team - [PR](https://github.com/BerriAI/litellm/pull/10157)
<Image img={require('../../img/release_notes/new_team_usage.png')}/>
2. Tag Based Usage
- New `LiteLLM_DailyTagSpend` Table for aggregate tag based usage logging - [PR](https://github.com/BerriAI/litellm/pull/10071)
- Restrict to only Proxy Admins - [PR](https://github.com/BerriAI/litellm/pull/10157)
- allow viewing top keys by tag
- Return tags passed in request (i.e. dynamic tags) on `/tag/list` API - [PR](https://github.com/BerriAI/litellm/pull/10157)
<Image img={require('../../img/release_notes/new_tag_usage.png')}/>
3. Track prompt caching metrics in daily user, team, tag tables - [PR](https://github.com/BerriAI/litellm/pull/10029)
4. Show usage by key (on all up, team, and tag usage dashboards) - [PR](https://github.com/BerriAI/litellm/pull/10157)
5. Swap the old usage tab with the new usage tab
- **Models**
1. Make columns resizable/hideable - [PR](https://github.com/BerriAI/litellm/pull/10119)
- **API Playground**
1. Allow internal user to call api playground - [PR](https://github.com/BerriAI/litellm/pull/10157)
- **SCIM**
1. Add LiteLLM SCIM Integration for Team and User management - [Get Started](../../docs/tutorials/scim_litellm), [PR](https://github.com/BerriAI/litellm/pull/10072)
## Logging / Guardrail Integrations
- **GCS**
1. Fix gcs pub sub logging with env var GCS_PROJECT_ID - [Get Started](../../docs/observability/gcs_bucket_integration#usage), [PR](https://github.com/BerriAI/litellm/pull/10042)
- **AIM**
1. Add litellm call id passing to Aim guardrails on pre and post-hooks calls - [Get Started](../../docs/proxy/guardrails/aim_security), [PR](https://github.com/BerriAI/litellm/pull/10021)
- **Azure blob storage**
1. Ensure logging works in high throughput scenarios - [Get Started](../../docs/proxy/logging#azure-blob-storage), [PR](https://github.com/BerriAI/litellm/pull/9962)
## General Proxy Improvements
- **Support setting `litellm.modify_params` via env var** [PR](https://github.com/BerriAI/litellm/pull/9964)
- **Model Discovery** - Check providers' `/models` endpoints when calling the proxy's `/v1/models` endpoint - [Get Started](../../docs/proxy/model_discovery), [PR](https://github.com/BerriAI/litellm/pull/9958)
- **`/utils/token_counter`** - fix retrieving custom tokenizer for db models - [Get Started](../../docs/proxy/configs#set-custom-tokenizer), [PR](https://github.com/BerriAI/litellm/pull/10047)
- **Prisma migrate** - handle existing columns in db table - [PR](https://github.com/BerriAI/litellm/pull/10138)

View file

@ -1,137 +0,0 @@
---
title: v1.67.4-stable
slug: v1.67.4-stable
date: 2025-04-26T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
tags: ["responses_api", "ui_improvements", "security", "session_management"]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
## Key Highlights
Let's dive in.
## New Models / Updated Models
- **OpenAI**
1. Added `gpt-image-1` cost tracking [Get Started](https://docs.litellm.ai/docs/image_generation)
2. Bug fix: added cost tracking for gpt-image-1 when quality is unspecified [PR](https://github.com/BerriAI/litellm/pull/10247)
- **Azure**
1. Fixed timestamp granularities passing to whisper in Azure [Get Started](https://docs.litellm.ai/docs/audio_transcription)
2. Added azure/gpt-image-1 pricing [Get Started](https://docs.litellm.ai/docs/image_generation), [PR](https://github.com/BerriAI/litellm/pull/10327)
3. Added cost tracking for `azure/computer-use-preview`, `azure/gpt-4o-audio-preview-2024-12-17`, `azure/gpt-4o-mini-audio-preview-2024-12-17` [PR](https://github.com/BerriAI/litellm/pull/10178)
- **Bedrock**
1. Added support for all compatible Bedrock parameters when model="arn:.." (Bedrock application inference profile models) [Get started](https://docs.litellm.ai/docs/providers/bedrock#bedrock-application-inference-profile), [PR](https://github.com/BerriAI/litellm/pull/10256)
2. Fixed wrong system prompt transformation [PR](https://github.com/BerriAI/litellm/pull/10120)
- **VertexAI / Google AI Studio**
1. Allow setting `budget_tokens=0` for `gemini-2.5-flash` [Get Started](https://docs.litellm.ai/docs/providers/gemini#usage---thinking--reasoning_content),[PR](https://github.com/BerriAI/litellm/pull/10198)
2. Ensure returned `usage` includes thinking token usage [PR](https://github.com/BerriAI/litellm/pull/10198)
3. Added cost tracking for `gemini-2.5-pro-preview-03-25` [PR](https://github.com/BerriAI/litellm/pull/10178)
- **Cohere**
1. Added support for cohere command-a-03-2025 [Get Started](https://docs.litellm.ai/docs/providers/cohere), [PR](https://github.com/BerriAI/litellm/pull/10295)
- **SageMaker**
1. Added support for max_completion_tokens parameter [Get Started](https://docs.litellm.ai/docs/providers/sagemaker), [PR](https://github.com/BerriAI/litellm/pull/10300)
- **Responses API**
1. Added support for GET and DELETE operations - `/v1/responses/{response_id}` [Get Started](../../docs/response_api)
2. Added session management support for non-OpenAI models [PR](https://github.com/BerriAI/litellm/pull/10321)
3. Added routing affinity to maintain model consistency within sessions [Get Started](https://docs.litellm.ai/docs/response_api#load-balancing-with-routing-affinity), [PR](https://github.com/BerriAI/litellm/pull/10193)
## Spend Tracking Improvements
- **Bug Fix**: Fixed spend tracking bug, ensuring default litellm params aren't modified in memory [PR](https://github.com/BerriAI/litellm/pull/10167)
- **Deprecation Dates**: Added deprecation dates for Azure, VertexAI models [PR](https://github.com/BerriAI/litellm/pull/10308)
## Management Endpoints / UI
#### Users
- **Filtering and Searching**:
- Filter users by user_id, role, team, sso_id
- Search users by email
<br/>
<Image img={require('../../img/release_notes/user_filters.png')}/>
- **User Info Panel**: Added a new user information pane [PR](https://github.com/BerriAI/litellm/pull/10213)
#### Teams
- **Filtering and Searching**:
- Filter teams by Organization, Team ID [PR](https://github.com/BerriAI/litellm/pull/10324)
- Search teams by Team Name [PR](https://github.com/BerriAI/litellm/pull/10324)
<br/>
<Image img={require('../../img/release_notes/team_filters.png')}/>
#### Keys
- **Key Management**:
- Support for cross-filtering and filtering by key hash [PR](https://github.com/BerriAI/litellm/pull/10322)
- Fixed key alias reset when resetting filters [PR](https://github.com/BerriAI/litellm/pull/10099)
- Fixed table rendering on key creation [PR](https://github.com/BerriAI/litellm/pull/10224)
#### UI Logs Page
- **Session Logs**: Added UI Session Logs [Get Started](https://docs.litellm.ai/docs/proxy/ui_logs_sessions)
#### UI Authentication & Security
- **Required Authentication**: Authentication now required for all dashboard pages [PR](https://github.com/BerriAI/litellm/pull/10229)
- **SSO Fixes**: Fixed SSO user login invalid token error [PR](https://github.com/BerriAI/litellm/pull/10298)
- **Encrypted Tokens**: Moved UI to encrypted token usage [PR](https://github.com/BerriAI/litellm/pull/10302)
- **Token Expiry**: Added token expiry logic to user dashboard [PR](https://github.com/BerriAI/litellm/pull/10250)
#### UI General fixes
- **Fixed UI Flicker**: Addressed UI flickering issues in Dashboard [PR](https://github.com/BerriAI/litellm/pull/10261)
- **Improved Terminology**: Better loading and no-data states on Keys and Tools pages [PR](https://github.com/BerriAI/litellm/pull/10253)
- **Azure Model Support**: Fixed editing Azure public model names and changing model names after creation [PR](https://github.com/BerriAI/litellm/pull/10249)
- **Team Model Selector**: Bug fix for team model selection [PR](https://github.com/BerriAI/litellm/pull/10171)
## Logging / Guardrail Integrations
- **Datadog**:
1. Fixed Datadog LLM observability logging [Get Started](https://docs.litellm.ai/docs/proxy/logging#datadog), [PR](https://github.com/BerriAI/litellm/pull/10206)
- **Prometheus / Grafana**:
1. Enable datasource selection on LiteLLM Grafana Template [Get Started](https://docs.litellm.ai/docs/proxy/prometheus#-litellm-maintained-grafana-dashboards-), [PR](https://github.com/BerriAI/litellm/pull/10257)
- **AgentOps**:
1. Added AgentOps Integration [Get Started](https://docs.litellm.ai/docs/observability/agentops_integration), [PR](https://github.com/BerriAI/litellm/pull/9685)
- **Arize**:
1. Added missing attributes for Arize & Phoenix Integration [Get Started](https://docs.litellm.ai/docs/observability/arize_integration), [PR](https://github.com/BerriAI/litellm/pull/10215)
## General Proxy Improvements
- **Caching**: Fixed caching to account for thinking or reasoning_effort config [PR](https://github.com/BerriAI/litellm/pull/10140)
- **Model Groups**: Fixed handling for cases where user sets model_group inside model_info [PR](https://github.com/BerriAI/litellm/pull/10191)
- **Passthrough Endpoints**: Ensured `PassthroughStandardLoggingPayload` is logged with method, URL, request/response body [PR](https://github.com/BerriAI/litellm/pull/10194)
- **Fix SQL Injection**: Fixed potential SQL injection vulnerability in spend_management_endpoints.py [PR](https://github.com/BerriAI/litellm/pull/9878)
## Helm
- Fixed serviceAccountName on migration job [PR](https://github.com/BerriAI/litellm/pull/10258)
## Full Changelog
The complete list of changes can be found in the [GitHub release notes](https://github.com/BerriAI/litellm/compare/v1.67.0-stable...v1.67.4-stable).

View file

@ -69,7 +69,6 @@ const sidebars = {
"proxy/clientside_auth", "proxy/clientside_auth",
"proxy/request_headers", "proxy/request_headers",
"proxy/response_headers", "proxy/response_headers",
"proxy/model_discovery",
], ],
}, },
{ {
@ -102,17 +101,9 @@ const sidebars = {
"proxy/admin_ui_sso", "proxy/admin_ui_sso",
"proxy/self_serve", "proxy/self_serve",
"proxy/public_teams", "proxy/public_teams",
"tutorials/scim_litellm",
"proxy/custom_sso", "proxy/custom_sso",
"proxy/ui_credentials", "proxy/ui_credentials",
{ "proxy/ui_logs"
type: "category",
label: "UI Logs",
items: [
"proxy/ui_logs",
"proxy/ui_logs_sessions"
]
}
], ],
}, },
{ {
@ -339,8 +330,6 @@ const sidebars = {
"pass_through/vertex_ai", "pass_through/vertex_ai",
"pass_through/google_ai_studio", "pass_through/google_ai_studio",
"pass_through/cohere", "pass_through/cohere",
"pass_through/vllm",
"pass_through/mistral",
"pass_through/openai_passthrough", "pass_through/openai_passthrough",
"pass_through/anthropic_completion", "pass_through/anthropic_completion",
"pass_through/bedrock", "pass_through/bedrock",
@ -418,7 +407,6 @@ const sidebars = {
type: "category", type: "category",
label: "Logging & Observability", label: "Logging & Observability",
items: [ items: [
"observability/agentops_integration",
"observability/langfuse_integration", "observability/langfuse_integration",
"observability/lunary_integration", "observability/lunary_integration",
"observability/mlflow", "observability/mlflow",
@ -455,7 +443,6 @@ const sidebars = {
label: "Tutorials", label: "Tutorials",
items: [ items: [
"tutorials/openweb_ui", "tutorials/openweb_ui",
"tutorials/openai_codex",
"tutorials/msft_sso", "tutorials/msft_sso",
"tutorials/prompt_caching", "tutorials/prompt_caching",
"tutorials/tag_management", "tutorials/tag_management",

View file

@ -1,136 +0,0 @@
from litellm.proxy._types import SpendLogsPayload
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_proxy_logger
from typing import Optional, List, Union
import json
from litellm.types.utils import ModelResponse, Message
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionResponseMessage,
GenericChatCompletionMessage,
ResponseInputParam,
)
from litellm.types.utils import ChatCompletionMessageToolCall
from litellm.responses.utils import ResponsesAPIRequestUtils
from typing import TypedDict
class ChatCompletionSession(TypedDict, total=False):
    """Chat-completion view of a Responses API session (all keys optional)."""

    # Full chat history (inputs and outputs) reconstructed from the session's spend logs.
    messages: List[Union[AllMessageValues, GenericChatCompletionMessage, ChatCompletionMessageToolCall, ChatCompletionResponseMessage, Message]]
    # Session id shared by the spend logs this history was rebuilt from; None if it
    # could not be determined (e.g. no matching spend logs were found).
    litellm_session_id: Optional[str]
class _ENTERPRISE_ResponsesSessionHandler:
    """Rebuilds chat-completion message history for Responses API sessions.

    Given a `previous_response_id`, looks up all spend logs that belong to the
    same session (via the `LiteLLM_SpendLogs` table) and reconstructs the full
    chat history — request inputs and model outputs — in chronological order.
    """

    @staticmethod
    async def get_chat_completion_message_history_for_previous_response_id(
        previous_response_id: str,
    ) -> ChatCompletionSession:
        """
        Return the chat completion message history for a previous response id
        """
        from litellm.responses.litellm_completion_transformation.transformation import (
            LiteLLMCompletionResponsesConfig,
        )

        spend_logs: List[SpendLogsPayload] = (
            await _ENTERPRISE_ResponsesSessionHandler.get_all_spend_logs_for_previous_response_id(
                previous_response_id
            )
        )

        # All logs in a session share one session_id; take it from the first row.
        session_id: Optional[str] = (
            spend_logs[0].get("session_id") if spend_logs else None
        )

        history: List[
            Union[
                AllMessageValues,
                GenericChatCompletionMessage,
                ChatCompletionMessageToolCall,
                ChatCompletionResponseMessage,
                Message,
            ]
        ] = []

        for log_entry in spend_logs:
            raw_request: Union[str, dict] = (
                log_entry.get("proxy_server_request") or "{}"
            )
            # proxy_server_request may be stored either as a JSON string or a dict.
            request_dict: Optional[dict] = (
                raw_request if isinstance(raw_request, dict) else json.loads(raw_request)
            )

            ############################################################
            # Add Input messages for this Spend Log
            ############################################################
            input_param: Optional[Union[str, ResponseInputParam]] = None
            if request_dict:
                raw_input = request_dict.get("input", None)
                if isinstance(raw_input, str):
                    input_param = raw_input
                elif isinstance(raw_input, dict):
                    input_param = ResponseInputParam(**raw_input)

            if input_param:
                history.extend(
                    LiteLLMCompletionResponsesConfig.transform_responses_api_input_to_messages(
                        input=input_param,
                        responses_api_request=request_dict or {},
                    )
                )

            ############################################################
            # Add Output messages for this Spend Log
            ############################################################
            raw_response = log_entry.get("response", "{}")
            if isinstance(raw_response, dict):
                # transform `ChatCompletion Response` to `ResponsesAPIResponse`
                parsed_response = ModelResponse(**raw_response)
                for choice in parsed_response.choices:
                    if hasattr(choice, "message"):
                        history.append(choice.message)

        verbose_proxy_logger.debug(
            "chat_completion_message_history %s",
            json.dumps(history, indent=4, default=str),
        )
        return ChatCompletionSession(
            messages=history,
            litellm_session_id=session_id,
        )

    @staticmethod
    async def get_all_spend_logs_for_previous_response_id(
        previous_response_id: str,
    ) -> List[SpendLogsPayload]:
        """
        Get all spend logs for a previous response id

        SQL query

        SELECT session_id FROM spend_logs WHERE response_id = previous_response_id, SELECT * FROM spend_logs WHERE session_id = session_id
        """
        from litellm.proxy.proxy_server import prisma_client

        # Response ids on the wire are encoded; decode to the raw request_id.
        decoded = ResponsesAPIRequestUtils._decode_responses_api_response_id(
            previous_response_id
        )
        previous_response_id = decoded.get("response_id", previous_response_id)

        if prisma_client is None:
            return []

        query = """
        WITH matching_session AS (
            SELECT session_id
            FROM "LiteLLM_SpendLogs"
            WHERE request_id = $1
        )
        SELECT *
        FROM "LiteLLM_SpendLogs"
        WHERE session_id IN (SELECT session_id FROM matching_session)
        ORDER BY "endTime" ASC;
        """
        rows = await prisma_client.db.query_raw(query, previous_response_id)

        verbose_proxy_logger.debug(
            "Found the following spend logs for previous response id %s: %s",
            previous_response_id,
            json.dumps(rows, indent=4, default=str),
        )
        return rows

View file

@ -1,4 +0,0 @@
-- AlterTable
ALTER TABLE "LiteLLM_SpendLogs" ADD COLUMN "proxy_server_request" JSONB DEFAULT '{}',
ADD COLUMN "session_id" TEXT;

View file

@ -226,8 +226,6 @@ model LiteLLM_SpendLogs {
requester_ip_address String? requester_ip_address String?
messages Json? @default("{}") messages Json? @default("{}")
response Json? @default("{}") response Json? @default("{}")
session_id String?
proxy_server_request Json? @default("{}")
@@index([startTime]) @@index([startTime])
@@index([end_user]) @@index([end_user])
} }

View file

@ -1,11 +1,8 @@
import glob import glob
import os import os
import random import random
import re
import shutil
import subprocess import subprocess
import time import time
from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
@ -21,30 +18,9 @@ def str_to_bool(value: Optional[str]) -> bool:
class ProxyExtrasDBManager: class ProxyExtrasDBManager:
@staticmethod @staticmethod
def _get_prisma_dir() -> str: def _get_prisma_dir() -> str:
""" """Get the path to the migrations directory"""
Get the path to the migrations directory migrations_dir = os.path.dirname(__file__)
return migrations_dir
Set os.environ["LITELLM_MIGRATION_DIR"] to a custom migrations directory, to support baselining db in read-only fs.
"""
custom_migrations_dir = os.getenv("LITELLM_MIGRATION_DIR")
pkg_migrations_dir = os.path.dirname(__file__)
if custom_migrations_dir:
# If migrations_dir exists, copy contents
if os.path.exists(custom_migrations_dir):
# Copy contents instead of directory itself
for item in os.listdir(pkg_migrations_dir):
src_path = os.path.join(pkg_migrations_dir, item)
dst_path = os.path.join(custom_migrations_dir, item)
if os.path.isdir(src_path):
shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
else:
shutil.copy2(src_path, dst_path)
else:
# If directory doesn't exist, create it and copy everything
shutil.copytree(pkg_migrations_dir, custom_migrations_dir)
return custom_migrations_dir
return pkg_migrations_dir
@staticmethod @staticmethod
def _create_baseline_migration(schema_path: str) -> bool: def _create_baseline_migration(schema_path: str) -> bool:
@ -56,29 +32,27 @@ class ProxyExtrasDBManager:
# Create migrations/0_init directory # Create migrations/0_init directory
init_dir.mkdir(parents=True, exist_ok=True) init_dir.mkdir(parents=True, exist_ok=True)
database_url = os.getenv("DATABASE_URL") # Generate migration SQL file
migration_file = init_dir / "migration.sql"
try: try:
# 1. Generate migration SQL file by comparing empty state to current db state # Generate migration diff with increased timeout
logger.info("Generating baseline migration...")
migration_file = init_dir / "migration.sql"
subprocess.run( subprocess.run(
[ [
"prisma", "prisma",
"migrate", "migrate",
"diff", "diff",
"--from-empty", "--from-empty",
"--to-url", "--to-schema-datamodel",
database_url, str(schema_path),
"--script", "--script",
], ],
stdout=open(migration_file, "w"), stdout=open(migration_file, "w"),
check=True, check=True,
timeout=30, timeout=30,
) ) # 30 second timeout
# 3. Mark the migration as applied since it represents current state # Mark migration as applied with increased timeout
logger.info("Marking baseline migration as applied...")
subprocess.run( subprocess.run(
[ [
"prisma", "prisma",
@ -98,10 +72,8 @@ class ProxyExtrasDBManager:
) )
return False return False
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
logger.warning( logger.warning(f"Error creating baseline migration: {e}")
f"Error creating baseline migration: {e}, {e.stderr}, {e.stdout}" return False
)
raise e
@staticmethod @staticmethod
def _get_migration_names(migrations_dir: str) -> list: def _get_migration_names(migrations_dir: str) -> list:
@ -111,105 +83,8 @@ class ProxyExtrasDBManager:
return [Path(p).parent.name for p in migration_paths] return [Path(p).parent.name for p in migration_paths]
@staticmethod @staticmethod
def _roll_back_migration(migration_name: str): def _resolve_all_migrations(migrations_dir: str):
"""Mark a specific migration as rolled back""" """Mark all existing migrations as applied"""
subprocess.run(
["prisma", "migrate", "resolve", "--rolled-back", migration_name],
timeout=60,
check=True,
capture_output=True,
)
@staticmethod
def _resolve_specific_migration(migration_name: str):
"""Mark a specific migration as applied"""
subprocess.run(
["prisma", "migrate", "resolve", "--applied", migration_name],
timeout=60,
check=True,
capture_output=True,
)
@staticmethod
def _resolve_all_migrations(migrations_dir: str, schema_path: str):
"""
1. Compare the current database state to schema.prisma and generate a migration for the diff.
2. Run prisma migrate deploy to apply any pending migrations.
3. Mark all existing migrations as applied.
"""
database_url = os.getenv("DATABASE_URL")
diff_dir = (
Path(migrations_dir)
/ "migrations"
/ f"{datetime.now().strftime('%Y%m%d%H%M%S')}_baseline_diff"
)
try:
diff_dir.mkdir(parents=True, exist_ok=True)
except Exception as e:
if "Permission denied" in str(e):
logger.warning(
f"Permission denied - {e}\nunable to baseline db. Set LITELLM_MIGRATION_DIR environment variable to a writable directory to enable migrations."
)
return
raise e
diff_sql_path = diff_dir / "migration.sql"
# 1. Generate migration SQL for the diff between DB and schema
try:
logger.info("Generating migration diff between DB and schema.prisma...")
with open(diff_sql_path, "w") as f:
subprocess.run(
[
"prisma",
"migrate",
"diff",
"--from-url",
database_url,
"--to-schema-datamodel",
schema_path,
"--script",
],
check=True,
timeout=60,
stdout=f,
)
except subprocess.CalledProcessError as e:
logger.warning(f"Failed to generate migration diff: {e.stderr}")
except subprocess.TimeoutExpired:
logger.warning("Migration diff generation timed out.")
# check if the migration was created
if not diff_sql_path.exists():
logger.warning("Migration diff was not created")
return
logger.info(f"Migration diff created at {diff_sql_path}")
# 2. Run prisma db execute to apply the migration
try:
logger.info("Running prisma db execute to apply the migration diff...")
result = subprocess.run(
[
"prisma",
"db",
"execute",
"--file",
str(diff_sql_path),
"--schema",
schema_path,
],
timeout=60,
check=True,
capture_output=True,
text=True,
)
logger.info(f"prisma db execute stdout: {result.stdout}")
logger.info("✅ Migration diff applied successfully")
except subprocess.CalledProcessError as e:
logger.warning(f"Failed to apply migration diff: {e.stderr}")
except subprocess.TimeoutExpired:
logger.warning("Migration diff application timed out.")
# 3. Mark all migrations as applied
migration_names = ProxyExtrasDBManager._get_migration_names(migrations_dir) migration_names = ProxyExtrasDBManager._get_migration_names(migrations_dir)
logger.info(f"Resolving {len(migration_names)} migrations") logger.info(f"Resolving {len(migration_names)} migrations")
for migration_name in migration_names: for migration_name in migration_names:
@ -230,7 +105,7 @@ class ProxyExtrasDBManager:
) )
@staticmethod @staticmethod
def setup_database(use_migrate: bool = False) -> bool: def setup_database(schema_path: str, use_migrate: bool = False) -> bool:
""" """
Set up the database using either prisma migrate or prisma db push Set up the database using either prisma migrate or prisma db push
Uses migrations from litellm-proxy-extras package Uses migrations from litellm-proxy-extras package
@ -242,7 +117,6 @@ class ProxyExtrasDBManager:
Returns: Returns:
bool: True if setup was successful, False otherwise bool: True if setup was successful, False otherwise
""" """
schema_path = ProxyExtrasDBManager._get_prisma_dir() + "/schema.prisma"
use_migrate = str_to_bool(os.getenv("USE_PRISMA_MIGRATE")) or use_migrate use_migrate = str_to_bool(os.getenv("USE_PRISMA_MIGRATE")) or use_migrate
for attempt in range(4): for attempt in range(4):
original_dir = os.getcwd() original_dir = os.getcwd()
@ -267,34 +141,7 @@ class ProxyExtrasDBManager:
return True return True
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
logger.info(f"prisma db error: {e.stderr}, e: {e.stdout}") logger.info(f"prisma db error: {e.stderr}, e: {e.stdout}")
if "P3009" in e.stderr: if (
# Extract the failed migration name from the error message
migration_match = re.search(
r"`(\d+_.*)` migration", e.stderr
)
if migration_match:
failed_migration = migration_match.group(1)
logger.info(
f"Found failed migration: {failed_migration}, marking as rolled back"
)
# Mark the failed migration as rolled back
subprocess.run(
[
"prisma",
"migrate",
"resolve",
"--rolled-back",
failed_migration,
],
timeout=60,
check=True,
capture_output=True,
text=True,
)
logger.info(
f"✅ Migration {failed_migration} marked as rolled back... retrying"
)
elif (
"P3005" in e.stderr "P3005" in e.stderr
and "database schema is not empty" in e.stderr and "database schema is not empty" in e.stderr
): ):
@ -305,34 +152,9 @@ class ProxyExtrasDBManager:
logger.info( logger.info(
"Baseline migration created, resolving all migrations" "Baseline migration created, resolving all migrations"
) )
ProxyExtrasDBManager._resolve_all_migrations( ProxyExtrasDBManager._resolve_all_migrations(migrations_dir)
migrations_dir, schema_path
)
logger.info("✅ All migrations resolved.") logger.info("✅ All migrations resolved.")
return True return True
elif (
"P3018" in e.stderr
): # PostgreSQL error code for duplicate column
logger.info(
"Migration already exists, resolving specific migration"
)
# Extract the migration name from the error message
migration_match = re.search(
r"Migration name: (\d+_.*)", e.stderr
)
if migration_match:
migration_name = migration_match.group(1)
logger.info(f"Rolling back migration {migration_name}")
ProxyExtrasDBManager._roll_back_migration(
migration_name
)
logger.info(
f"Resolving migration {migration_name} that failed due to existing columns"
)
ProxyExtrasDBManager._resolve_specific_migration(
migration_name
)
logger.info("✅ Migration resolved.")
else: else:
# Use prisma db push with increased timeout # Use prisma db push with increased timeout
subprocess.run( subprocess.run(

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "litellm-proxy-extras" name = "litellm-proxy-extras"
version = "0.1.12" version = "0.1.10"
description = "Additional files for the LiteLLM Proxy. Reduces the size of the main litellm package." description = "Additional files for the LiteLLM Proxy. Reduces the size of the main litellm package."
authors = ["BerriAI"] authors = ["BerriAI"]
readme = "README.md" readme = "README.md"
@ -22,7 +22,7 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"
[tool.commitizen] [tool.commitizen]
version = "0.1.12" version = "0.1.10"
version_files = [ version_files = [
"pyproject.toml:version", "pyproject.toml:version",
"../requirements.txt:litellm-proxy-extras==", "../requirements.txt:litellm-proxy-extras==",

View file

@ -113,7 +113,6 @@ _custom_logger_compatible_callbacks_literal = Literal[
"pagerduty", "pagerduty",
"humanloop", "humanloop",
"gcs_pubsub", "gcs_pubsub",
"agentops",
"anthropic_cache_control_hook", "anthropic_cache_control_hook",
] ]
logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None
@ -129,19 +128,19 @@ prometheus_initialize_budget_metrics: Optional[bool] = False
require_auth_for_metrics_endpoint: Optional[bool] = False require_auth_for_metrics_endpoint: Optional[bool] = False
argilla_batch_size: Optional[int] = None argilla_batch_size: Optional[int] = None
datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
gcs_pub_sub_use_v1: Optional[bool] = ( gcs_pub_sub_use_v1: Optional[
False # if you want to use v1 gcs pubsub logged payload bool
) ] = False # if you want to use v1 gcs pubsub logged payload
argilla_transformation_object: Optional[Dict[str, Any]] = None argilla_transformation_object: Optional[Dict[str, Any]] = None
_async_input_callback: List[Union[str, Callable, CustomLogger]] = ( _async_input_callback: List[
[] Union[str, Callable, CustomLogger]
) # internal variable - async custom callbacks are routed here. ] = [] # internal variable - async custom callbacks are routed here.
_async_success_callback: List[Union[str, Callable, CustomLogger]] = ( _async_success_callback: List[
[] Union[str, Callable, CustomLogger]
) # internal variable - async custom callbacks are routed here. ] = [] # internal variable - async custom callbacks are routed here.
_async_failure_callback: List[Union[str, Callable, CustomLogger]] = ( _async_failure_callback: List[
[] Union[str, Callable, CustomLogger]
) # internal variable - async custom callbacks are routed here. ] = [] # internal variable - async custom callbacks are routed here.
pre_call_rules: List[Callable] = [] pre_call_rules: List[Callable] = []
post_call_rules: List[Callable] = [] post_call_rules: List[Callable] = []
turn_off_message_logging: Optional[bool] = False turn_off_message_logging: Optional[bool] = False
@ -149,18 +148,18 @@ log_raw_request_response: bool = False
redact_messages_in_exceptions: Optional[bool] = False redact_messages_in_exceptions: Optional[bool] = False
redact_user_api_key_info: Optional[bool] = False redact_user_api_key_info: Optional[bool] = False
filter_invalid_headers: Optional[bool] = False filter_invalid_headers: Optional[bool] = False
add_user_information_to_llm_headers: Optional[bool] = ( add_user_information_to_llm_headers: Optional[
None # adds user_id, team_id, token hash (params from StandardLoggingMetadata) to request headers bool
) ] = None # adds user_id, team_id, token hash (params from StandardLoggingMetadata) to request headers
store_audit_logs = False # Enterprise feature, allow users to see audit logs store_audit_logs = False # Enterprise feature, allow users to see audit logs
### end of callbacks ############# ### end of callbacks #############
email: Optional[str] = ( email: Optional[
None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648 str
) ] = None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
token: Optional[str] = ( token: Optional[
None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648 str
) ] = None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
telemetry = True telemetry = True
max_tokens: int = DEFAULT_MAX_TOKENS # OpenAI Defaults max_tokens: int = DEFAULT_MAX_TOKENS # OpenAI Defaults
drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False)) drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
@ -236,24 +235,20 @@ enable_loadbalancing_on_batch_endpoints: Optional[bool] = None
enable_caching_on_provider_specific_optional_params: bool = ( enable_caching_on_provider_specific_optional_params: bool = (
False # feature-flag for caching on optional params - e.g. 'top_k' False # feature-flag for caching on optional params - e.g. 'top_k'
) )
caching: bool = ( caching: bool = False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648 caching_with_models: bool = False # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
) cache: Optional[
caching_with_models: bool = ( Cache
False # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648 ] = None # cache object <- use this - https://docs.litellm.ai/docs/caching
)
cache: Optional[Cache] = (
None # cache object <- use this - https://docs.litellm.ai/docs/caching
)
default_in_memory_ttl: Optional[float] = None default_in_memory_ttl: Optional[float] = None
default_redis_ttl: Optional[float] = None default_redis_ttl: Optional[float] = None
default_redis_batch_cache_expiry: Optional[float] = None default_redis_batch_cache_expiry: Optional[float] = None
model_alias_map: Dict[str, str] = {} model_alias_map: Dict[str, str] = {}
model_group_alias_map: Dict[str, str] = {} model_group_alias_map: Dict[str, str] = {}
max_budget: float = 0.0 # set the max budget across all providers max_budget: float = 0.0 # set the max budget across all providers
budget_duration: Optional[str] = ( budget_duration: Optional[
None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d"). str
) ] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
default_soft_budget: float = ( default_soft_budget: float = (
DEFAULT_SOFT_BUDGET # by default all litellm proxy keys have a soft budget of 50.0 DEFAULT_SOFT_BUDGET # by default all litellm proxy keys have a soft budget of 50.0
) )
@ -262,15 +257,11 @@ forward_traceparent_to_llm_provider: bool = False
_current_cost = 0.0 # private variable, used if max budget is set _current_cost = 0.0 # private variable, used if max budget is set
error_logs: Dict = {} error_logs: Dict = {}
add_function_to_prompt: bool = ( add_function_to_prompt: bool = False # if function calling not supported by api, append function call details to system prompt
False # if function calling not supported by api, append function call details to system prompt
)
client_session: Optional[httpx.Client] = None client_session: Optional[httpx.Client] = None
aclient_session: Optional[httpx.AsyncClient] = None aclient_session: Optional[httpx.AsyncClient] = None
model_fallbacks: Optional[List] = None # Deprecated for 'litellm.fallbacks' model_fallbacks: Optional[List] = None # Deprecated for 'litellm.fallbacks'
model_cost_map_url: str = ( model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
"https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
)
suppress_debug_info = False suppress_debug_info = False
dynamodb_table_name: Optional[str] = None dynamodb_table_name: Optional[str] = None
s3_callback_params: Optional[Dict] = None s3_callback_params: Optional[Dict] = None
@ -293,9 +284,7 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = [] custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION #### #### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None priority_reservation: Optional[Dict[str, float]] = None
force_ipv4: bool = ( force_ipv4: bool = False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
)
module_level_aclient = AsyncHTTPHandler( module_level_aclient = AsyncHTTPHandler(
timeout=request_timeout, client_alias="module level aclient" timeout=request_timeout, client_alias="module level aclient"
) )
@ -309,13 +298,13 @@ fallbacks: Optional[List] = None
context_window_fallbacks: Optional[List] = None context_window_fallbacks: Optional[List] = None
content_policy_fallbacks: Optional[List] = None content_policy_fallbacks: Optional[List] = None
allowed_fails: int = 3 allowed_fails: int = 3
num_retries_per_request: Optional[int] = ( num_retries_per_request: Optional[
None # for the request overall (incl. fallbacks + model retries) int
) ] = None # for the request overall (incl. fallbacks + model retries)
####### SECRET MANAGERS ##################### ####### SECRET MANAGERS #####################
secret_manager_client: Optional[Any] = ( secret_manager_client: Optional[
None # list of instantiated key management clients - e.g. azure kv, infisical, etc. Any
) ] = None # list of instantiated key management clients - e.g. azure kv, infisical, etc.
_google_kms_resource_name: Optional[str] = None _google_kms_resource_name: Optional[str] = None
_key_management_system: Optional[KeyManagementSystem] = None _key_management_system: Optional[KeyManagementSystem] = None
_key_management_settings: KeyManagementSettings = KeyManagementSettings() _key_management_settings: KeyManagementSettings = KeyManagementSettings()
@ -416,7 +405,6 @@ deepseek_models: List = []
azure_ai_models: List = [] azure_ai_models: List = []
jina_ai_models: List = [] jina_ai_models: List = []
voyage_models: List = [] voyage_models: List = []
infinity_models: List = []
databricks_models: List = [] databricks_models: List = []
cloudflare_models: List = [] cloudflare_models: List = []
codestral_models: List = [] codestral_models: List = []
@ -558,8 +546,6 @@ def add_known_models():
azure_ai_models.append(key) azure_ai_models.append(key)
elif value.get("litellm_provider") == "voyage": elif value.get("litellm_provider") == "voyage":
voyage_models.append(key) voyage_models.append(key)
elif value.get("litellm_provider") == "infinity":
infinity_models.append(key)
elif value.get("litellm_provider") == "databricks": elif value.get("litellm_provider") == "databricks":
databricks_models.append(key) databricks_models.append(key)
elif value.get("litellm_provider") == "cloudflare": elif value.get("litellm_provider") == "cloudflare":
@ -648,7 +634,6 @@ model_list = (
+ deepseek_models + deepseek_models
+ azure_ai_models + azure_ai_models
+ voyage_models + voyage_models
+ infinity_models
+ databricks_models + databricks_models
+ cloudflare_models + cloudflare_models
+ codestral_models + codestral_models
@ -704,7 +689,6 @@ models_by_provider: dict = {
"mistral": mistral_chat_models, "mistral": mistral_chat_models,
"azure_ai": azure_ai_models, "azure_ai": azure_ai_models,
"voyage": voyage_models, "voyage": voyage_models,
"infinity": infinity_models,
"databricks": databricks_models, "databricks": databricks_models,
"cloudflare": cloudflare_models, "cloudflare": cloudflare_models,
"codestral": codestral_models, "codestral": codestral_models,
@ -952,11 +936,9 @@ from .llms.topaz.image_variations.transformation import TopazImageVariationConfi
from litellm.llms.openai.completion.transformation import OpenAITextCompletionConfig from litellm.llms.openai.completion.transformation import OpenAITextCompletionConfig
from .llms.groq.chat.transformation import GroqChatConfig from .llms.groq.chat.transformation import GroqChatConfig
from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig
from .llms.infinity.embedding.transformation import InfinityEmbeddingConfig
from .llms.azure_ai.chat.transformation import AzureAIStudioConfig from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
from .llms.mistral.mistral_chat_transformation import MistralConfig from .llms.mistral.mistral_chat_transformation import MistralConfig
from .llms.openai.responses.transformation import OpenAIResponsesAPIConfig from .llms.openai.responses.transformation import OpenAIResponsesAPIConfig
from .llms.azure.responses.transformation import AzureOpenAIResponsesAPIConfig
from .llms.openai.chat.o_series_transformation import ( from .llms.openai.chat.o_series_transformation import (
OpenAIOSeriesConfig as OpenAIO1Config, # maintain backwards compatibility OpenAIOSeriesConfig as OpenAIO1Config, # maintain backwards compatibility
OpenAIOSeriesConfig, OpenAIOSeriesConfig,
@ -1073,10 +1055,10 @@ from .types.llms.custom_llm import CustomLLMItem
from .types.utils import GenericStreamingChunk from .types.utils import GenericStreamingChunk
custom_provider_map: List[CustomLLMItem] = [] custom_provider_map: List[CustomLLMItem] = []
_custom_providers: List[str] = ( _custom_providers: List[
[] str
) # internal helper util, used to track names of custom providers ] = [] # internal helper util, used to track names of custom providers
disable_hf_tokenizer_download: Optional[bool] = ( disable_hf_tokenizer_download: Optional[
None # disable huggingface tokenizer download. Defaults to openai clk100 bool
) ] = None # disable huggingface tokenizer download. Defaults to openai clk100
global_disable_no_log_param: bool = False global_disable_no_log_param: bool = False

View file

@ -304,11 +304,6 @@ def create_assistants(
"response_format": response_format, "response_format": response_format,
} }
# only send params that are not None
create_assistant_data = {
k: v for k, v in create_assistant_data.items() if v is not None
}
response: Optional[Union[Coroutine[Any, Any, Assistant], Assistant]] = None response: Optional[Union[Coroutine[Any, Any, Assistant], Assistant]] = None
if custom_llm_provider == "openai": if custom_llm_provider == "openai":
api_base = ( api_base = (

View file

@ -21,10 +21,6 @@ DEFAULT_MAX_TOKENS = 256 # used when providers need a default
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic. SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET = 1024
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET = 2048
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET = 4096
########## Networking constants ############################################################## ########## Networking constants ##############################################################
_DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600 # 1 hour, re-use the same httpx client for 1 hour _DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600 # 1 hour, re-use the same httpx client for 1 hour

View file

@ -57,7 +57,6 @@ from litellm.llms.vertex_ai.image_generation.cost_calculator import (
from litellm.responses.utils import ResponseAPILoggingUtils from litellm.responses.utils import ResponseAPILoggingUtils
from litellm.types.llms.openai import ( from litellm.types.llms.openai import (
HttpxBinaryResponseContent, HttpxBinaryResponseContent,
ImageGenerationRequestQuality,
OpenAIRealtimeStreamList, OpenAIRealtimeStreamList,
OpenAIRealtimeStreamResponseBaseObject, OpenAIRealtimeStreamResponseBaseObject,
OpenAIRealtimeStreamSessionEvents, OpenAIRealtimeStreamSessionEvents,
@ -643,9 +642,9 @@ def completion_cost( # noqa: PLR0915
or isinstance(completion_response, dict) or isinstance(completion_response, dict)
): # tts returns a custom class ): # tts returns a custom class
if isinstance(completion_response, dict): if isinstance(completion_response, dict):
usage_obj: Optional[Union[dict, Usage]] = ( usage_obj: Optional[
completion_response.get("usage", {}) Union[dict, Usage]
) ] = completion_response.get("usage", {})
else: else:
usage_obj = getattr(completion_response, "usage", {}) usage_obj = getattr(completion_response, "usage", {})
if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects( if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects(
@ -914,7 +913,7 @@ def completion_cost( # noqa: PLR0915
def get_response_cost_from_hidden_params( def get_response_cost_from_hidden_params(
hidden_params: Union[dict, BaseModel], hidden_params: Union[dict, BaseModel]
) -> Optional[float]: ) -> Optional[float]:
if isinstance(hidden_params, BaseModel): if isinstance(hidden_params, BaseModel):
_hidden_params_dict = hidden_params.model_dump() _hidden_params_dict = hidden_params.model_dump()
@ -1102,36 +1101,29 @@ def default_image_cost_calculator(
f"{quality}/{base_model_name}" if quality else base_model_name f"{quality}/{base_model_name}" if quality else base_model_name
) )
# gpt-image-1 models use low, medium, high quality. If user did not specify quality, use medium fot gpt-image-1 model family
model_name_with_v2_quality = (
f"{ImageGenerationRequestQuality.MEDIUM.value}/{base_model_name}"
)
verbose_logger.debug( verbose_logger.debug(
f"Looking up cost for models: {model_name_with_quality}, {base_model_name}" f"Looking up cost for models: {model_name_with_quality}, {base_model_name}"
) )
# Try model with quality first, fall back to base model name
if model_name_with_quality in litellm.model_cost:
cost_info = litellm.model_cost[model_name_with_quality]
elif base_model_name in litellm.model_cost:
cost_info = litellm.model_cost[base_model_name]
else:
# Try without provider prefix
model_without_provider = f"{size_str}/{model.split('/')[-1]}" model_without_provider = f"{size_str}/{model.split('/')[-1]}"
model_with_quality_without_provider = ( model_with_quality_without_provider = (
f"{quality}/{model_without_provider}" if quality else model_without_provider f"{quality}/{model_without_provider}" if quality else model_without_provider
) )
# Try model with quality first, fall back to base model name if model_with_quality_without_provider in litellm.model_cost:
cost_info: Optional[dict] = None cost_info = litellm.model_cost[model_with_quality_without_provider]
models_to_check = [ elif model_without_provider in litellm.model_cost:
model_name_with_quality, cost_info = litellm.model_cost[model_without_provider]
base_model_name, else:
model_name_with_v2_quality,
model_with_quality_without_provider,
model_without_provider,
]
for model in models_to_check:
if model in litellm.model_cost:
cost_info = litellm.model_cost[model]
break
if cost_info is None:
raise Exception( raise Exception(
f"Model not found in cost map. Tried checking {models_to_check}" f"Model not found in cost map. Tried {model_name_with_quality}, {base_model_name}, {model_with_quality_without_provider}, and {model_without_provider}"
) )
return cost_info["input_cost_per_pixel"] * height * width * n return cost_info["input_cost_per_pixel"] * height * width * n

View file

@ -45,14 +45,6 @@ class SpanAttributes:
""" """
The name of the model being used. The name of the model being used.
""" """
LLM_PROVIDER = "llm.provider"
"""
The provider of the model, such as OpenAI, Azure, Google, etc.
"""
LLM_SYSTEM = "llm.system"
"""
The AI product as identified by the client or server
"""
LLM_PROMPTS = "llm.prompts" LLM_PROMPTS = "llm.prompts"
""" """
Prompts provided to a completions API. Prompts provided to a completions API.
@ -73,40 +65,15 @@ class SpanAttributes:
""" """
Number of tokens in the prompt. Number of tokens in the prompt.
""" """
LLM_TOKEN_COUNT_PROMPT_DETAILS_CACHE_WRITE = "llm.token_count.prompt_details.cache_write"
"""
Number of tokens in the prompt that were written to cache.
"""
LLM_TOKEN_COUNT_PROMPT_DETAILS_CACHE_READ = "llm.token_count.prompt_details.cache_read"
"""
Number of tokens in the prompt that were read from cache.
"""
LLM_TOKEN_COUNT_PROMPT_DETAILS_AUDIO = "llm.token_count.prompt_details.audio"
"""
The number of audio input tokens presented in the prompt
"""
LLM_TOKEN_COUNT_COMPLETION = "llm.token_count.completion" LLM_TOKEN_COUNT_COMPLETION = "llm.token_count.completion"
""" """
Number of tokens in the completion. Number of tokens in the completion.
""" """
LLM_TOKEN_COUNT_COMPLETION_DETAILS_REASONING = "llm.token_count.completion_details.reasoning"
"""
Number of tokens used for reasoning steps in the completion.
"""
LLM_TOKEN_COUNT_COMPLETION_DETAILS_AUDIO = "llm.token_count.completion_details.audio"
"""
The number of audio input tokens generated by the model
"""
LLM_TOKEN_COUNT_TOTAL = "llm.token_count.total" LLM_TOKEN_COUNT_TOTAL = "llm.token_count.total"
""" """
Total number of tokens, including both prompt and completion. Total number of tokens, including both prompt and completion.
""" """
LLM_TOOLS = "llm.tools"
"""
List of tools that are advertised to the LLM to be able to call
"""
TOOL_NAME = "tool.name" TOOL_NAME = "tool.name"
""" """
Name of the tool being used. Name of the tool being used.
@ -145,19 +112,6 @@ class SpanAttributes:
The id of the user The id of the user
""" """
PROMPT_VENDOR = "prompt.vendor"
"""
The vendor or origin of the prompt, e.g. a prompt library, a specialized service, etc.
"""
PROMPT_ID = "prompt.id"
"""
A vendor-specific id used to locate the prompt.
"""
PROMPT_URL = "prompt.url"
"""
A vendor-specific url used to locate the prompt.
"""
class MessageAttributes: class MessageAttributes:
""" """
@ -197,10 +151,6 @@ class MessageAttributes:
The JSON string representing the arguments passed to the function The JSON string representing the arguments passed to the function
during a function call. during a function call.
""" """
MESSAGE_TOOL_CALL_ID = "message.tool_call_id"
"""
The id of the tool call.
"""
class MessageContentAttributes: class MessageContentAttributes:
@ -236,25 +186,6 @@ class ImageAttributes:
""" """
class AudioAttributes:
"""
Attributes for audio
"""
AUDIO_URL = "audio.url"
"""
The url to an audio file
"""
AUDIO_MIME_TYPE = "audio.mime_type"
"""
The mime type of the audio file
"""
AUDIO_TRANSCRIPT = "audio.transcript"
"""
The transcript of the audio file
"""
class DocumentAttributes: class DocumentAttributes:
""" """
Attributes for a document. Attributes for a document.
@ -326,10 +257,6 @@ class ToolCallAttributes:
Attributes for a tool call Attributes for a tool call
""" """
TOOL_CALL_ID = "tool_call.id"
"""
The id of the tool call.
"""
TOOL_CALL_FUNCTION_NAME = "tool_call.function.name" TOOL_CALL_FUNCTION_NAME = "tool_call.function.name"
""" """
The name of function that is being called during a tool call. The name of function that is being called during a tool call.
@ -341,18 +268,6 @@ class ToolCallAttributes:
""" """
class ToolAttributes:
"""
Attributes for a tools
"""
TOOL_JSON_SCHEMA = "tool.json_schema"
"""
The json schema of a tool input, It is RECOMMENDED that this be in the
OpenAI tool calling format: https://platform.openai.com/docs/assistants/tools
"""
class OpenInferenceSpanKindValues(Enum): class OpenInferenceSpanKindValues(Enum):
TOOL = "TOOL" TOOL = "TOOL"
CHAIN = "CHAIN" CHAIN = "CHAIN"
@ -369,21 +284,3 @@ class OpenInferenceSpanKindValues(Enum):
class OpenInferenceMimeTypeValues(Enum): class OpenInferenceMimeTypeValues(Enum):
TEXT = "text/plain" TEXT = "text/plain"
JSON = "application/json" JSON = "application/json"
class OpenInferenceLLMSystemValues(Enum):
OPENAI = "openai"
ANTHROPIC = "anthropic"
COHERE = "cohere"
MISTRALAI = "mistralai"
VERTEXAI = "vertexai"
class OpenInferenceLLMProviderValues(Enum):
OPENAI = "openai"
ANTHROPIC = "anthropic"
COHERE = "cohere"
MISTRALAI = "mistralai"
GOOGLE = "google"
AZURE = "azure"
AWS = "aws"

View file

@ -1,3 +0,0 @@
from .agentops import AgentOps
__all__ = ["AgentOps"]

View file

@ -1,118 +0,0 @@
"""
AgentOps integration for LiteLLM - Provides OpenTelemetry tracing for LLM calls
"""
import os
from dataclasses import dataclass
from typing import Optional, Dict, Any
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
@dataclass
class AgentOpsConfig:
endpoint: str = "https://otlp.agentops.cloud/v1/traces"
api_key: Optional[str] = None
service_name: Optional[str] = None
deployment_environment: Optional[str] = None
auth_endpoint: str = "https://api.agentops.ai/v3/auth/token"
@classmethod
def from_env(cls):
return cls(
endpoint="https://otlp.agentops.cloud/v1/traces",
api_key=os.getenv("AGENTOPS_API_KEY"),
service_name=os.getenv("AGENTOPS_SERVICE_NAME", "agentops"),
deployment_environment=os.getenv("AGENTOPS_ENVIRONMENT", "production"),
auth_endpoint="https://api.agentops.ai/v3/auth/token"
)
class AgentOps(OpenTelemetry):
"""
AgentOps integration - built on top of OpenTelemetry
Example usage:
```python
import litellm
litellm.success_callback = ["agentops"]
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello, how are you?"}],
)
```
"""
def __init__(
self,
config: Optional[AgentOpsConfig] = None,
):
if config is None:
config = AgentOpsConfig.from_env()
# Prefetch JWT token for authentication
jwt_token = None
project_id = None
if config.api_key:
try:
response = self._fetch_auth_token(config.api_key, config.auth_endpoint)
jwt_token = response.get("token")
project_id = response.get("project_id")
except Exception:
pass
headers = f"Authorization=Bearer {jwt_token}" if jwt_token else None
otel_config = OpenTelemetryConfig(
exporter="otlp_http",
endpoint=config.endpoint,
headers=headers
)
# Initialize OpenTelemetry with our config
super().__init__(
config=otel_config,
callback_name="agentops"
)
# Set AgentOps-specific resource attributes
resource_attrs = {
"service.name": config.service_name or "litellm",
"deployment.environment": config.deployment_environment or "production",
"telemetry.sdk.name": "agentops",
}
if project_id:
resource_attrs["project.id"] = project_id
self.resource_attributes = resource_attrs
def _fetch_auth_token(self, api_key: str, auth_endpoint: str) -> Dict[str, Any]:
"""
Fetch JWT authentication token from AgentOps API
Args:
api_key: AgentOps API key
auth_endpoint: Authentication endpoint
Returns:
Dict containing JWT token and project ID
"""
headers = {
"Content-Type": "application/json",
"Connection": "keep-alive",
}
client = _get_httpx_client()
try:
response = client.post(
url=auth_endpoint,
headers=headers,
json={"api_key": api_key},
timeout=10
)
if response.status_code != 200:
raise Exception(f"Failed to fetch auth token: {response.text}")
return response.json()
finally:
client.close()

View file

@ -1,4 +1,3 @@
import json
from typing import TYPE_CHECKING, Any, Optional, Union from typing import TYPE_CHECKING, Any, Optional, Union
from litellm._logging import verbose_logger from litellm._logging import verbose_logger
@ -13,141 +12,36 @@ else:
Span = Any Span = Any
def cast_as_primitive_value_type(value) -> Union[str, bool, int, float]: def set_attributes(span: Span, kwargs, response_obj):
"""
Converts a value to an OTEL-supported primitive for Arize/Phoenix observability.
"""
if value is None:
return ""
if isinstance(value, (str, bool, int, float)):
return value
try:
return str(value)
except Exception:
return ""
def safe_set_attribute(span: Span, key: str, value: Any):
"""
Sets a span attribute safely with OTEL-compliant primitive typing for Arize/Phoenix.
"""
primitive_value = cast_as_primitive_value_type(value)
span.set_attribute(key, primitive_value)
def set_attributes(span: Span, kwargs, response_obj): # noqa: PLR0915
"""
Populates span with OpenInference-compliant LLM attributes for Arize and Phoenix tracing.
"""
from litellm.integrations._types.open_inference import ( from litellm.integrations._types.open_inference import (
MessageAttributes, MessageAttributes,
OpenInferenceSpanKindValues, OpenInferenceSpanKindValues,
SpanAttributes, SpanAttributes,
ToolCallAttributes,
) )
try: try:
optional_params = kwargs.get("optional_params", {})
litellm_params = kwargs.get("litellm_params", {})
standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get( standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object" "standard_logging_object"
) )
if standard_logging_payload is None:
raise ValueError("standard_logging_object not found in kwargs")
############################################# #############################################
############ LLM CALL METADATA ############## ############ LLM CALL METADATA ##############
############################################# #############################################
# Set custom metadata for observability and trace enrichment. if standard_logging_payload and (
metadata = ( metadata := standard_logging_payload["metadata"]
standard_logging_payload.get("metadata") ):
if standard_logging_payload span.set_attribute(SpanAttributes.METADATA, safe_dumps(metadata))
else None
)
if metadata is not None:
safe_set_attribute(span, SpanAttributes.METADATA, safe_dumps(metadata))
############################################# #############################################
########## LLM Request Attributes ########### ########## LLM Request Attributes ###########
############################################# #############################################
# The name of the LLM a request is being made to. # The name of the LLM a request is being made to
if kwargs.get("model"): if kwargs.get("model"):
safe_set_attribute( span.set_attribute(SpanAttributes.LLM_MODEL_NAME, kwargs.get("model"))
span,
SpanAttributes.LLM_MODEL_NAME,
kwargs.get("model"),
)
# The LLM request type. span.set_attribute(
safe_set_attribute(
span,
"llm.request.type",
standard_logging_payload["call_type"],
)
# The Generative AI Provider: Azure, OpenAI, etc.
safe_set_attribute(
span,
SpanAttributes.LLM_PROVIDER,
litellm_params.get("custom_llm_provider", "Unknown"),
)
# The maximum number of tokens the LLM generates for a request.
if optional_params.get("max_tokens"):
safe_set_attribute(
span,
"llm.request.max_tokens",
optional_params.get("max_tokens"),
)
# The temperature setting for the LLM request.
if optional_params.get("temperature"):
safe_set_attribute(
span,
"llm.request.temperature",
optional_params.get("temperature"),
)
# The top_p sampling setting for the LLM request.
if optional_params.get("top_p"):
safe_set_attribute(
span,
"llm.request.top_p",
optional_params.get("top_p"),
)
# Indicates whether response is streamed.
safe_set_attribute(
span,
"llm.is_streaming",
str(optional_params.get("stream", False)),
)
# Logs the user ID if present.
if optional_params.get("user"):
safe_set_attribute(
span,
"llm.user",
optional_params.get("user"),
)
# The unique identifier for the completion.
if response_obj and response_obj.get("id"):
safe_set_attribute(span, "llm.response.id", response_obj.get("id"))
# The model used to generate the response.
if response_obj and response_obj.get("model"):
safe_set_attribute(
span,
"llm.response.model",
response_obj.get("model"),
)
# Required by OpenInference to mark span as LLM kind.
safe_set_attribute(
span,
SpanAttributes.OPENINFERENCE_SPAN_KIND, SpanAttributes.OPENINFERENCE_SPAN_KIND,
OpenInferenceSpanKindValues.LLM.value, OpenInferenceSpanKindValues.LLM.value,
) )
@ -156,132 +50,77 @@ def set_attributes(span: Span, kwargs, response_obj): # noqa: PLR0915
# for /chat/completions # for /chat/completions
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions # https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
if messages: if messages:
last_message = messages[-1] span.set_attribute(
safe_set_attribute(
span,
SpanAttributes.INPUT_VALUE, SpanAttributes.INPUT_VALUE,
last_message.get("content", ""), messages[-1].get("content", ""), # get the last message for input
) )
# LLM_INPUT_MESSAGES shows up under `input_messages` tab on the span page. # LLM_INPUT_MESSAGES shows up under `input_messages` tab on the span page
for idx, msg in enumerate(messages): for idx, msg in enumerate(messages):
prefix = f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}" # Set the role per message
# Set the role per message. span.set_attribute(
safe_set_attribute( f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_ROLE}",
span, f"{prefix}.{MessageAttributes.MESSAGE_ROLE}", msg.get("role") msg["role"],
) )
# Set the content per message. # Set the content per message
safe_set_attribute( span.set_attribute(
span, f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_CONTENT}",
f"{prefix}.{MessageAttributes.MESSAGE_CONTENT}",
msg.get("content", ""), msg.get("content", ""),
) )
# Capture tools (function definitions) used in the LLM call. if standard_logging_payload and (
tools = optional_params.get("tools") model_params := standard_logging_payload["model_parameters"]
if tools: ):
for idx, tool in enumerate(tools):
function = tool.get("function")
if not function:
continue
prefix = f"{SpanAttributes.LLM_TOOLS}.{idx}"
safe_set_attribute(
span, f"{prefix}.{SpanAttributes.TOOL_NAME}", function.get("name")
)
safe_set_attribute(
span,
f"{prefix}.{SpanAttributes.TOOL_DESCRIPTION}",
function.get("description"),
)
safe_set_attribute(
span,
f"{prefix}.{SpanAttributes.TOOL_PARAMETERS}",
json.dumps(function.get("parameters")),
)
# Capture tool calls made during function-calling LLM flows.
functions = optional_params.get("functions")
if functions:
for idx, function in enumerate(functions):
prefix = f"{MessageAttributes.MESSAGE_TOOL_CALLS}.{idx}"
safe_set_attribute(
span,
f"{prefix}.{ToolCallAttributes.TOOL_CALL_FUNCTION_NAME}",
function.get("name"),
)
# Capture invocation parameters and user ID if available.
model_params = (
standard_logging_payload.get("model_parameters")
if standard_logging_payload
else None
)
if model_params:
# The Generative AI Provider: Azure, OpenAI, etc. # The Generative AI Provider: Azure, OpenAI, etc.
safe_set_attribute( span.set_attribute(
span, SpanAttributes.LLM_INVOCATION_PARAMETERS, safe_dumps(model_params)
SpanAttributes.LLM_INVOCATION_PARAMETERS,
safe_dumps(model_params),
) )
if model_params.get("user"): if model_params.get("user"):
user_id = model_params.get("user") user_id = model_params.get("user")
if user_id is not None: if user_id is not None:
safe_set_attribute(span, SpanAttributes.USER_ID, user_id) span.set_attribute(SpanAttributes.USER_ID, user_id)
############################################# #############################################
########## LLM Response Attributes ########## ########## LLM Response Attributes ##########
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
############################################# #############################################
# Captures response tokens, message, and content.
if hasattr(response_obj, "get"): if hasattr(response_obj, "get"):
for idx, choice in enumerate(response_obj.get("choices", [])): for choice in response_obj.get("choices", []):
response_message = choice.get("message", {}) response_message = choice.get("message", {})
safe_set_attribute( span.set_attribute(
span, SpanAttributes.OUTPUT_VALUE, response_message.get("content", "")
SpanAttributes.OUTPUT_VALUE,
response_message.get("content", ""),
) )
# This shows up under `output_messages` tab on the span page. # This shows up under `output_messages` tab on the span page
prefix = f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.{idx}" # This code assumes a single response
safe_set_attribute( span.set_attribute(
span, f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_ROLE}",
f"{prefix}.{MessageAttributes.MESSAGE_ROLE}",
response_message.get("role"), response_message.get("role"),
) )
safe_set_attribute( span.set_attribute(
span, f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_CONTENT}",
f"{prefix}.{MessageAttributes.MESSAGE_CONTENT}",
response_message.get("content", ""), response_message.get("content", ""),
) )
# Token usage info. usage = response_obj.get("usage")
usage = response_obj and response_obj.get("usage")
if usage: if usage:
safe_set_attribute( span.set_attribute(
span,
SpanAttributes.LLM_TOKEN_COUNT_TOTAL, SpanAttributes.LLM_TOKEN_COUNT_TOTAL,
usage.get("total_tokens"), usage.get("total_tokens"),
) )
# The number of tokens used in the LLM response (completion). # The number of tokens used in the LLM response (completion).
safe_set_attribute( span.set_attribute(
span,
SpanAttributes.LLM_TOKEN_COUNT_COMPLETION, SpanAttributes.LLM_TOKEN_COUNT_COMPLETION,
usage.get("completion_tokens"), usage.get("completion_tokens"),
) )
# The number of tokens used in the LLM prompt. # The number of tokens used in the LLM prompt.
safe_set_attribute( span.set_attribute(
span,
SpanAttributes.LLM_TOKEN_COUNT_PROMPT, SpanAttributes.LLM_TOKEN_COUNT_PROMPT,
usage.get("prompt_tokens"), usage.get("prompt_tokens"),
) )
pass
except Exception as e: except Exception as e:
verbose_logger.error( verbose_logger.error(f"Error setting arize attributes: {e}")
f"[Arize/Phoenix] Failed to set OpenInference span attributes: {e}"
)
if hasattr(span, "record_exception"):
span.record_exception(e)

View file

@ -13,15 +13,10 @@ import uuid
from datetime import datetime from datetime import datetime
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
import httpx
import litellm import litellm
from litellm._logging import verbose_logger from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.integrations.datadog.datadog import DataDogLogger from litellm.integrations.datadog.datadog import DataDogLogger
from litellm.litellm_core_utils.prompt_templates.common_utils import (
handle_any_messages_to_chat_completion_str_messages_conversion,
)
from litellm.llms.custom_httpx.http_handler import ( from litellm.llms.custom_httpx.http_handler import (
get_async_httpx_client, get_async_httpx_client,
httpxSpecialProvider, httpxSpecialProvider,
@ -111,6 +106,7 @@ class DataDogLLMObsLogger(DataDogLogger, CustomBatchLogger):
}, },
) )
response.raise_for_status()
if response.status_code != 202: if response.status_code != 202:
raise Exception( raise Exception(
f"DataDogLLMObs: Unexpected response - status_code: {response.status_code}, text: {response.text}" f"DataDogLLMObs: Unexpected response - status_code: {response.status_code}, text: {response.text}"
@ -120,10 +116,6 @@ class DataDogLLMObsLogger(DataDogLogger, CustomBatchLogger):
f"DataDogLLMObs: Successfully sent batch - status_code: {response.status_code}" f"DataDogLLMObs: Successfully sent batch - status_code: {response.status_code}"
) )
self.log_queue.clear() self.log_queue.clear()
except httpx.HTTPStatusError as e:
verbose_logger.exception(
f"DataDogLLMObs: Error sending batch - {e.response.text}"
)
except Exception as e: except Exception as e:
verbose_logger.exception(f"DataDogLLMObs: Error sending batch - {str(e)}") verbose_logger.exception(f"DataDogLLMObs: Error sending batch - {str(e)}")
@ -141,11 +133,7 @@ class DataDogLLMObsLogger(DataDogLogger, CustomBatchLogger):
metadata = kwargs.get("litellm_params", {}).get("metadata", {}) metadata = kwargs.get("litellm_params", {}).get("metadata", {})
input_meta = InputMeta( input_meta = InputMeta(messages=messages) # type: ignore
messages=handle_any_messages_to_chat_completion_str_messages_conversion(
messages
)
)
output_meta = OutputMeta(messages=self._get_response_messages(response_obj)) output_meta = OutputMeta(messages=self._get_response_messages(response_obj))
meta = Meta( meta = Meta(

View file

@ -1000,9 +1000,9 @@ class PrometheusLogger(CustomLogger):
): ):
try: try:
verbose_logger.debug("setting remaining tokens requests metric") verbose_logger.debug("setting remaining tokens requests metric")
standard_logging_payload: Optional[ standard_logging_payload: Optional[StandardLoggingPayload] = (
StandardLoggingPayload request_kwargs.get("standard_logging_object")
] = request_kwargs.get("standard_logging_object") )
if standard_logging_payload is None: if standard_logging_payload is None:
return return
@ -1453,7 +1453,6 @@ class PrometheusLogger(CustomLogger):
user_id=None, user_id=None,
team_id=None, team_id=None,
key_alias=None, key_alias=None,
key_hash=None,
exclude_team_id=UI_SESSION_TOKEN_TEAM_ID, exclude_team_id=UI_SESSION_TOKEN_TEAM_ID,
return_full_object=True, return_full_object=True,
organization_id=None, organization_id=None,
@ -1772,11 +1771,11 @@ class PrometheusLogger(CustomLogger):
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.prometheus import PrometheusLogger from litellm.integrations.prometheus import PrometheusLogger
prometheus_loggers: List[ prometheus_loggers: List[CustomLogger] = (
CustomLogger litellm.logging_callback_manager.get_custom_loggers_for_type(
] = litellm.logging_callback_manager.get_custom_loggers_for_type(
callback_type=PrometheusLogger callback_type=PrometheusLogger
) )
)
# we need to get the initialized prometheus logger instance(s) and call logger.initialize_remaining_budget_metrics() on them # we need to get the initialized prometheus logger instance(s) and call logger.initialize_remaining_budget_metrics() on them
verbose_logger.debug("found %s prometheus loggers", len(prometheus_loggers)) verbose_logger.debug("found %s prometheus loggers", len(prometheus_loggers))
if len(prometheus_loggers) > 0: if len(prometheus_loggers) > 0:

View file

@ -311,9 +311,6 @@ def exception_type( # type: ignore # noqa: PLR0915
elif ( elif (
"invalid_request_error" in error_str "invalid_request_error" in error_str
and "content_policy_violation" in error_str and "content_policy_violation" in error_str
) or (
"Invalid prompt" in error_str
and "violating our usage policy" in error_str
): ):
exception_mapping_worked = True exception_mapping_worked = True
raise ContentPolicyViolationError( raise ContentPolicyViolationError(

View file

@ -221,8 +221,6 @@ def get_supported_openai_params( # noqa: PLR0915
return litellm.PredibaseConfig().get_supported_openai_params(model=model) return litellm.PredibaseConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "voyage": elif custom_llm_provider == "voyage":
return litellm.VoyageEmbeddingConfig().get_supported_openai_params(model=model) return litellm.VoyageEmbeddingConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "infinity":
return litellm.InfinityEmbeddingConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "triton": elif custom_llm_provider == "triton":
if request_type == "embeddings": if request_type == "embeddings":
return litellm.TritonEmbeddingConfig().get_supported_openai_params( return litellm.TritonEmbeddingConfig().get_supported_openai_params(

View file

@ -36,7 +36,6 @@ from litellm.cost_calculator import (
RealtimeAPITokenUsageProcessor, RealtimeAPITokenUsageProcessor,
_select_model_name_for_cost_calc, _select_model_name_for_cost_calc,
) )
from litellm.integrations.agentops import AgentOps
from litellm.integrations.anthropic_cache_control_hook import AnthropicCacheControlHook from litellm.integrations.anthropic_cache_control_hook import AnthropicCacheControlHook
from litellm.integrations.arize.arize import ArizeLogger from litellm.integrations.arize.arize import ArizeLogger
from litellm.integrations.custom_guardrail import CustomGuardrail from litellm.integrations.custom_guardrail import CustomGuardrail
@ -248,7 +247,7 @@ class Logging(LiteLLMLoggingBaseClass):
self.start_time = start_time # log the call start time self.start_time = start_time # log the call start time
self.call_type = call_type self.call_type = call_type
self.litellm_call_id = litellm_call_id self.litellm_call_id = litellm_call_id
self.litellm_trace_id: str = litellm_trace_id or str(uuid.uuid4()) self.litellm_trace_id = litellm_trace_id
self.function_id = function_id self.function_id = function_id
self.streaming_chunks: List[Any] = [] # for generating complete stream response self.streaming_chunks: List[Any] = [] # for generating complete stream response
self.sync_streaming_chunks: List[Any] = ( self.sync_streaming_chunks: List[Any] = (
@ -2686,15 +2685,7 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
""" """
try: try:
custom_logger_init_args = custom_logger_init_args or {} custom_logger_init_args = custom_logger_init_args or {}
if logging_integration == "agentops": # Add AgentOps initialization if logging_integration == "lago":
for callback in _in_memory_loggers:
if isinstance(callback, AgentOps):
return callback # type: ignore
agentops_logger = AgentOps()
_in_memory_loggers.append(agentops_logger)
return agentops_logger # type: ignore
elif logging_integration == "lago":
for callback in _in_memory_loggers: for callback in _in_memory_loggers:
if isinstance(callback, LagoLogger): if isinstance(callback, LagoLogger):
return callback # type: ignore return callback # type: ignore
@ -3499,21 +3490,6 @@ class StandardLoggingPayloadSetup:
else: else:
return end_time_float - start_time_float return end_time_float - start_time_float
@staticmethod
def _get_standard_logging_payload_trace_id(
logging_obj: Logging,
litellm_params: dict,
) -> str:
"""
Returns the `litellm_trace_id` for this request
This helps link sessions when multiple requests are made in a single session
"""
dynamic_trace_id = litellm_params.get("litellm_trace_id")
if dynamic_trace_id:
return str(dynamic_trace_id)
return logging_obj.litellm_trace_id
def get_standard_logging_object_payload( def get_standard_logging_object_payload(
kwargs: Optional[dict], kwargs: Optional[dict],
@ -3666,10 +3642,7 @@ def get_standard_logging_object_payload(
payload: StandardLoggingPayload = StandardLoggingPayload( payload: StandardLoggingPayload = StandardLoggingPayload(
id=str(id), id=str(id),
trace_id=StandardLoggingPayloadSetup._get_standard_logging_payload_trace_id( trace_id=kwargs.get("litellm_trace_id"), # type: ignore
logging_obj=logging_obj,
litellm_params=litellm_params,
),
call_type=call_type or "", call_type=call_type or "",
cache_hit=cache_hit, cache_hit=cache_hit,
stream=stream, stream=stream,

View file

@ -265,10 +265,8 @@ def generic_cost_per_token(
) )
## CALCULATE OUTPUT COST ## CALCULATE OUTPUT COST
text_tokens = 0 text_tokens = usage.completion_tokens
audio_tokens = 0 audio_tokens = 0
reasoning_tokens = 0
is_text_tokens_total = False
if usage.completion_tokens_details is not None: if usage.completion_tokens_details is not None:
audio_tokens = ( audio_tokens = (
cast( cast(
@ -282,20 +280,9 @@ def generic_cost_per_token(
Optional[int], Optional[int],
getattr(usage.completion_tokens_details, "text_tokens", None), getattr(usage.completion_tokens_details, "text_tokens", None),
) )
or 0 # default to completion tokens, if this field is not set or usage.completion_tokens # default to completion tokens, if this field is not set
)
reasoning_tokens = (
cast(
Optional[int],
getattr(usage.completion_tokens_details, "reasoning_tokens", 0),
)
or 0
) )
if text_tokens == 0:
text_tokens = usage.completion_tokens
if text_tokens == usage.completion_tokens:
is_text_tokens_total = True
## TEXT COST ## TEXT COST
completion_cost = float(text_tokens) * completion_base_cost completion_cost = float(text_tokens) * completion_base_cost
@ -303,26 +290,12 @@ def generic_cost_per_token(
"output_cost_per_audio_token" "output_cost_per_audio_token"
) )
_output_cost_per_reasoning_token: Optional[float] = model_info.get(
"output_cost_per_reasoning_token"
)
## AUDIO COST ## AUDIO COST
if not is_text_tokens_total and audio_tokens is not None and audio_tokens > 0: if (
_output_cost_per_audio_token = ( _output_cost_per_audio_token is not None
_output_cost_per_audio_token and audio_tokens is not None
if _output_cost_per_audio_token is not None and audio_tokens > 0
else completion_base_cost ):
)
completion_cost += float(audio_tokens) * _output_cost_per_audio_token completion_cost += float(audio_tokens) * _output_cost_per_audio_token
## REASONING COST
if not is_text_tokens_total and reasoning_tokens and reasoning_tokens > 0:
_output_cost_per_reasoning_token = (
_output_cost_per_reasoning_token
if _output_cost_per_reasoning_token is not None
else completion_base_cost
)
completion_cost += float(reasoning_tokens) * _output_cost_per_reasoning_token
return prompt_cost, completion_cost return prompt_cost, completion_cost

View file

@ -14,7 +14,6 @@ from litellm.types.llms.openai import ChatCompletionThinkingBlock
from litellm.types.utils import ( from litellm.types.utils import (
ChatCompletionDeltaToolCall, ChatCompletionDeltaToolCall,
ChatCompletionMessageToolCall, ChatCompletionMessageToolCall,
ChatCompletionRedactedThinkingBlock,
Choices, Choices,
Delta, Delta,
EmbeddingResponse, EmbeddingResponse,
@ -487,14 +486,7 @@ def convert_to_model_response_object( # noqa: PLR0915
) )
# Handle thinking models that display `thinking_blocks` within `content` # Handle thinking models that display `thinking_blocks` within `content`
thinking_blocks: Optional[ thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
List[
Union[
ChatCompletionThinkingBlock,
ChatCompletionRedactedThinkingBlock,
]
]
] = None
if "thinking_blocks" in choice["message"]: if "thinking_blocks" in choice["message"]:
thinking_blocks = choice["message"]["thinking_blocks"] thinking_blocks = choice["message"]["thinking_blocks"]
provider_specific_fields["thinking_blocks"] = thinking_blocks provider_specific_fields["thinking_blocks"] = thinking_blocks

View file

@ -75,10 +75,6 @@ class ModelParamHelper:
combined_kwargs = combined_kwargs.difference(exclude_kwargs) combined_kwargs = combined_kwargs.difference(exclude_kwargs)
return combined_kwargs return combined_kwargs
@staticmethod
def get_litellm_provider_specific_params_for_chat_params() -> Set[str]:
return set(["thinking"])
@staticmethod @staticmethod
def _get_litellm_supported_chat_completion_kwargs() -> Set[str]: def _get_litellm_supported_chat_completion_kwargs() -> Set[str]:
""" """
@ -86,18 +82,11 @@ class ModelParamHelper:
This follows the OpenAI API Spec This follows the OpenAI API Spec
""" """
non_streaming_params: Set[str] = set( all_chat_completion_kwargs = set(
getattr(CompletionCreateParamsNonStreaming, "__annotations__", {}).keys() getattr(CompletionCreateParamsNonStreaming, "__annotations__", {}).keys()
).union(
set(getattr(CompletionCreateParamsStreaming, "__annotations__", {}).keys())
) )
streaming_params: Set[str] = set(
getattr(CompletionCreateParamsStreaming, "__annotations__", {}).keys()
)
litellm_provider_specific_params: Set[str] = (
ModelParamHelper.get_litellm_provider_specific_params_for_chat_params()
)
all_chat_completion_kwargs: Set[str] = non_streaming_params.union(
streaming_params
).union(litellm_provider_specific_params)
return all_chat_completion_kwargs return all_chat_completion_kwargs
@staticmethod @staticmethod

View file

@ -6,7 +6,7 @@ import io
import mimetypes import mimetypes
import re import re
from os import PathLike from os import PathLike
from typing import Any, Dict, List, Literal, Mapping, Optional, Union, cast from typing import Dict, List, Literal, Mapping, Optional, Union, cast
from litellm.types.llms.openai import ( from litellm.types.llms.openai import (
AllMessageValues, AllMessageValues,
@ -32,35 +32,6 @@ DEFAULT_ASSISTANT_CONTINUE_MESSAGE = ChatCompletionAssistantMessage(
) )
def handle_any_messages_to_chat_completion_str_messages_conversion(
messages: Any,
) -> List[Dict[str, str]]:
"""
Handles any messages to chat completion str messages conversion
Relevant Issue: https://github.com/BerriAI/litellm/issues/9494
"""
import json
if isinstance(messages, list):
try:
return cast(
List[Dict[str, str]],
handle_messages_with_content_list_to_str_conversion(messages),
)
except Exception:
return [{"input": json.dumps(message, default=str)} for message in messages]
elif isinstance(messages, dict):
try:
return [{"input": json.dumps(messages, default=str)}]
except Exception:
return [{"input": str(messages)}]
elif isinstance(messages, str):
return [{"input": messages}]
else:
return [{"input": str(messages)}]
def handle_messages_with_content_list_to_str_conversion( def handle_messages_with_content_list_to_str_conversion(
messages: List[AllMessageValues], messages: List[AllMessageValues],
) -> List[AllMessageValues]: ) -> List[AllMessageValues]:
@ -500,59 +471,3 @@ def unpack_defs(schema, defs):
unpack_defs(ref, defs) unpack_defs(ref, defs)
value["items"] = ref value["items"] = ref
continue continue
def _get_image_mime_type_from_url(url: str) -> Optional[str]:
"""
Get mime type for common image URLs
See gemini mime types: https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/image-understanding#image-requirements
Supported by Gemini:
application/pdf
audio/mpeg
audio/mp3
audio/wav
image/png
image/jpeg
image/webp
text/plain
video/mov
video/mpeg
video/mp4
video/mpg
video/avi
video/wmv
video/mpegps
video/flv
"""
url = url.lower()
# Map file extensions to mime types
mime_types = {
# Images
(".jpg", ".jpeg"): "image/jpeg",
(".png",): "image/png",
(".webp",): "image/webp",
# Videos
(".mp4",): "video/mp4",
(".mov",): "video/mov",
(".mpeg", ".mpg"): "video/mpeg",
(".avi",): "video/avi",
(".wmv",): "video/wmv",
(".mpegps",): "video/mpegps",
(".flv",): "video/flv",
# Audio
(".mp3",): "audio/mp3",
(".wav",): "audio/wav",
(".mpeg",): "audio/mpeg",
# Documents
(".pdf",): "application/pdf",
(".txt",): "text/plain",
}
# Check each extension group against the URL
for extensions, mime_type in mime_types.items():
if any(url.endswith(ext) for ext in extensions):
return mime_type
return None

View file

@ -2258,14 +2258,6 @@ def _parse_content_type(content_type: str) -> str:
return m.get_content_type() return m.get_content_type()
def _parse_mime_type(base64_data: str) -> Optional[str]:
mime_type_match = re.match(r"data:(.*?);base64", base64_data)
if mime_type_match:
return mime_type_match.group(1)
else:
return None
class BedrockImageProcessor: class BedrockImageProcessor:
"""Handles both sync and async image processing for Bedrock conversations.""" """Handles both sync and async image processing for Bedrock conversations."""

View file

@ -348,17 +348,11 @@ class ChunkProcessor:
and usage_chunk_dict["completion_tokens"] > 0 and usage_chunk_dict["completion_tokens"] > 0
): ):
completion_tokens = usage_chunk_dict["completion_tokens"] completion_tokens = usage_chunk_dict["completion_tokens"]
if usage_chunk_dict["cache_creation_input_tokens"] is not None and ( if usage_chunk_dict["cache_creation_input_tokens"] is not None:
usage_chunk_dict["cache_creation_input_tokens"] > 0
or cache_creation_input_tokens is None
):
cache_creation_input_tokens = usage_chunk_dict[ cache_creation_input_tokens = usage_chunk_dict[
"cache_creation_input_tokens" "cache_creation_input_tokens"
] ]
if usage_chunk_dict["cache_read_input_tokens"] is not None and ( if usage_chunk_dict["cache_read_input_tokens"] is not None:
usage_chunk_dict["cache_read_input_tokens"] > 0
or cache_read_input_tokens is None
):
cache_read_input_tokens = usage_chunk_dict[ cache_read_input_tokens = usage_chunk_dict[
"cache_read_input_tokens" "cache_read_input_tokens"
] ]

View file

@ -29,7 +29,6 @@ from litellm.types.llms.anthropic import (
UsageDelta, UsageDelta,
) )
from litellm.types.llms.openai import ( from litellm.types.llms.openai import (
ChatCompletionRedactedThinkingBlock,
ChatCompletionThinkingBlock, ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk, ChatCompletionToolCallChunk,
) )
@ -502,19 +501,18 @@ class ModelResponseIterator:
) -> Tuple[ ) -> Tuple[
str, str,
Optional[ChatCompletionToolCallChunk], Optional[ChatCompletionToolCallChunk],
List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]], List[ChatCompletionThinkingBlock],
Dict[str, Any], Dict[str, Any],
]: ]:
""" """
Helper function to handle the content block delta Helper function to handle the content block delta
""" """
text = "" text = ""
tool_use: Optional[ChatCompletionToolCallChunk] = None tool_use: Optional[ChatCompletionToolCallChunk] = None
provider_specific_fields = {} provider_specific_fields = {}
content_block = ContentBlockDelta(**chunk) # type: ignore content_block = ContentBlockDelta(**chunk) # type: ignore
thinking_blocks: List[ thinking_blocks: List[ChatCompletionThinkingBlock] = []
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
] = []
self.content_blocks.append(content_block) self.content_blocks.append(content_block)
if "text" in content_block["delta"]: if "text" in content_block["delta"]:
@ -543,25 +541,20 @@ class ModelResponseIterator:
) )
] ]
provider_specific_fields["thinking_blocks"] = thinking_blocks provider_specific_fields["thinking_blocks"] = thinking_blocks
return text, tool_use, thinking_blocks, provider_specific_fields return text, tool_use, thinking_blocks, provider_specific_fields
def _handle_reasoning_content( def _handle_reasoning_content(
self, self, thinking_blocks: List[ChatCompletionThinkingBlock]
thinking_blocks: List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
],
) -> Optional[str]: ) -> Optional[str]:
""" """
Handle the reasoning content Handle the reasoning content
""" """
reasoning_content = None reasoning_content = None
for block in thinking_blocks: for block in thinking_blocks:
thinking_content = cast(Optional[str], block.get("thinking"))
if reasoning_content is None: if reasoning_content is None:
reasoning_content = "" reasoning_content = ""
if thinking_content is not None: if "thinking" in block:
reasoning_content += thinking_content reasoning_content += block["thinking"]
return reasoning_content return reasoning_content
def chunk_parser(self, chunk: dict) -> ModelResponseStream: def chunk_parser(self, chunk: dict) -> ModelResponseStream:
@ -574,13 +567,7 @@ class ModelResponseIterator:
usage: Optional[Usage] = None usage: Optional[Usage] = None
provider_specific_fields: Dict[str, Any] = {} provider_specific_fields: Dict[str, Any] = {}
reasoning_content: Optional[str] = None reasoning_content: Optional[str] = None
thinking_blocks: Optional[ thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
List[
Union[
ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock
]
]
] = None
index = int(chunk.get("index", 0)) index = int(chunk.get("index", 0))
if type_chunk == "content_block_delta": if type_chunk == "content_block_delta":
@ -618,15 +605,6 @@ class ModelResponseIterator:
}, },
"index": self.tool_index, "index": self.tool_index,
} }
elif (
content_block_start["content_block"]["type"] == "redacted_thinking"
):
thinking_blocks = [
ChatCompletionRedactedThinkingBlock(
type="redacted_thinking",
data=content_block_start["content_block"]["data"],
)
]
elif type_chunk == "content_block_stop": elif type_chunk == "content_block_stop":
ContentBlockStop(**chunk) # type: ignore ContentBlockStop(**chunk) # type: ignore
# check if tool call content block # check if tool call content block

View file

@ -7,9 +7,6 @@ import httpx
import litellm import litellm
from litellm.constants import ( from litellm.constants import (
DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS, DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
RESPONSE_FORMAT_TOOL_NAME, RESPONSE_FORMAT_TOOL_NAME,
) )
from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.litellm_core_utils.core_helpers import map_finish_reason
@ -30,7 +27,6 @@ from litellm.types.llms.openai import (
REASONING_EFFORT, REASONING_EFFORT,
AllMessageValues, AllMessageValues,
ChatCompletionCachedContent, ChatCompletionCachedContent,
ChatCompletionRedactedThinkingBlock,
ChatCompletionSystemMessage, ChatCompletionSystemMessage,
ChatCompletionThinkingBlock, ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk, ChatCompletionToolCallChunk,
@ -280,20 +276,11 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig):
if reasoning_effort is None: if reasoning_effort is None:
return None return None
elif reasoning_effort == "low": elif reasoning_effort == "low":
return AnthropicThinkingParam( return AnthropicThinkingParam(type="enabled", budget_tokens=1024)
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
)
elif reasoning_effort == "medium": elif reasoning_effort == "medium":
return AnthropicThinkingParam( return AnthropicThinkingParam(type="enabled", budget_tokens=2048)
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
)
elif reasoning_effort == "high": elif reasoning_effort == "high":
return AnthropicThinkingParam( return AnthropicThinkingParam(type="enabled", budget_tokens=4096)
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
)
else: else:
raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}") raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}")
@ -576,21 +563,13 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig):
) -> Tuple[ ) -> Tuple[
str, str,
Optional[List[Any]], Optional[List[Any]],
Optional[ Optional[List[ChatCompletionThinkingBlock]],
List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
],
Optional[str], Optional[str],
List[ChatCompletionToolCallChunk], List[ChatCompletionToolCallChunk],
]: ]:
text_content = "" text_content = ""
citations: Optional[List[Any]] = None citations: Optional[List[Any]] = None
thinking_blocks: Optional[ thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
List[
Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
]
] = None
reasoning_content: Optional[str] = None reasoning_content: Optional[str] = None
tool_calls: List[ChatCompletionToolCallChunk] = [] tool_calls: List[ChatCompletionToolCallChunk] = []
for idx, content in enumerate(completion_response["content"]): for idx, content in enumerate(completion_response["content"]):
@ -609,30 +588,20 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig):
index=idx, index=idx,
) )
) )
elif content.get("thinking", None) is not None:
if thinking_blocks is None:
thinking_blocks = []
thinking_blocks.append(cast(ChatCompletionThinkingBlock, content))
elif content["type"] == "redacted_thinking":
if thinking_blocks is None:
thinking_blocks = []
thinking_blocks.append(
cast(ChatCompletionRedactedThinkingBlock, content)
)
## CITATIONS ## CITATIONS
if content.get("citations") is not None: if content.get("citations", None) is not None:
if citations is None: if citations is None:
citations = [] citations = []
citations.append(content["citations"]) citations.append(content["citations"])
if content.get("thinking", None) is not None:
if thinking_blocks is None:
thinking_blocks = []
thinking_blocks.append(cast(ChatCompletionThinkingBlock, content))
if thinking_blocks is not None: if thinking_blocks is not None:
reasoning_content = "" reasoning_content = ""
for block in thinking_blocks: for block in thinking_blocks:
thinking_content = cast(Optional[str], block.get("thinking")) if "thinking" in block:
if thinking_content is not None: reasoning_content += block["thinking"]
reasoning_content += thinking_content
return text_content, citations, thinking_blocks, reasoning_content, tool_calls return text_content, citations, thinking_blocks, reasoning_content, tool_calls
def calculate_usage( def calculate_usage(
@ -722,13 +691,7 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig):
else: else:
text_content = "" text_content = ""
citations: Optional[List[Any]] = None citations: Optional[List[Any]] = None
thinking_blocks: Optional[ thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
List[
Union[
ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock
]
]
] = None
reasoning_content: Optional[str] = None reasoning_content: Optional[str] = None
tool_calls: List[ChatCompletionToolCallChunk] = [] tool_calls: List[ChatCompletionToolCallChunk] = []

View file

@ -43,9 +43,7 @@ class AnthropicMessagesHandler:
from litellm.proxy.pass_through_endpoints.success_handler import ( from litellm.proxy.pass_through_endpoints.success_handler import (
PassThroughEndpointLogging, PassThroughEndpointLogging,
) )
from litellm.types.passthrough_endpoints.pass_through_endpoints import ( from litellm.proxy.pass_through_endpoints.types import EndpointType
EndpointType,
)
# Create success handler object # Create success handler object
passthrough_success_handler_obj = PassThroughEndpointLogging() passthrough_success_handler_obj = PassThroughEndpointLogging()
@ -100,12 +98,12 @@ async def anthropic_messages(
api_base=optional_params.api_base, api_base=optional_params.api_base,
api_key=optional_params.api_key, api_key=optional_params.api_key,
) )
anthropic_messages_provider_config: Optional[BaseAnthropicMessagesConfig] = ( anthropic_messages_provider_config: Optional[
ProviderConfigManager.get_provider_anthropic_messages_config( BaseAnthropicMessagesConfig
] = ProviderConfigManager.get_provider_anthropic_messages_config(
model=model, model=model,
provider=litellm.LlmProviders(_custom_llm_provider), provider=litellm.LlmProviders(_custom_llm_provider),
) )
)
if anthropic_messages_provider_config is None: if anthropic_messages_provider_config is None:
raise ValueError( raise ValueError(
f"Anthropic messages provider config not found for model: {model}" f"Anthropic messages provider config not found for model: {model}"

View file

@ -288,7 +288,6 @@ class AzureAssistantsAPI(BaseAzureLLM):
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
client=client, client=client,
litellm_params=litellm_params,
) )
thread_message: OpenAIMessage = openai_client.beta.threads.messages.create( # type: ignore thread_message: OpenAIMessage = openai_client.beta.threads.messages.create( # type: ignore

View file

@ -79,7 +79,7 @@ class AzureOpenAIO1Config(OpenAIOSeriesConfig):
return True return True
def is_o_series_model(self, model: str) -> bool: def is_o_series_model(self, model: str) -> bool:
return "o1" in model or "o3" in model or "o4" in model or "o_series/" in model return "o1" in model or "o3" in model or "o_series/" in model
def transform_request( def transform_request(
self, self,

View file

@ -1,172 +0,0 @@
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, cast
import httpx
import litellm
from litellm._logging import verbose_logger
from litellm.llms.openai.responses.transformation import OpenAIResponsesAPIConfig
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import *
from litellm.types.responses.main import *
from litellm.types.router import GenericLiteLLMParams
from litellm.utils import _add_path_to_api_base
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
LiteLLMLoggingObj = _LiteLLMLoggingObj
else:
LiteLLMLoggingObj = Any
class AzureOpenAIResponsesAPIConfig(OpenAIResponsesAPIConfig):
def validate_environment(
self,
headers: dict,
model: str,
api_key: Optional[str] = None,
) -> dict:
api_key = (
api_key
or litellm.api_key
or litellm.azure_key
or get_secret_str("AZURE_OPENAI_API_KEY")
or get_secret_str("AZURE_API_KEY")
)
headers.update(
{
"Authorization": f"Bearer {api_key}",
}
)
return headers
def get_complete_url(
self,
api_base: Optional[str],
litellm_params: dict,
) -> str:
"""
Constructs a complete URL for the API request.
Args:
- api_base: Base URL, e.g.,
"https://litellm8397336933.openai.azure.com"
OR
"https://litellm8397336933.openai.azure.com/openai/responses?api-version=2024-05-01-preview"
- model: Model name.
- optional_params: Additional query parameters, including "api_version".
- stream: If streaming is required (optional).
Returns:
- A complete URL string, e.g.,
"https://litellm8397336933.openai.azure.com/openai/responses?api-version=2024-05-01-preview"
"""
api_base = api_base or litellm.api_base or get_secret_str("AZURE_API_BASE")
if api_base is None:
raise ValueError(
f"api_base is required for Azure AI Studio. Please set the api_base parameter. Passed `api_base={api_base}`"
)
original_url = httpx.URL(api_base)
# Extract api_version or use default
api_version = cast(Optional[str], litellm_params.get("api_version"))
# Create a new dictionary with existing params
query_params = dict(original_url.params)
# Add api_version if needed
if "api-version" not in query_params and api_version:
query_params["api-version"] = api_version
# Add the path to the base URL
if "/openai/responses" not in api_base:
new_url = _add_path_to_api_base(
api_base=api_base, ending_path="/openai/responses"
)
else:
new_url = api_base
# Use the new query_params dictionary
final_url = httpx.URL(new_url).copy_with(params=query_params)
return str(final_url)
#########################################################
########## DELETE RESPONSE API TRANSFORMATION ##############
#########################################################
def _construct_url_for_response_id_in_path(
self, api_base: str, response_id: str
) -> str:
"""
Constructs a URL for the API request with the response_id in the path.
"""
from urllib.parse import urlparse, urlunparse
# Parse the URL to separate its components
parsed_url = urlparse(api_base)
# Insert the response_id at the end of the path component
# Remove trailing slash if present to avoid double slashes
path = parsed_url.path.rstrip("/")
new_path = f"{path}/{response_id}"
# Reconstruct the URL with all original components but with the modified path
constructed_url = urlunparse(
(
parsed_url.scheme, # http, https
parsed_url.netloc, # domain name, port
new_path, # path with response_id added
parsed_url.params, # parameters
parsed_url.query, # query string
parsed_url.fragment, # fragment
)
)
return constructed_url
def transform_delete_response_api_request(
self,
response_id: str,
api_base: str,
litellm_params: GenericLiteLLMParams,
headers: dict,
) -> Tuple[str, Dict]:
"""
Transform the delete response API request into a URL and data
Azure OpenAI API expects the following request:
- DELETE /openai/responses/{response_id}?api-version=xxx
This function handles URLs with query parameters by inserting the response_id
at the correct location (before any query parameters).
"""
delete_url = self._construct_url_for_response_id_in_path(
api_base=api_base, response_id=response_id
)
data: Dict = {}
verbose_logger.debug(f"delete response url={delete_url}")
return delete_url, data
#########################################################
########## GET RESPONSE API TRANSFORMATION ###############
#########################################################
def transform_get_response_api_request(
self,
response_id: str,
api_base: str,
litellm_params: GenericLiteLLMParams,
headers: dict,
) -> Tuple[str, Dict]:
"""
Transform the get response API request into a URL and data
OpenAI API expects the following request
- GET /v1/responses/{response_id}
"""
get_url = self._construct_url_for_response_id_in_path(
api_base=api_base, response_id=response_id
)
data: Dict = {}
verbose_logger.debug(f"get response url={get_url}")
return get_url, data

View file

@ -1,4 +1,3 @@
import enum
from typing import Any, List, Optional, Tuple, cast from typing import Any, List, Optional, Tuple, cast
from urllib.parse import urlparse from urllib.parse import urlparse
@ -20,10 +19,6 @@ from litellm.types.utils import ModelResponse, ProviderField
from litellm.utils import _add_path_to_api_base, supports_tool_choice from litellm.utils import _add_path_to_api_base, supports_tool_choice
class AzureFoundryErrorStrings(str, enum.Enum):
SET_EXTRA_PARAMETERS_TO_PASS_THROUGH = "Set extra-parameters to 'pass-through'"
class AzureAIStudioConfig(OpenAIConfig): class AzureAIStudioConfig(OpenAIConfig):
def get_supported_openai_params(self, model: str) -> List: def get_supported_openai_params(self, model: str) -> List:
model_supports_tool_choice = True # azure ai supports this by default model_supports_tool_choice = True # azure ai supports this by default
@ -245,18 +240,12 @@ class AzureAIStudioConfig(OpenAIConfig):
) -> bool: ) -> bool:
should_drop_params = litellm_params.get("drop_params") or litellm.drop_params should_drop_params = litellm_params.get("drop_params") or litellm.drop_params
error_text = e.response.text error_text = e.response.text
if should_drop_params and "Extra inputs are not permitted" in error_text: if should_drop_params and "Extra inputs are not permitted" in error_text:
return True return True
elif ( elif (
"unknown field: parameter index is not a valid field" in error_text "unknown field: parameter index is not a valid field" in error_text
): # remove index from tool calls ): # remove index from tool calls
return True return True
elif (
AzureFoundryErrorStrings.SET_EXTRA_PARAMETERS_TO_PASS_THROUGH.value
in error_text
): # remove extra-parameters from tool calls
return True
return super().should_retry_llm_api_inside_llm_translation_on_http_error( return super().should_retry_llm_api_inside_llm_translation_on_http_error(
e=e, litellm_params=litellm_params e=e, litellm_params=litellm_params
) )
@ -276,46 +265,5 @@ class AzureAIStudioConfig(OpenAIConfig):
litellm.remove_index_from_tool_calls( litellm.remove_index_from_tool_calls(
messages=_messages, messages=_messages,
) )
elif (
AzureFoundryErrorStrings.SET_EXTRA_PARAMETERS_TO_PASS_THROUGH.value
in e.response.text
):
request_data = self._drop_extra_params_from_request_data(
request_data, e.response.text
)
data = drop_params_from_unprocessable_entity_error(e=e, data=request_data) data = drop_params_from_unprocessable_entity_error(e=e, data=request_data)
return data return data
def _drop_extra_params_from_request_data(
self, request_data: dict, error_text: str
) -> dict:
params_to_drop = self._extract_params_to_drop_from_error_text(error_text)
if params_to_drop:
for param in params_to_drop:
if param in request_data:
request_data.pop(param, None)
return request_data
def _extract_params_to_drop_from_error_text(
self, error_text: str
) -> Optional[List[str]]:
"""
Error text looks like this"
"Extra parameters ['stream_options', 'extra-parameters'] are not allowed when extra-parameters is not set or set to be 'error'.
"""
import re
# Extract parameters within square brackets
match = re.search(r"\[(.*?)\]", error_text)
if not match:
return []
# Parse the extracted string into a list of parameter names
params_str = match.group(1)
params = []
for param in params_str.split(","):
# Clean up the parameter name (remove quotes, spaces)
clean_param = param.strip().strip("'").strip('"')
if clean_param:
params.append(clean_param)
return params

View file

@ -1,6 +1,6 @@
import types import types
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union from typing import TYPE_CHECKING, Any, Dict, Optional, Union
import httpx import httpx
@ -10,7 +10,6 @@ from litellm.types.llms.openai import (
ResponsesAPIResponse, ResponsesAPIResponse,
ResponsesAPIStreamingResponse, ResponsesAPIStreamingResponse,
) )
from litellm.types.responses.main import *
from litellm.types.router import GenericLiteLLMParams from litellm.types.router import GenericLiteLLMParams
if TYPE_CHECKING: if TYPE_CHECKING:
@ -74,7 +73,8 @@ class BaseResponsesAPIConfig(ABC):
def get_complete_url( def get_complete_url(
self, self,
api_base: Optional[str], api_base: Optional[str],
litellm_params: dict, model: str,
stream: Optional[bool] = None,
) -> str: ) -> str:
""" """
OPTIONAL OPTIONAL
@ -119,56 +119,6 @@ class BaseResponsesAPIConfig(ABC):
""" """
pass pass
#########################################################
########## DELETE RESPONSE API TRANSFORMATION ##############
#########################################################
@abstractmethod
def transform_delete_response_api_request(
self,
response_id: str,
api_base: str,
litellm_params: GenericLiteLLMParams,
headers: dict,
) -> Tuple[str, Dict]:
pass
@abstractmethod
def transform_delete_response_api_response(
self,
raw_response: httpx.Response,
logging_obj: LiteLLMLoggingObj,
) -> DeleteResponseResult:
pass
#########################################################
########## END DELETE RESPONSE API TRANSFORMATION #######
#########################################################
#########################################################
########## GET RESPONSE API TRANSFORMATION ###############
#########################################################
@abstractmethod
def transform_get_response_api_request(
self,
response_id: str,
api_base: str,
litellm_params: GenericLiteLLMParams,
headers: dict,
) -> Tuple[str, Dict]:
pass
@abstractmethod
def transform_get_response_api_response(
self,
raw_response: httpx.Response,
logging_obj: LiteLLMLoggingObj,
) -> ResponsesAPIResponse:
pass
#########################################################
########## END GET RESPONSE API TRANSFORMATION ##########
#########################################################
def get_error_class( def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers] self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException: ) -> BaseLLMException:

Some files were not shown because too many files have changed in this diff Show more