Merge pull request #4 from BerriAI/main

Update main
This commit is contained in:
vivek-athina 2025-03-10 11:13:21 +05:30 committed by GitHub
commit cd4a53d6f2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
315 changed files with 13077 additions and 3384 deletions

View file

@ -1935,12 +1935,12 @@ jobs:
pip install prisma
pip install fastapi
pip install jsonschema
pip install "httpx==0.24.1"
pip install "httpx==0.27.0"
pip install "anyio==3.7.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
pip install "google-cloud-aiplatform==1.59.0"
pip install "anthropic==0.21.3"
pip install "anthropic==0.49.0"
# Run pytest and generate JUnit XML report
- run:
name: Build Docker image
@ -1982,11 +1982,44 @@ jobs:
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 5m
# Add Ruby installation and testing before the existing Node.js and Python tests
- run:
name: Install Ruby and Bundler
command: |
# Import GPG keys first
gpg --keyserver hkp://keyserver.ubuntu.com --recv-keys 409B6B1796C275462A1703113804BB82D39DC0E3 7D2BAF1CF37B13E2069D6956105BD0E739499BDB || {
curl -sSL https://rvm.io/mpapis.asc | gpg --import -
curl -sSL https://rvm.io/pkuczynski.asc | gpg --import -
}
# Install Ruby version manager (RVM)
curl -sSL https://get.rvm.io | bash -s stable
# Source RVM from the correct location
source $HOME/.rvm/scripts/rvm
# Install Ruby 3.2.2
rvm install 3.2.2
rvm use 3.2.2 --default
# Install latest Bundler
gem install bundler
- run:
name: Run Ruby tests
command: |
source $HOME/.rvm/scripts/rvm
cd tests/pass_through_tests/ruby_passthrough_tests
bundle install
bundle exec rspec
no_output_timeout: 30m
# New steps to run Node.js test
- run:
name: Install Node.js
command: |
export DEBIAN_FRONTEND=noninteractive
curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
sudo apt-get update
sudo apt-get install -y nodejs
node --version
npm --version

View file

@ -8,7 +8,7 @@ class MyUser(HttpUser):
def chat_completion(self):
headers = {
"Content-Type": "application/json",
"Authorization": "Bearer sk-ZoHqrLIs2-5PzJrqBaviAA",
"Authorization": "Bearer sk-8N1tLOOyH8TIxwOLahhIVg",
# Include any additional headers you may need for authentication, etc.
}

2
.gitignore vendored
View file

@ -77,3 +77,5 @@ litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/model_hub.html
.mypy_cache/*
litellm/proxy/application.log
tests/llm_translation/vertex_test_account.json
tests/llm_translation/test_vertex_key.json

View file

@ -40,7 +40,7 @@ LiteLLM manages:
[**Jump to LiteLLM Proxy (LLM Gateway) Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs)
🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published.
🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published. [More information about the release cycle here](https://docs.litellm.ai/docs/proxy/release_cycle)
Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).

View file

@ -18,7 +18,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.3.0
version: 0.4.1
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to

View file

@ -48,6 +48,23 @@ spec:
{{- end }}
- name: DISABLE_SCHEMA_UPDATE
value: "false" # always run the migration from the Helm PreSync hook, override the value set
{{- with .Values.volumeMounts }}
volumeMounts:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.volumes }}
volumes:
{{- toYaml . | nindent 8 }}
{{- end }}
restartPolicy: OnFailure
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
ttlSecondsAfterFinished: {{ .Values.migrationJob.ttlSecondsAfterFinished }}
backoffLimit: {{ .Values.migrationJob.backoffLimit }}
{{- end }}

View file

@ -187,6 +187,7 @@ migrationJob:
backoffLimit: 4 # Backoff limit for Job restarts
disableSchemaUpdate: false # Skip schema migrations for specific environments. When True, the job will exit with code 0.
annotations: {}
ttlSecondsAfterFinished: 120
# Additional environment variables to be added to the deployment
envVars: {

View file

@ -0,0 +1,92 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [BETA] `/v1/messages`
LiteLLM provides a BETA endpoint that follows the spec of Anthropic's `/v1/messages` endpoint.
This currently only supports the Anthropic API.
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ✅ | |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ✅ | |
| Streaming | ✅ | |
| Fallbacks | ✅ | between anthropic models |
| Loadbalancing | ✅ | between anthropic models |
Planned improvements:
- Vertex AI Anthropic support
- Bedrock Anthropic support
## Usage
<Tabs>
<TabItem label="PROXY" value="proxy">
1. Setup config.yaml
```yaml
model_list:
- model_name: anthropic-claude
litellm_params:
model: claude-3-7-sonnet-latest
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
-H 'content-type: application/json' \
-H "x-api-key: $LITELLM_API_KEY" \
-H 'anthropic-version: 2023-06-01' \
-d '{
"model": "anthropic-claude",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "List 5 important events in the XIX century"
}
]
}
],
"max_tokens": 4096
}'
```
</TabItem>
<TabItem value="sdk" label="SDK">
```python
from litellm.llms.anthropic.experimental_pass_through.messages.handler import anthropic_messages
import asyncio
import os
# set env
os.environ["ANTHROPIC_API_KEY"] = "my-api-key"
messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
# Call the handler
async def call():
    response = await anthropic_messages(
        messages=messages,
        api_key=os.environ["ANTHROPIC_API_KEY"],
        model="claude-3-haiku-20240307",
        max_tokens=100,
    )
    print(response)

asyncio.run(call())
```
</TabItem>
</Tabs>

View file

@ -189,4 +189,138 @@ Expected Response
```
</TabItem>
</Tabs>
</Tabs>
## Explicitly specify image type
If you have images without a mime-type, or if litellm is incorrectly inferring the mime type of your image (e.g. calling `gs://` URLs with vertex ai), you can set this explicitly via the `format` param.
```python
"image_url": {
"url": "gs://my-gs-image",
"format": "image/jpeg"
}
```
LiteLLM will use this for any API endpoint that supports specifying a mime-type (e.g. anthropic/bedrock/vertex ai).
For others (e.g. openai), it will be ignored.
<Tabs>
<TabItem label="SDK" value="sdk">
```python
import os
from litellm import completion
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
# openai call
response = completion(
model = "claude-3-7-sonnet-latest",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"format": "image/jpeg"
}
}
]
}
],
)
```
</TabItem>
<TabItem label="PROXY" value="proxy">
1. Define vision models on config.yaml
```yaml
model_list:
- model_name: gpt-4-vision-preview # OpenAI gpt-4-vision-preview
litellm_params:
model: openai/gpt-4-vision-preview
api_key: os.environ/OPENAI_API_KEY
- model_name: llava-hf # Custom OpenAI compatible model
litellm_params:
model: openai/llava-hf/llava-v1.6-vicuna-7b-hf
api_base: http://localhost:8000
api_key: fake-key
model_info:
supports_vision: True # set supports_vision to True so /model/info returns this attribute as True
```
2. Run proxy server
```bash
litellm --config config.yaml
```
3. Test it using the OpenAI Python SDK
```python
import os
from openai import OpenAI
client = OpenAI(
api_key="sk-1234", # your litellm proxy api key
)
response = client.chat.completions.create(
model = "gpt-4-vision-preview", # use model="llava-hf" to test your custom OpenAI endpoint
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"format": "image/jpeg"
}
}
]
}
],
)
```
</TabItem>
</Tabs>
## Spec
```
"image_url": str
OR
"image_url": {
"url": "url OR base64 encoded str",
"detail": "openai-only param",
"format": "specify mime-type of image"
}
```
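As a minimal sketch of the two accepted shapes (the model name and URLs below are placeholders):

```python
from litellm import completion

# 1. Plain string form - LiteLLM infers the mime-type
image_as_str = {"type": "image_url", "image_url": "https://example.com/photo.jpg"}

# 2. Dict form - set the mime-type explicitly via `format`
image_as_dict = {
    "type": "image_url",
    "image_url": {
        "url": "gs://my-gs-image",
        "format": "image/jpeg",  # used by providers that accept a mime-type, ignored otherwise
    },
}

response = completion(
    model="claude-3-7-sonnet-latest",
    messages=[
        {
            "role": "user",
            "content": [{"type": "text", "text": "What's in this image?"}, image_as_dict],
        }
    ],
)
```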

View file

@ -46,7 +46,7 @@ For security inquiries, please contact us at support@berri.ai
|-------------------|-------------------------------------------------------------------------------------------------|
| SOC 2 Type I | Certified. Report available upon request on Enterprise plan. |
| SOC 2 Type II | In progress. Certificate available by April 15th, 2025 |
| ISO27001 | In progress. Certificate available by February 7th, 2025 |
| ISO 27001 | Certified. Report available upon request on Enterprise |
## Supported Data Regions for LiteLLM Cloud
@ -137,7 +137,7 @@ Point of contact email address for general security-related questions: krrish@be
Has the Vendor been audited / certified?
- SOC 2 Type I. Certified. Report available upon request on Enterprise plan.
- SOC 2 Type II. In progress. Certificate available by April 15th, 2025.
- ISO27001. In progress. Certificate available by February 7th, 2025.
- ISO 27001. Certified. Report available upon request on Enterprise plan.
Has an information security management system been implemented?
- Yes - [CodeQL](https://codeql.github.com/) and a comprehensive ISMS covering multiple security domains.

View file

@ -0,0 +1,5 @@
PDL - A YAML-based approach to prompt programming
Github: https://github.com/IBM/prompt-declaration-language
PDL is a declarative approach to prompt programming, helping users to accumulate messages implicitly, with support for model chaining and tool use.

View file

@ -0,0 +1,9 @@
# pgai
[pgai](https://github.com/timescale/pgai) is a suite of tools to develop RAG, semantic search, and other AI applications more easily with PostgreSQL.
If you don't know what pgai is yet, check out the [README](https://github.com/timescale/pgai)!
If you're already familiar with pgai, you can find litellm-specific docs here:
- Litellm for [model calling](https://github.com/timescale/pgai/blob/main/docs/model_calling/litellm.md) in pgai
- Use the [litellm provider](https://github.com/timescale/pgai/blob/main/docs/vectorizer/api-reference.md#aiembedding_litellm) to automatically create embeddings for your data via the pgai vectorizer.

View file

@ -286,9 +286,12 @@ print(response)
</TabItem>
</Tabs>
## Usage - Function Calling
## Usage - Function Calling / Tool calling
LiteLLM uses Bedrock's Converse API for making tool calls
LiteLLM supports tool calling via Bedrock's Converse and Invoke APIs.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
@ -333,6 +336,69 @@ assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: bedrock-claude-3-7
litellm_params:
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 # for bedrock invoke, specify `bedrock/invoke/<model>`
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_API_KEY" \
-d '{
"model": "bedrock-claude-3-7",
"messages": [
{
"role": "user",
"content": "What'\''s the weather like in Boston today?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["location"]
}
}
}
],
"tool_choice": "auto"
}'
```
</TabItem>
</Tabs>
## Usage - Vision
@ -377,6 +443,226 @@ print(f"\nResponse: {resp}")
```
## Usage - 'thinking' / 'reasoning content'
This is currently only supported for Anthropic's Claude 3.7 Sonnet + Deepseek R1.
Works on v1.61.20+.
Returns 2 new fields in `message` and `delta` object:
- `reasoning_content` - string - The reasoning content of the response
- `thinking_blocks` - list of objects (Anthropic only) - The thinking blocks of the response
Each object has the following fields:
- `type` - Literal["thinking"] - The type of thinking block
- `thinking` - string - The thinking of the response. Also returned in `reasoning_content`
- `signature` - string - A base64 encoded string, returned by Anthropic.
The `signature` is required by Anthropic on subsequent calls if 'thinking' content is passed in (it is only required when using `thinking` with tool calling). [Learn more](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os

# set env
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
resp = completion(
model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
print(resp)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: bedrock-claude-3-7
litellm_params:
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
thinking: {"type": "enabled", "budget_tokens": 1024} # 👈 EITHER HERE OR ON REQUEST
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "bedrock-claude-3-7",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024} # 👈 EITHER HERE OR ON CONFIG.YAML
}'
```
</TabItem>
</Tabs>
**Expected Response**
Same as [Anthropic API response](../providers/anthropic#usage---thinking--reasoning_content).
```python
{
"id": "chatcmpl-c661dfd7-7530-49c9-b0cc-d5018ba4727d",
"created": 1740640366,
"model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0",
"object": "chat.completion",
"system_fingerprint": null,
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "The capital of France is Paris. It's not only the capital city but also the largest city in France, serving as the country's major cultural, economic, and political center.",
"role": "assistant",
"tool_calls": null,
"function_call": null,
"reasoning_content": "The capital of France is Paris. This is a straightforward factual question.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The capital of France is Paris. This is a straightforward factual question.",
"signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+yCHpBY7U6FQW8/FcoLewocJQPa2HnmLM+NECy50y44F/kD4SULFXi57buI9fAvyBwtyjlOiO0SDE3+r3spdg6PLOo9PBoMma2ku5OTAoR46j9VIjDRlvNmBvff7YW4WI9oU8XagaOBSxLPxElrhyuxppEn7m6bfT40dqBSTDrfiw4FYB4qEPETTI6TA6wtjGAAqmFqKTo="
}
]
}
}
],
"usage": {
"completion_tokens": 64,
"prompt_tokens": 42,
"total_tokens": 106,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
```
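If you continue the conversation, send the assistant message back as-is so the signed `thinking_blocks` travel with it. A minimal sketch, assuming `resp` is the response from the SDK example above (the follow-up question is a placeholder):

```python
from litellm import completion

# `resp` comes from the earlier completion() call with thinking enabled
messages = [{"role": "user", "content": "What is the capital of France?"}]
messages.append(resp.choices[0].message)  # carries reasoning_content + thinking_blocks (incl. signature)
messages.append({"role": "user", "content": "And what is its population?"})

followup = completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=messages,
    thinking={"type": "enabled", "budget_tokens": 1024},
)
print(followup.choices[0].message.content)
```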
## Usage - Structured Output / JSON mode
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
from pydantic import BaseModel
# set env
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
class EventsList(BaseModel):
events: list[CalendarEvent]
response = completion(
model="bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0", # specify invoke via `bedrock/invoke/anthropic.claude-3-7-sonnet-20250219-v1:0`
response_format=EventsList,
messages=[
{"role": "system", "content": "You are a helpful assistant designed to output JSON."},
{"role": "user", "content": "Who won the world series in 2020?"}
],
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: bedrock-claude-3-7
litellm_params:
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 # specify invoke via `bedrock/invoke/<model_name>`
aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY
aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "bedrock-claude-3-7",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant designed to output JSON."
},
{
"role": "user",
"content": "Who won the worlde series in 2020?"
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "math_reasoning",
"description": "reason about maths",
"schema": {
"type": "object",
"properties": {
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"explanation": { "type": "string" },
"output": { "type": "string" }
},
"required": ["explanation", "output"],
"additionalProperties": false
}
},
"final_answer": { "type": "string" }
},
"required": ["steps", "final_answer"],
"additionalProperties": false
},
"strict": true
}
}
}'
```
</TabItem>
</Tabs>
## Usage - Bedrock Guardrails
Example of using [Bedrock Guardrails with LiteLLM](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-use-converse-api.html)

View file

@ -23,14 +23,16 @@ import os
os.environ['CEREBRAS_API_KEY'] = ""
response = completion(
model="cerebras/meta/llama3-70b-instruct",
model="cerebras/llama3-70b-instruct",
messages=[
{
"role": "user",
"content": "What's the weather like in Boston today in Fahrenheit?",
"content": "What's the weather like in Boston today in Fahrenheit? (Write in JSON)",
}
],
max_tokens=10,
# The prompt should include JSON if 'json_object' is selected; otherwise, you will get error code 400.
response_format={ "type": "json_object" },
seed=123,
stop=["\n\n"],
@ -50,16 +52,18 @@ import os
os.environ['CEREBRAS_API_KEY'] = ""
response = completion(
model="cerebras/meta/llama3-70b-instruct",
model="cerebras/llama3-70b-instruct",
messages=[
{
"role": "user",
"content": "What's the weather like in Boston today in Fahrenheit?",
"content": "What's the weather like in Boston today in Fahrenheit? (Write in JSON)",
}
],
stream=True,
max_tokens=10,
response_format={ "type": "json_object" },
# The prompt should include JSON if 'json_object' is selected; otherwise, you will get error code 400.
response_format={ "type": "json_object" },
seed=123,
stop=["\n\n"],
temperature=0.2,

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Infinity
| Property | Details |
@ -12,6 +15,9 @@
```python
from litellm import rerank
import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
response = rerank(
model="infinity/rerank",
@ -65,3 +71,114 @@ curl http://0.0.0.0:4000/rerank \
```
## Supported Cohere Rerank API Params
| Param | Type | Description |
|-------|-------|-------|
| `query` | `str` | The query to rerank the documents against |
| `documents` | `list[str]` | The documents to rerank |
| `top_n` | `int` | The number of documents to return |
| `return_documents` | `bool` | Whether to return the documents in the response |
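For example, a minimal sketch combining these params (assumes the same `infinity/rerank` setup and `INFINITY_API_BASE` shown above):

```python
from litellm import rerank
import os

os.environ["INFINITY_API_BASE"] = "http://localhost:8080"

response = rerank(
    model="infinity/rerank",
    query="What is the capital of France?",
    documents=["Paris", "London", "Berlin", "Madrid"],
    top_n=2,                # only keep the 2 most relevant documents
    return_documents=True,  # include the document text in each result
)
print(response)
```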
### Usage - Return Documents
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = rerank(
model="infinity/rerank",
query="What is the capital of France?",
documents=["Paris", "London", "Berlin", "Madrid"],
return_documents=True,
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/rerank \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-rerank",
"query": "What is the capital of France?",
"documents": [
"Paris",
"London",
"Berlin",
"Madrid"
],
"return_documents": True,
}'
```
</TabItem>
</Tabs>
## Pass Provider-specific Params
Any unmapped params will be passed to the provider as-is.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import rerank
import os
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
response = rerank(
model="infinity/rerank",
query="What is the capital of France?",
documents=["Paris", "London", "Berlin", "Madrid"],
raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: custom-infinity-rerank
litellm_params:
model: infinity/rerank
api_base: https://localhost:8080
raw_scores: True # 👈 EITHER SET PROVIDER-SPECIFIC PARAMS HERE OR IN REQUEST BODY
```
2. Start litellm
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl http://0.0.0.0:4000/rerank \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "custom-infinity-rerank",
"query": "What is the capital of the United States?",
"documents": [
"Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. is the capital of the United States.",
"Capital punishment has existed in the United States since before it was a country."
],
"raw_scores": True # 👈 PROVIDER-SPECIFIC PARAM
}'
```
</TabItem>
</Tabs>

View file

@ -2,11 +2,11 @@ import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Sambanova
https://community.sambanova.ai/t/create-chat-completion-api/
https://cloud.sambanova.ai/
:::tip
**We support ALL Sambanova models, just set `model=sambanova/<any-model-on-sambanova>` as a prefix when sending litellm requests. For the complete supported model list, visit https://sambanova.ai/technology/models **
**We support ALL Sambanova models, just set `model=sambanova/<any-model-on-sambanova>` as a prefix when sending litellm requests. For the complete supported model list, visit https://docs.sambanova.ai/cloud/docs/get-started/supported-models **
:::
@ -27,12 +27,11 @@ response = completion(
messages=[
{
"role": "user",
"content": "What do you know about sambanova.ai",
"content": "What do you know about sambanova.ai. Give your response in json format",
}
],
max_tokens=10,
response_format={ "type": "json_object" },
seed=123,
stop=["\n\n"],
temperature=0.2,
top_p=0.9,
@ -54,13 +53,12 @@ response = completion(
messages=[
{
"role": "user",
"content": "What do you know about sambanova.ai",
"content": "What do you know about sambanova.ai. Give your response in json format",
}
],
stream=True,
max_tokens=10,
response_format={ "type": "json_object" },
seed=123,
stop=["\n\n"],
temperature=0.2,
top_p=0.9,

View file

@ -852,6 +852,7 @@ litellm.vertex_location = "us-central1 # Your Location
| claude-3-5-sonnet@20240620 | `completion('vertex_ai/claude-3-5-sonnet@20240620', messages)` |
| claude-3-sonnet@20240229 | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
| claude-3-haiku@20240307 | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |
| claude-3-7-sonnet@20250219 | `completion('vertex_ai/claude-3-7-sonnet@20250219', messages)` |
### Usage
@ -926,6 +927,119 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</Tabs>
### Usage - `thinking` / `reasoning_content`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
resp = completion(
model="vertex_ai/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
- model_name: claude-3-7-sonnet-20250219
litellm_params:
model: vertex_ai/claude-3-7-sonnet-20250219
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
-d '{
"model": "claude-3-7-sonnet-20250219",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
**Expected Response**
```python
ModelResponse(
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
created=1740470510,
model='claude-3-7-sonnet-20250219',
object='chat.completion',
system_fingerprint=None,
choices=[
Choices(
finish_reason='stop',
index=0,
message=Message(
content="The capital of France is Paris.",
role='assistant',
tool_calls=None,
function_call=None,
provider_specific_fields={
'citations': None,
'thinking_blocks': [
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6...'
}
]
}
),
thinking_blocks=[
{
'type': 'thinking',
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
'signature': 'EuYBCkQYAiJAy6AGB...'
}
],
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
)
],
usage=Usage(
completion_tokens=68,
prompt_tokens=42,
total_tokens=110,
completion_tokens_details=None,
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None,
cached_tokens=0,
text_tokens=None,
image_tokens=None
),
cache_creation_input_tokens=0,
cache_read_input_tokens=0
)
)
```
## Llama 3 API
| Model Name | Function Call |
@ -1572,6 +1686,14 @@ assert isinstance(
Pass any file supported by Vertex AI, through LiteLLM.
LiteLLM Supports the following image types passed in url
```
Images with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4
Base64 Encoded Local Images
```
<Tabs>
<TabItem value="sdk" label="SDK">

View file

@ -157,6 +157,98 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
</TabItem>
</Tabs>
## Send Video URL to VLLM
Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)
There are two ways to send a video url to VLLM:
1. Pass the video url directly
```
{"type": "video_url", "video_url": {"url": video_url}},
```
2. Pass the video data as base64
```
{"type": "video_url", "video_url": {"url": f"data:video/mp4;base64,{video_data_base64}"}}
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
response = completion(
model="hosted_vllm/qwen", # pass the vllm model name
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Summarize the following video"
},
{
"type": "video_url",
"video_url": {
"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
}
}
]
}
],
api_base="https://hosted-vllm-api.co")
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: my-model
litellm_params:
model: hosted_vllm/qwen # add hosted_vllm/ prefix to route as OpenAI provider
api_base: https://hosted-vllm-api.co # add api base for OpenAI compatible provider
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl -X POST http://0.0.0.0:4000/chat/completions \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"model": "my-model",
"messages": [
{"role": "user", "content":
[
{"type": "text", "text": "Summarize the following video"},
{"type": "video_url", "video_url": {"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
]
}
]
}'
```
</TabItem>
</Tabs>
## (Deprecated) for `vllm pip package`
### Using - `litellm.completion`

View file

@ -36,7 +36,7 @@ import TabItem from '@theme/TabItem';
- Virtual Key Rate Limit
- User Rate Limit
- Team Limit
- The `_PROXY_track_cost_callback` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172)
- The `_ProxyDBLogger` updates spend / usage in the LiteLLM database. [Here is everything tracked in the DB per request](https://github.com/BerriAI/litellm/blob/ba41a72f92a9abf1d659a87ec880e8e319f87481/schema.prisma#L172)
## Frequently Asked Questions

View file

@ -46,18 +46,17 @@ You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/ma
| Table Name | Description | Row Insert Frequency |
|------------|-------------|---------------------|
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request** |
| LiteLLM_ErrorLogs | Captures failed requests and errors. Stores exception details and request information. Helps with debugging and monitoring. | **Medium - on errors only** |
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request - Success or Failure** |
| LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** |
## Disable `LiteLLM_SpendLogs` & `LiteLLM_ErrorLogs`
## Disable `LiteLLM_SpendLogs`
You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` in the `general_settings` section of your proxy_config.yaml file.
```yaml
general_settings:
disable_spend_logs: True # Disable writing spend logs to DB
disable_error_logs: True # Disable writing error logs to DB
disable_error_logs: True # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
```
### What is the impact of disabling these logs?

View file

@ -78,6 +78,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
| `api_base` | `Optional[str]` | Optional API base URL |
| `response_cost` | `Optional[str]` | Optional response cost |
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
## StandardLoggingModelInformation

View file

@ -0,0 +1,53 @@
# Rotating Master Key
Here are our recommended steps for rotating your master key.
**1. Backup your DB**
In case of any errors during the encryption/decryption process, this will allow you to revert to the current state without issues.
**2. Call `/key/regenerate` with the new master key**
```bash
curl -L -X POST 'http://localhost:4000/key/regenerate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"key": "sk-1234",
"new_master_key": "sk-PIp1h0RekR"
}'
```
This will re-encrypt any models in your Proxy_ModelTable with the new master key.
Expect to start seeing decryption errors in logs, as your old master key is no longer able to decrypt the new values.
```bash
raise Exception("Unable to decrypt value={}".format(v))
Exception: Unable to decrypt value=<new-encrypted-value>
```
**3. Update LITELLM_MASTER_KEY**
In your environment variables, update the value of LITELLM_MASTER_KEY to the `new_master_key` from Step 2.
This ensures the key used to decrypt values from the DB is the new key.
**4. Test it**
Make a test request to a model stored on proxy with a litellm key (new master key or virtual key) and see if it works
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4o-mini", # 👈 REPLACE with 'public model name' for any db-model
"messages": [
{
"content": "Hey, how's it going",
"role": "user"
}
]
}'
```

View file

@ -107,9 +107,9 @@ general_settings:
By default, LiteLLM writes several types of logs to the database:
- Every LLM API request to the `LiteLLM_SpendLogs` table
- LLM Exceptions to the `LiteLLM_LogsErrors` table
- LLM Exceptions to the `LiteLLM_SpendLogs` table
If you're not viewing these logs on the LiteLLM UI (most users use Prometheus for monitoring), you can disable them by setting the following flags to `True`:
If you're not viewing these logs on the LiteLLM UI, you can disable them by setting the following flags to `True`:
```yaml
general_settings:

View file

@ -0,0 +1,12 @@
# Release Cycle
LiteLLM Proxy has the following release cycle:
- `v1.x.x-nightly`: These are releases which pass ci/cd.
- `v1.x.x.rc`: These are releases which pass ci/cd + [manual review](https://github.com/BerriAI/litellm/discussions/8495#discussioncomment-12180711).
- `v1.x.x` OR `v1.x.x-stable`: These are releases which pass ci/cd + manual review + 3 days of production testing.
In production, we recommend using the latest `v1.x.x` release.
Follow our release notes [here](https://github.com/BerriAI/litellm/releases).

View file

@ -0,0 +1,357 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 'Thinking' / 'Reasoning Content'
Supported Providers:
- Deepseek (`deepseek/`)
- Anthropic API (`anthropic/`)
- Bedrock (Anthropic + Deepseek) (`bedrock/`)
- Vertex AI (Anthropic) (`vertexai/`)
```python
"message": {
...
"reasoning_content": "The capital of France is Paris.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The capital of France is Paris.",
"signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..."
}
]
}
```
## Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["ANTHROPIC_API_KEY"] = ""
response = completion(
model="anthropic/claude-3-7-sonnet-20250219",
messages=[
{"role": "user", "content": "What is the capital of France?"},
],
thinking={"type": "enabled", "budget_tokens": 1024} # 👈 REQUIRED FOR ANTHROPIC models (on `anthropic/`, `bedrock/`, `vertexai/`)
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "anthropic/claude-3-7-sonnet-20250219",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
],
"thinking": {"type": "enabled", "budget_tokens": 1024}
}'
```
</TabItem>
</Tabs>
**Expected Response**
```bash
{
"id": "3b66124d79a708e10c603496b363574c",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": " won the FIFA World Cup in 2022.",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1723323084,
"model": "deepseek/deepseek-chat",
"object": "chat.completion",
"system_fingerprint": "fp_7e0991cad4",
"usage": {
"completion_tokens": 12,
"prompt_tokens": 16,
"total_tokens": 28,
},
"service_tier": null
}
```
## Tool Calling with `thinking`
Here's how to use `thinking` blocks by Anthropic with tool calling.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import json
import os
import litellm

os.environ["ANTHROPIC_API_KEY"] = ""  # set your key

# Minimal stub so this example runs end-to-end; replace with your real weather lookup.
def get_current_weather(location, unit="fahrenheit"):
    return json.dumps({"location": location, "temperature": "72", "unit": unit})

litellm._turn_on_debug()
litellm.modify_params = True
model = "anthropic/claude-3-7-sonnet-20250219" # works across Anthropic, Bedrock, Vertex AI
# Step 1: send the conversation and available functions to the model
messages = [
{
"role": "user",
"content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses",
}
]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
]
response = litellm.completion(
model=model,
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
thinking={"type": "enabled", "budget_tokens": 1024},
)
print("Response\n", response)
response_message = response.choices[0].message
tool_calls = response_message.tool_calls
print("Expecting there to be 3 tool calls")
assert (
len(tool_calls) > 0
) # this has to call the function for SF, Tokyo and paris
# Step 2: check if the model wanted to call a function
print(f"tool_calls: {tool_calls}")
if tool_calls:
# Step 3: call the function
# Note: the JSON response may not always be valid; be sure to handle errors
available_functions = {
"get_current_weather": get_current_weather,
} # only one function in this example, but you can have multiple
messages.append(
response_message
) # extend conversation with assistant's reply
print("Response message\n", response_message)
# Step 4: send the info for each function call and function response to the model
for tool_call in tool_calls:
function_name = tool_call.function.name
if function_name not in available_functions:
            # the model called a function that does not exist in available_functions - skip it
            continue
function_to_call = available_functions[function_name]
function_args = json.loads(tool_call.function.arguments)
function_response = function_to_call(
location=function_args.get("location"),
unit=function_args.get("unit"),
)
messages.append(
{
"tool_call_id": tool_call.id,
"role": "tool",
"name": function_name,
"content": function_response,
}
) # extend conversation with function response
print(f"messages: {messages}")
second_response = litellm.completion(
model=model,
messages=messages,
seed=22,
# tools=tools,
drop_params=True,
thinking={"type": "enabled", "budget_tokens": 1024},
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: claude-3-7-sonnet-thinking
litellm_params:
model: anthropic/claude-3-7-sonnet-20250219
api_key: os.environ/ANTHROPIC_API_KEY
thinking: {
"type": "enabled",
"budget_tokens": 1024
}
```
2. Run proxy
```bash
litellm --config config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Make 1st call
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "claude-3-7-sonnet-thinking",
"messages": [
{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"},
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
],
"tool_choice": "auto"
}'
```
4. Make 2nd call with tool call results
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "claude-3-7-sonnet-thinking",
"messages": [
{
"role": "user",
"content": "What\'s the weather like in San Francisco, Tokyo, and Paris? - give me 3 responses"
},
{
"role": "assistant",
"content": "I\'ll check the current weather for these three cities for you:",
"tool_calls": [
{
"index": 2,
"function": {
"arguments": "{\"location\": \"San Francisco\"}",
"name": "get_current_weather"
},
"id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
"type": "function"
}
],
"function_call": null,
"reasoning_content": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c="
}
],
"provider_specific_fields": {
"reasoningContentBlocks": [
{
"reasoningText": {
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c=",
"text": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user."
}
}
]
}
},
{
"tool_call_id": "tooluse_mnqzmtWYRjCxUInuAdK7-w",
"role": "tool",
"name": "get_current_weather",
"content": "{\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": \"fahrenheit\"}"
}
]
}'
```
</TabItem>
</Tabs>
## Switching between Anthropic + Deepseek models
Set `drop_params=True` to drop the 'thinking' blocks when swapping from Anthropic to Deepseek models. Suggest improvements to this approach [here](https://github.com/BerriAI/litellm/discussions/8927).
```python
litellm.drop_params = True # 👈 EITHER GLOBALLY or per request
# or per request
## Anthropic
response = litellm.completion(
model="anthropic/claude-3-7-sonnet-20250219",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
drop_params=True,
)
## Deepseek
response = litellm.completion(
model="deepseek/deepseek-chat",
messages=[{"role": "user", "content": "What is the capital of France?"}],
thinking={"type": "enabled", "budget_tokens": 1024},
drop_params=True,
)
```
## Spec
These fields can be accessed via `response.choices[0].message.reasoning_content` and `response.choices[0].message.thinking_blocks`.
- `reasoning_content` - str: The reasoning content from the model. Returned across all providers.
- `thinking_blocks` - Optional[List[Dict[str, str]]]: A list of thinking blocks from the model. Only returned for Anthropic models.
- `type` - str: The type of thinking block.
- `thinking` - str: The thinking from the model.
- `signature` - str: The signature delta from the model.
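A minimal sketch of reading these fields (reuses the Anthropic quick start above; the API key is a placeholder):

```python
from litellm import completion
import os

os.environ["ANTHROPIC_API_KEY"] = "my-api-key"  # placeholder

response = completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)

message = response.choices[0].message
print(message.reasoning_content)                # returned across all supported providers
for block in message.thinking_blocks or []:     # Anthropic models only
    print(block["type"], block["thinking"], block["signature"][:20])
```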

View file

@ -952,8 +952,8 @@ router_settings:
```
Defaults:
- allowed_fails: 0
- cooldown_time: 60s
- allowed_fails: 3
- cooldown_time: 5s (`DEFAULT_COOLDOWN_TIME_SECONDS` in constants.py)
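A minimal sketch of overriding these defaults on the Python `Router` directly, assuming `allowed_fails` and `cooldown_time` are accepted as `Router` kwargs mirroring the `router_settings` keys above:

```python
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "sk-..."},  # placeholder key
        }
    ],
    allowed_fails=3,   # failures allowed before a deployment is cooled down
    cooldown_time=5,   # seconds the deployment stays cooled down
)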
**Set Per Model**

View file

@ -96,6 +96,33 @@ litellm --config /path/to/config.yaml
```
### Using K/V pairs in 1 AWS Secret
You can read multiple keys from a single AWS Secret using the `primary_secret_name` parameter:
```yaml
general_settings:
key_management_system: "aws_secret_manager"
key_management_settings:
hosted_keys: [
"OPENAI_API_KEY_MODEL_1",
"OPENAI_API_KEY_MODEL_2",
]
primary_secret_name: "litellm_secrets" # 👈 Read multiple keys from one JSON secret
```
The `primary_secret_name` allows you to read multiple keys from a single AWS Secret as a JSON object. For example, the "litellm_secrets" secret would contain:
```json
{
"OPENAI_API_KEY_MODEL_1": "sk-key1...",
"OPENAI_API_KEY_MODEL_2": "sk-key2..."
}
```
This reduces the number of AWS Secrets you need to manage.
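Conceptually, the proxy does something like the following boto3 sketch (illustrative only, not LiteLLM's internal code; the secret name and region are placeholders):

```python
import json
import os

import boto3

# Read one JSON secret and expose each key/value pair as an env var.
client = boto3.client("secretsmanager", region_name="us-west-2")
secret_value = client.get_secret_value(SecretId="litellm_secrets")

for key, value in json.loads(secret_value["SecretString"]).items():
    os.environ[key] = value

print(sorted(k for k in os.environ if k.startswith("OPENAI_API_KEY_MODEL_")))
```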
## Hashicorp Vault
@ -353,4 +380,7 @@ general_settings:
# Hosted Keys Settings
hosted_keys: ["litellm_master_key"] # OPTIONAL. Specify which env keys you stored on AWS
# K/V pairs in 1 AWS Secret Settings
primary_secret_name: "litellm_secrets" # OPTIONAL. Read multiple keys from one JSON secret on AWS Secret Manager
```

View file

@ -2,9 +2,9 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Use LiteLLM AI Gateway with Aporia Guardrails
# Aporia Guardrails with LiteLLM Gateway
In this tutorial we will use LiteLLM Proxy with Aporia to detect PII in requests and profanity in responses
In this tutorial we will use LiteLLM AI Gateway with Aporia to detect PII in requests and profanity in responses
## 1. Setup guardrails on Aporia

View file

@ -0,0 +1,109 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# OpenWeb UI with LiteLLM
This guide walks you through connecting OpenWeb UI to LiteLLM. Using LiteLLM with OpenWeb UI allows teams to
- Access 100+ LLMs on OpenWeb UI
- Track Spend / Usage, Set Budget Limits
- Send Request/Response Logs to logging destinations like langfuse, s3, gcs buckets, etc.
- Set access controls, e.g. control what models OpenWebUI can access.
## Quickstart
- Make sure to setup LiteLLM with the [LiteLLM Getting Started Guide](https://docs.litellm.ai/docs/proxy/docker_quick_start)
## 1. Start LiteLLM & OpenWebUI
Deploy this docker compose to deploy both OpenWebUI and LiteLLM.
```bash
docker compose up -d
```
- OpenWebUI starts running on [http://localhost:3000](http://localhost:3000)
- LiteLLM starts running on [http://localhost:4000](http://localhost:4000)
## 2. Create a Virtual Key on LiteLLM
Virtual Keys are API Keys that allow you to authenticate to LiteLLM Proxy. We will create a Virtual Key that will allow OpenWebUI to access LiteLLM.
### 2.1 LiteLLM User Management Hierarchy
On LiteLLM, you can create Organizations, Teams, Users and Virtual Keys. For this tutorial, we will create a Team and a Virtual Key.
- `Organization` - An Organization is a group of Teams. (US Engineering, EU Developer Tools)
- `Team` - A Team is a group of Users. (OpenWeb UI Team, Data Science Team, etc.)
- `User` - A User is an individual user (employee, developer, e.g. `krrish@litellm.ai`)
- `Virtual Key` - A Virtual Key is an API Key that allows you to authenticate to LiteLLM Proxy. A Virtual Key is associated with a User or Team.
Once the Team is created, you can invite Users to the Team. You can read more about LiteLLM's User Management [here](https://docs.litellm.ai/docs/proxy/user_management_heirarchy).
### 2.2 Create a Team on LiteLLM
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new team.
<Image img={require('../../img/litellm_create_team.gif')} />
### 2.3 Create a Virtual Key on LiteLLM
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new Virtual Key.
LiteLLM allows you to specify what models are available on OpenWeb UI (by specifying the models the key will have access to).
<Image img={require('../../img/create_key_in_team_oweb.gif')} />
## 3. Connect OpenWeb UI to LiteLLM
On OpenWeb UI, navigate to Settings -> Connections and create a new connection to LiteLLM
Enter the following details:
- URL: `http://localhost:4000` (your litellm proxy base url)
- Key: `your-virtual-key` (the key you created in the previous step)
<Image img={require('../../img/litellm_setup_openweb.gif')} />
### 3.1 Test Request
In the top-left corner, select a model. You should only see the models you gave the key access to in Step 2.
Once you have selected a model, enter your message content and click `Submit`.
<Image img={require('../../img/basic_litellm.gif')} />
### 3.2 Tracking Spend / Usage
After your request is made, navigate to `Logs` on the LiteLLM UI; you can see the Team, Key, Model, Usage and Cost.
<!-- <Image img={require('../../img/litellm_logs_openweb.gif')} /> -->
## Render `thinking` content on OpenWeb UI
OpenWebUI requires reasoning/thinking content to be rendered with `<think></think>` tags. In order to render this for specific models, you can use the `merge_reasoning_content_in_choices` litellm parameter.
Example litellm config.yaml:
```yaml
model_list:
- model_name: thinking-anthropic-claude-3-7-sonnet
litellm_params:
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
thinking: {"type": "enabled", "budget_tokens": 1024}
max_tokens: 1080
merge_reasoning_content_in_choices: true
```
### Test it on OpenWeb UI
On the models dropdown select `thinking-anthropic-claude-3-7-sonnet`
<Image img={require('../../img/litellm_thinking_openweb.gif')} />

View file

@ -44,7 +44,7 @@ const config = {
path: './release_notes',
routeBasePath: 'release_notes',
blogTitle: 'Release Notes',
blogSidebarTitle: 'All Releases',
blogSidebarTitle: 'Releases',
blogSidebarCount: 'ALL',
postsPerPage: 'ALL',
showReadingTime: false,


View file

@ -18,13 +18,6 @@ hide_table_of_contents: false
`alerting`, `prometheus`, `secret management`, `management endpoints`, `ui`, `prompt management`, `finetuning`, `batch`
:::note
v1.57.8-stable, is currently being tested. It will be released on 2025-01-12.
:::
## New / Updated Models
1. Mistral large pricing - https://github.com/BerriAI/litellm/pull/7452

View file

@ -0,0 +1,103 @@
---
title: v1.61.20-stable
slug: v1.61.20-stable
date: 2025-03-01T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
tags: [llm translation, rerank, ui, thinking, reasoning_content, claude-3-7-sonnet]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
# v1.61.20-stable
These are the changes since `v1.61.13-stable`.
This release is primarily focused on:
- LLM Translation improvements (claude-3-7-sonnet + 'thinking'/'reasoning_content' support)
- UI improvements (add model flow, user management, etc)
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
1. Anthropic 3-7 sonnet support + cost tracking (Anthropic API + Bedrock + Vertex AI + OpenRouter)
1. Anthropic API [Start here](https://docs.litellm.ai/docs/providers/anthropic#usage---thinking--reasoning_content)
2. Bedrock API [Start here](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
3. Vertex AI API [See here](../../docs/providers/vertex#usage---thinking--reasoning_content)
4. OpenRouter [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L5626)
2. Gpt-4.5-preview support + cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L79)
3. Azure AI - Phi-4 cost tracking [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L1773)
4. Claude-3.5-sonnet - vision support updated on Anthropic API [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2888)
5. Bedrock llama vision support [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L7714)
6. Cerebras llama3.3-70b pricing [See here](https://github.com/BerriAI/litellm/blob/ba5bdce50a0b9bc822de58c03940354f19a733ed/model_prices_and_context_window.json#L2697)
## LLM Translation
1. Infinity Rerank - support returning documents when return_documents=True [Start here](../../docs/providers/infinity#usage---returning-documents)
2. Amazon Deepseek - `<think>` param extraction into reasoning_content [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-imported-models-deepseek-deepseek-r1)
3. Amazon Titan Embeddings - filter out aws_ params from request body [Start here](https://docs.litellm.ai/docs/providers/bedrock#bedrock-embedding)
4. Anthropic thinking + reasoning_content translation support (Anthropic API, Bedrock, Vertex AI) [Start here](https://docs.litellm.ai/docs/reasoning_content)
5. VLLM - support video_url [Start here](../../docs/providers/vllm#send-video-url-to-vllm)
6. Call proxy via litellm SDK: Support `litellm_proxy/` for embedding, image_generation, transcription, speech, rerank [Start here](https://docs.litellm.ai/docs/providers/litellm_proxy)
7. OpenAI Pass-through - allow using Assistants GET, DELETE on /openai pass through routes [Start here](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
8. Message Translation - fix openai message for assistant msg if role is missing - openai allows this
9. O1/O3 - support drop_params for o3-mini and o1 parallel_tool_calls param (not supported currently) [See here](https://docs.litellm.ai/docs/completion/drop_params)
## Spend Tracking Improvements
1. Cost tracking for rerank via Bedrock [See PR](https://github.com/BerriAI/litellm/commit/b682dc4ec8fd07acf2f4c981d2721e36ae2a49c5)
2. Anthropic pass-through - fix race condition causing cost to not be tracked [See PR](https://github.com/BerriAI/litellm/pull/8874)
3. Anthropic pass-through: Ensure accurate token counting [See PR](https://github.com/BerriAI/litellm/pull/8880)
## Management Endpoints / UI
1. Models Page - Allow sorting models by created at
2. Models Page - Edit Model Flow Improvements
3. Models Page - Fix Adding Azure, Azure AI Studio models on UI
4. Internal Users Page - Allow Bulk Adding Internal Users on UI
5. Internal Users Page - Allow sorting users by created at
6. Virtual Keys Page - Allow searching for UserIDs on the dropdown when assigning a user to a team [See PR](https://github.com/BerriAI/litellm/pull/8844)
7. Virtual Keys Page - allow creating a user when assigning keys to users [See PR](https://github.com/BerriAI/litellm/pull/8844)
8. Model Hub Page - fix text overflow issue [See PR](https://github.com/BerriAI/litellm/pull/8749)
9. Admin Settings Page - Allow adding MSFT SSO on UI
10. Backend - don't allow creating duplicate internal users in DB
## Helm
1. support ttlSecondsAfterFinished on the migration job - [See PR](https://github.com/BerriAI/litellm/pull/8593)
2. enhance migrations job with additional configurable properties - [See PR](https://github.com/BerriAI/litellm/pull/8636)
## Logging / Guardrail Integrations
1. Arize Phoenix support
2. No-log - fix no-log param support on embedding calls
## Performance / Loadbalancing / Reliability improvements
1. Single Deployment Cooldown logic - Use allowed_fails or allowed_fail_policy if set [Start here](https://docs.litellm.ai/docs/routing#advanced-custom-retries-cooldowns-based-on-error-type)
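For the cooldown setting above, here's a sketch of the relevant knobs on `litellm.Router`; the deployment and API key are placeholders, and `allowed_fail_policy` (per-error-type limits) is configured alongside these, per the linked docs.

```python
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-4o",
            "litellm_params": {
                "model": "openai/gpt-4o",
                "api_key": "sk-...",  # placeholder key
            },
        }
    ],
    allowed_fails=3,    # cool the deployment down only after this many failures in the window
    cooldown_time=30,   # seconds to keep the deployment out of rotation
)
```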
## General Proxy Improvements
1. Hypercorn - fix reading / parsing request body
2. Windows - fix running the proxy on Windows
3. DD-Trace - fix `dd-trace` enablement on the proxy
## Complete Git Diff
View the complete git diff [here](https://github.com/BerriAI/litellm/compare/v1.61.13-stable...v1.61.20-stable).

View file

@ -0,0 +1,40 @@
---
title: v1.63.0 - Anthropic 'thinking' response update
slug: v1.63.0
date: 2025-03-05T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
hide_table_of_contents: false
---
v1.63.0 fixes Anthropic 'thinking' response on streaming to return the `signature` block. [Github Issue](https://github.com/BerriAI/litellm/issues/8964)
It also moves the response structure from `signature_delta` to `signature` to be the same as Anthropic. [Anthropic Docs](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#implementing-extended-thinking)
## Diff
```diff
"message": {
...
"reasoning_content": "The capital of France is Paris.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The capital of France is Paris.",
- "signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 OLD FORMAT
+ "signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 KEY CHANGE
}
]
}
```
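Here's a minimal sketch of reading the renamed field off a stream, assuming a Claude 3.7 Sonnet deployment and that streamed deltas expose `thinking_blocks` in the shape shown above; the model id is a placeholder.

```python
import litellm

stream = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",  # placeholder model id
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
    stream=True,
)

signature = None
for chunk in stream:
    delta = chunk.choices[0].delta
    # thinking blocks now carry `signature` (previously `signature_delta`)
    for block in getattr(delta, "thinking_blocks", None) or []:
        if block.get("signature"):
            signature = block["signature"]

print("signature:", signature)
```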

View file

@ -0,0 +1,112 @@
---
title: v1.63.2-stable
slug: v1.63.2-stable
date: 2025-03-08T10:00:00
authors:
- name: Krrish Dholakia
title: CEO, LiteLLM
url: https://www.linkedin.com/in/krish-d/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
- name: Ishaan Jaffer
title: CTO, LiteLLM
url: https://www.linkedin.com/in/reffajnaahsi/
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
hide_table_of_contents: false
---
import Image from '@theme/IdealImage';
These are the changes since `v1.61.20-stable`.
This release is primarily focused on:
- LLM Translation improvements (more `thinking` content improvements)
- UI improvements (Error logs now shown on UI)
:::info
This release will be live on 03/09/2025
:::
<Image img={require('../../img/release_notes/v1632_release.jpg')} />
## Demo Instance
Here's a Demo Instance to test changes:
- Instance: https://demo.litellm.ai/
- Login Credentials:
- Username: admin
- Password: sk-1234
## New Models / Updated Models
1. Add `supports_pdf_input` for specific Bedrock Claude models (see the sketch after this list) [PR](https://github.com/BerriAI/litellm/commit/f63cf0030679fe1a43d03fb196e815a0f28dae92)
2. Add pricing for amazon `eu` models [PR](https://github.com/BerriAI/litellm/commits/main/model_prices_and_context_window.json)
3. Fix Azure O1 mini pricing [PR](https://github.com/BerriAI/litellm/commit/52de1949ef2f76b8572df751f9c868a016d4832c)
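As a quick way to check item 1 programmatically, here's a sketch using `litellm.get_model_info`; the Bedrock model id is a placeholder, and the flag is only present for models where it has been added to the cost map.

```python
import litellm

# Look up the model's entry in litellm's model cost map and read the new flag.
info = litellm.get_model_info(
    model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"  # placeholder model id
)
print(info.get("supports_pdf_input", False))
```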
## LLM Translation
<Image img={require('../../img/release_notes/anthropic_thinking.jpg')}/>
1. Support `/openai/` passthrough for Assistant endpoints. [Get Started](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
2. Bedrock Claude - fix tool calling transformation on invoke route. [Get Started](../../docs/providers/bedrock#usage---function-calling--tool-calling)
3. Bedrock Claude - response_format support for claude on invoke route. [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
4. Bedrock - pass `description` if set in response_format (see the sketch after this list). [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
5. Bedrock - Fix passing response_format: {"type": "text"}. [PR](https://github.com/BerriAI/litellm/commit/c84b489d5897755139aa7d4e9e54727ebe0fa540)
6. OpenAI - Handle sending image_url as str to openai. [Get Started](https://docs.litellm.ai/docs/completion/vision)
7. Deepseek - fix missing 'reasoning_content' on streaming. [Get Started](https://docs.litellm.ai/docs/reasoning_content)
8. Caching - Support caching on reasoning content. [Get Started](https://docs.litellm.ai/docs/proxy/caching)
9. Bedrock - handle thinking blocks in assistant message. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
10. Anthropic - Return `signature` on streaming. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
- Note: We've also migrated from `signature_delta` to `signature`. [Read more](https://docs.litellm.ai/release_notes/v1.63.0)
11. Support format param for specifying image type. [Get Started](../../docs/completion/vision.md#explicitly-specify-image-type)
12. Anthropic - `/v1/messages` endpoint - `thinking` param support. [Get Started](../../docs/anthropic_unified.md)
- Note: this refactors the [BETA] unified `/v1/messages` endpoint so that it works only with the Anthropic API.
13. Vertex AI - handle `$id` in the response schema when calling Vertex AI. [Get Started](https://docs.litellm.ai/docs/providers/vertex#json-schema)
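Here's a sketch of items 3-4 (structured output on Bedrock Claude, with `description` now forwarded); the model id is a placeholder, the schema is illustrative, and the notes above specifically call out the invoke route as well.

```python
import litellm

response = litellm.completion(
    model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",  # placeholder model id
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "capital_lookup",
            # `description` is now forwarded to Bedrock when set (item 4)
            "description": "Return the capital city of the country in the question",
            "schema": {
                "type": "object",
                "properties": {"capital": {"type": "string"}},
                "required": ["capital"],
            },
        },
    },
)
print(response.choices[0].message.content)  # JSON string matching the schema, provider permitting
```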
## Spend Tracking Improvements
1. Batches API - Fix cost calculation to run on retrieve_batch (see the sketch after this list). [Get Started](https://docs.litellm.ai/docs/batches)
2. Batches API - Log batch models in spend logs / standard logging payload. [Get Started](../../docs/proxy/logging_spec.md#standardlogginghiddenparams)
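For item 1, a hedged sketch of reading the computed cost after retrieving a batch; the batch id is a placeholder, and because the cost is attached by the async success handler, it may only be visible in your logging callback / spend logs rather than synchronously on the returned object.

```python
import asyncio
import litellm

async def main():
    batch = await litellm.aretrieve_batch(
        batch_id="batch_abc123",          # placeholder batch id
        custom_llm_provider="openai",
    )
    # Cost + models for the completed batch are surfaced via hidden params
    # (and in the standard logging payload / spend logs on the proxy).
    print(batch._hidden_params.get("response_cost"))
    print(batch._hidden_params.get("batch_models"))

asyncio.run(main())
```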
## Management Endpoints / UI
<Image img={require('../../img/release_notes/error_logs.jpg')} />
1. Virtual Keys Page
- Allow team/org filters to be searchable on the Create Key Page
- Add created_by and updated_by fields to Keys table
- Show 'user_email' on key table
- Show 100 keys per page, use full height, increase width of key alias
2. Logs Page
- Show Error Logs on LiteLLM UI
- Allow Internal Users to View their own logs
3. Internal Users Page
- Allow admin to control default model access for internal users
4. Fix session handling with cookies
## Logging / Guardrail Integrations
1. Fix prometheus metrics w/ custom metrics, when keys containing team_id make requests. [PR](https://github.com/BerriAI/litellm/pull/8935)
## Performance / Loadbalancing / Reliability improvements
1. Cooldowns - Support cooldowns on models called with client side credentials. [Get Started](https://docs.litellm.ai/docs/proxy/clientside_auth#pass-user-llm-api-keys--api-base)
2. Tag-based Routing - ensures tag-based routing across all endpoints (`/embeddings`, `/image_generation`, etc.). [Get Started](https://docs.litellm.ai/docs/proxy/tag_routing)
## General Proxy Improvements
1. Raise BadRequestError when unknown model passed in request
2. Enforce model access restrictions on Azure OpenAI proxy route
3. Reliability fix - Handle emojis in text - fix orjson error
4. Model Access Patch - don't overwrite litellm.anthropic_models when running auth checks
5. Enable setting timezone information in docker image
## Complete Git Diff
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.61.20-stable...v1.63.2-stable)

View file

@ -41,10 +41,12 @@ const sidebars = {
"proxy/deploy",
"proxy/prod",
"proxy/cli",
"proxy/release_cycle",
"proxy/model_management",
"proxy/health",
"proxy/debugging",
"proxy/spending_monitoring",
"proxy/master_key_rotations",
],
},
"proxy/demo",
@ -242,6 +244,7 @@ const sidebars = {
"completion/document_understanding",
"completion/vision",
"completion/json_mode",
"reasoning_content",
"completion/prompt_caching",
"completion/predict_outputs",
"completion/prefix",
@ -254,13 +257,19 @@ const sidebars = {
"completion/batching",
"completion/mock_requests",
"completion/reliable_completions",
'tutorials/litellm_proxy_aporia',
]
},
{
type: "category",
label: "Supported Endpoints",
link: {
type: "generated-index",
title: "Supported Endpoints",
description:
"Learn how to deploy + call models from different providers on LiteLLM",
slug: "/supported_endpoints",
},
items: [
{
type: "category",
@ -279,6 +288,7 @@ const sidebars = {
},
"text_completion",
"embedding/supported_embedding",
"anthropic_unified",
{
type: "category",
label: "Image",
@ -348,23 +358,6 @@ const sidebars = {
label: "LangChain, LlamaIndex, Instructor Integration",
items: ["langchain/langchain", "tutorials/instructor"],
},
{
type: "category",
label: "Tutorials",
items: [
'tutorials/azure_openai',
'tutorials/instructor',
"tutorials/gradio_integration",
"tutorials/huggingface_codellama",
"tutorials/huggingface_tutorial",
"tutorials/TogetherAI_liteLLM",
"tutorials/finetuned_chat_gpt",
"tutorials/text_completion",
"tutorials/first_playground",
"tutorials/model_fallbacks",
],
},
],
},
{
@ -422,6 +415,31 @@ const sidebars = {
"observability/opik_integration",
],
},
{
type: "category",
label: "Tutorials",
items: [
"tutorials/openweb_ui",
'tutorials/litellm_proxy_aporia',
{
type: "category",
label: "LiteLLM Python SDK Tutorials",
items: [
'tutorials/azure_openai',
'tutorials/instructor',
"tutorials/gradio_integration",
"tutorials/huggingface_codellama",
"tutorials/huggingface_tutorial",
"tutorials/TogetherAI_liteLLM",
"tutorials/finetuned_chat_gpt",
"tutorials/text_completion",
"tutorials/first_playground",
"tutorials/model_fallbacks",
],
},
]
},
{
type: "category",
@ -444,6 +462,7 @@ const sidebars = {
items: [
"projects/smolagents",
"projects/Docq.AI",
"projects/PDL",
"projects/OpenInterpreter",
"projects/Elroy",
"projects/dbally",
@ -459,6 +478,7 @@ const sidebars = {
"projects/YiVal",
"projects/LiteLLM Proxy",
"projects/llm_cord",
"projects/pgai",
],
},
"contributing",

View file

@ -53,6 +53,7 @@ from litellm.constants import (
cohere_embedding_models,
bedrock_embedding_models,
known_tokenizer_config,
BEDROCK_INVOKE_PROVIDERS_LITERAL,
)
from litellm.types.guardrails import GuardrailItem
from litellm.proxy._types import (
@ -276,8 +277,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None
force_ipv4: bool = (
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
)
@ -361,17 +360,7 @@ BEDROCK_CONVERSE_MODELS = [
"meta.llama3-2-11b-instruct-v1:0",
"meta.llama3-2-90b-instruct-v1:0",
]
BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
"cohere",
"anthropic",
"mistral",
"amazon",
"meta",
"llama",
"ai21",
"nova",
"deepseek_r1",
]
####### COMPLETION MODELS ###################
open_ai_chat_completion_models: List = []
open_ai_text_completion_models: List = []
@ -809,9 +798,6 @@ from .llms.oobabooga.chat.transformation import OobaboogaConfig
from .llms.maritalk import MaritalkConfig
from .llms.openrouter.chat.transformation import OpenrouterConfig
from .llms.anthropic.chat.transformation import AnthropicConfig
from .llms.anthropic.experimental_pass_through.transformation import (
AnthropicExperimentalPassThroughConfig,
)
from .llms.groq.stt.transformation import GroqSTTConfig
from .llms.anthropic.completion.transformation import AnthropicTextConfig
from .llms.triton.completion.transformation import TritonConfig
@ -830,6 +816,9 @@ from .llms.infinity.rerank.transformation import InfinityRerankConfig
from .llms.jina_ai.rerank.transformation import JinaAIRerankConfig
from .llms.clarifai.chat.transformation import ClarifaiConfig
from .llms.ai21.chat.transformation import AI21ChatConfig, AI21ChatConfig as AI21Config
from .llms.anthropic.experimental_pass_through.messages.transformation import (
AnthropicMessagesConfig,
)
from .llms.together_ai.chat import TogetherAIConfig
from .llms.together_ai.completion.transformation import TogetherAITextCompletionConfig
from .llms.cloudflare.chat.transformation import CloudflareChatConfig
@ -1020,6 +1009,7 @@ from .assistants.main import *
from .batches.main import *
from .batch_completion.main import * # type: ignore
from .rerank_api.main import *
from .llms.anthropic.experimental_pass_through.messages.handler import *
from .realtime_api.main import _arealtime
from .fine_tuning.main import *
from .files.main import *

View file

@ -1,186 +0,0 @@
# What is this?
## Translates OpenAI call to Anthropic `/v1/messages` format
import traceback
from typing import Any, Optional
import litellm
from litellm import ChatCompletionRequest, verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
from litellm.types.utils import AdapterCompletionStreamWrapper, ModelResponse
class AnthropicAdapter(CustomLogger):
def __init__(self) -> None:
super().__init__()
def translate_completion_input_params(
self, kwargs
) -> Optional[ChatCompletionRequest]:
"""
- translate params, where needed
- pass rest, as is
"""
request_body = AnthropicMessagesRequest(**kwargs) # type: ignore
translated_body = litellm.AnthropicExperimentalPassThroughConfig().translate_anthropic_to_openai(
anthropic_message_request=request_body
)
return translated_body
def translate_completion_output_params(
self, response: ModelResponse
) -> Optional[AnthropicResponse]:
return litellm.AnthropicExperimentalPassThroughConfig().translate_openai_response_to_anthropic(
response=response
)
def translate_completion_output_params_streaming(
self, completion_stream: Any
) -> AdapterCompletionStreamWrapper | None:
return AnthropicStreamWrapper(completion_stream=completion_stream)
anthropic_adapter = AnthropicAdapter()
class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
"""
- first chunk return 'message_start'
- content block must be started and stopped
- finish_reason must map exactly to anthropic reason, else anthropic client won't be able to parse it.
"""
sent_first_chunk: bool = False
sent_content_block_start: bool = False
sent_content_block_finish: bool = False
sent_last_message: bool = False
holding_chunk: Optional[Any] = None
def __next__(self):
try:
if self.sent_first_chunk is False:
self.sent_first_chunk = True
return {
"type": "message_start",
"message": {
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
"type": "message",
"role": "assistant",
"content": [],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": None,
"stop_sequence": None,
"usage": {"input_tokens": 25, "output_tokens": 1},
},
}
if self.sent_content_block_start is False:
self.sent_content_block_start = True
return {
"type": "content_block_start",
"index": 0,
"content_block": {"type": "text", "text": ""},
}
for chunk in self.completion_stream:
if chunk == "None" or chunk is None:
raise Exception
processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
response=chunk
)
if (
processed_chunk["type"] == "message_delta"
and self.sent_content_block_finish is False
):
self.holding_chunk = processed_chunk
self.sent_content_block_finish = True
return {
"type": "content_block_stop",
"index": 0,
}
elif self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = processed_chunk
return return_chunk
else:
return processed_chunk
if self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = None
return return_chunk
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except StopIteration:
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except Exception as e:
verbose_logger.error(
"Anthropic Adapter - {}\n{}".format(e, traceback.format_exc())
)
async def __anext__(self):
try:
if self.sent_first_chunk is False:
self.sent_first_chunk = True
return {
"type": "message_start",
"message": {
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
"type": "message",
"role": "assistant",
"content": [],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": None,
"stop_sequence": None,
"usage": {"input_tokens": 25, "output_tokens": 1},
},
}
if self.sent_content_block_start is False:
self.sent_content_block_start = True
return {
"type": "content_block_start",
"index": 0,
"content_block": {"type": "text", "text": ""},
}
async for chunk in self.completion_stream:
if chunk == "None" or chunk is None:
raise Exception
processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
response=chunk
)
if (
processed_chunk["type"] == "message_delta"
and self.sent_content_block_finish is False
):
self.holding_chunk = processed_chunk
self.sent_content_block_finish = True
return {
"type": "content_block_stop",
"index": 0,
}
elif self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = processed_chunk
return return_chunk
else:
return processed_chunk
if self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = None
return return_chunk
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except StopIteration:
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopAsyncIteration

View file

@ -1,76 +1,16 @@
import asyncio
import datetime
import json
import threading
from typing import Any, List, Literal, Optional
from typing import Any, List, Literal, Tuple
import litellm
from litellm._logging import verbose_logger
from litellm.constants import (
BATCH_STATUS_POLL_INTERVAL_SECONDS,
BATCH_STATUS_POLL_MAX_ATTEMPTS,
)
from litellm.files.main import afile_content
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.types.llms.openai import Batch
from litellm.types.utils import StandardLoggingPayload, Usage
async def batches_async_logging(
batch_id: str,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
logging_obj: Optional[LiteLLMLoggingObj] = None,
**kwargs,
):
"""
Async Job waits for the batch to complete and then logs the completed batch usage - cost, total tokens, prompt tokens, completion tokens
Polls retrieve_batch until it returns a batch with status "completed" or "failed"
"""
from .main import aretrieve_batch
verbose_logger.debug(
".....in _batches_async_logging... polling retrieve to get batch status"
)
if logging_obj is None:
raise ValueError(
"logging_obj is None cannot calculate cost / log batch creation event"
)
for _ in range(BATCH_STATUS_POLL_MAX_ATTEMPTS):
try:
start_time = datetime.datetime.now()
batch: Batch = await aretrieve_batch(batch_id, custom_llm_provider)
verbose_logger.debug(
"in _batches_async_logging... batch status= %s", batch.status
)
if batch.status == "completed":
end_time = datetime.datetime.now()
await _handle_completed_batch(
batch=batch,
custom_llm_provider=custom_llm_provider,
logging_obj=logging_obj,
start_time=start_time,
end_time=end_time,
**kwargs,
)
break
elif batch.status == "failed":
pass
except Exception as e:
verbose_logger.error("error in batches_async_logging", e)
await asyncio.sleep(BATCH_STATUS_POLL_INTERVAL_SECONDS)
from litellm.types.utils import CallTypes, Usage
async def _handle_completed_batch(
batch: Batch,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"],
logging_obj: LiteLLMLoggingObj,
start_time: datetime.datetime,
end_time: datetime.datetime,
**kwargs,
) -> None:
) -> Tuple[float, Usage, List[str]]:
"""Helper function to process a completed batch and handle logging"""
# Get batch results
file_content_dictionary = await _get_batch_output_file_content_as_dictionary(
@ -87,52 +27,25 @@ async def _handle_completed_batch(
custom_llm_provider=custom_llm_provider,
)
# Handle logging
await _log_completed_batch(
logging_obj=logging_obj,
batch_usage=batch_usage,
batch_cost=batch_cost,
start_time=start_time,
end_time=end_time,
**kwargs,
)
batch_models = _get_batch_models_from_file_content(file_content_dictionary)
return batch_cost, batch_usage, batch_models
async def _log_completed_batch(
logging_obj: LiteLLMLoggingObj,
batch_usage: Usage,
batch_cost: float,
start_time: datetime.datetime,
end_time: datetime.datetime,
**kwargs,
) -> None:
"""Helper function to handle all logging operations for a completed batch"""
logging_obj.call_type = "batch_success"
standard_logging_object = _create_standard_logging_object_for_completed_batch(
kwargs=kwargs,
start_time=start_time,
end_time=end_time,
logging_obj=logging_obj,
batch_usage_object=batch_usage,
response_cost=batch_cost,
)
logging_obj.model_call_details["standard_logging_object"] = standard_logging_object
# Launch async and sync logging handlers
asyncio.create_task(
logging_obj.async_success_handler(
result=None,
start_time=start_time,
end_time=end_time,
cache_hit=None,
)
)
threading.Thread(
target=logging_obj.success_handler,
args=(None, start_time, end_time),
).start()
def _get_batch_models_from_file_content(
file_content_dictionary: List[dict],
) -> List[str]:
"""
Get the models from the file content
"""
batch_models = []
for _item in file_content_dictionary:
if _batch_response_was_successful(_item):
_response_body = _get_response_from_batch_job_output_file(_item)
_model = _response_body.get("model")
if _model:
batch_models.append(_model)
return batch_models
async def _batch_cost_calculator(
@ -159,6 +72,8 @@ async def _get_batch_output_file_content_as_dictionary(
"""
Get the batch output file content as a list of dictionaries
"""
from litellm.files.main import afile_content
if custom_llm_provider == "vertex_ai":
raise ValueError("Vertex AI does not support file content retrieval")
@ -208,6 +123,7 @@ def _get_batch_job_cost_from_file_content(
total_cost += litellm.completion_cost(
completion_response=_response_body,
custom_llm_provider=custom_llm_provider,
call_type=CallTypes.aretrieve_batch.value,
)
verbose_logger.debug("total_cost=%s", total_cost)
return total_cost
@ -264,30 +180,3 @@ def _batch_response_was_successful(batch_job_output_file: dict) -> bool:
"""
_response: dict = batch_job_output_file.get("response", None) or {}
return _response.get("status_code", None) == 200
def _create_standard_logging_object_for_completed_batch(
kwargs: dict,
start_time: datetime.datetime,
end_time: datetime.datetime,
logging_obj: LiteLLMLoggingObj,
batch_usage_object: Usage,
response_cost: float,
) -> StandardLoggingPayload:
"""
Create a standard logging object for a completed batch
"""
standard_logging_object = logging_obj.model_call_details.get(
"standard_logging_object", None
)
if standard_logging_object is None:
raise ValueError("unable to create standard logging object for completed batch")
# Add Completed Batch Job Usage and Response Cost
standard_logging_object["call_type"] = "batch_success"
standard_logging_object["response_cost"] = response_cost
standard_logging_object["total_tokens"] = batch_usage_object.total_tokens
standard_logging_object["prompt_tokens"] = batch_usage_object.prompt_tokens
standard_logging_object["completion_tokens"] = batch_usage_object.completion_tokens
return standard_logging_object

View file

@ -31,10 +31,9 @@ from litellm.types.llms.openai import (
RetrieveBatchRequest,
)
from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import LiteLLMBatch
from litellm.utils import client, get_litellm_params, supports_httpx_timeout
from .batch_utils import batches_async_logging
####### ENVIRONMENT VARIABLES ###################
openai_batches_instance = OpenAIBatchesAPI()
azure_batches_instance = AzureBatchesAPI()
@ -85,17 +84,6 @@ async def acreate_batch(
else:
response = init_response
# Start async logging job
if response is not None:
asyncio.create_task(
batches_async_logging(
logging_obj=kwargs.get("litellm_logging_obj", None),
batch_id=response.id,
custom_llm_provider=custom_llm_provider,
**kwargs,
)
)
return response
except Exception as e:
raise e
@ -111,7 +99,7 @@ def create_batch(
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[Batch, Coroutine[Any, Any, Batch]]:
) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
"""
Creates and executes a batch from an uploaded file of request
@ -119,21 +107,26 @@ def create_batch(
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
litellm_call_id = kwargs.get("litellm_call_id", None)
proxy_server_request = kwargs.get("proxy_server_request", None)
model_info = kwargs.get("model_info", None)
_is_async = kwargs.pop("acreate_batch", False) is True
litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
litellm_params = get_litellm_params(
custom_llm_provider=custom_llm_provider,
litellm_call_id=kwargs.get("litellm_call_id", None),
litellm_trace_id=kwargs.get("litellm_trace_id"),
litellm_metadata=kwargs.get("litellm_metadata"),
)
litellm_logging_obj.update_environment_variables(
model=None,
user=None,
optional_params=optional_params.model_dump(),
litellm_params=litellm_params,
litellm_params={
"litellm_call_id": litellm_call_id,
"proxy_server_request": proxy_server_request,
"model_info": model_info,
"metadata": metadata,
"preset_cache_key": None,
"stream_response": {},
**optional_params.model_dump(exclude_unset=True),
},
custom_llm_provider=custom_llm_provider,
)
@ -261,7 +254,7 @@ def create_batch(
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
request=httpx.Request(method="create_batch", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
@ -269,6 +262,7 @@ def create_batch(
raise e
@client
async def aretrieve_batch(
batch_id: str,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
@ -276,7 +270,7 @@ async def aretrieve_batch(
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Batch:
) -> LiteLLMBatch:
"""
Async: Retrieves a batch.
@ -310,6 +304,7 @@ async def aretrieve_batch(
raise e
@client
def retrieve_batch(
batch_id: str,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
@ -317,7 +312,7 @@ def retrieve_batch(
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[Batch, Coroutine[Any, Any, Batch]]:
) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
"""
Retrieves a batch.
@ -325,9 +320,23 @@ def retrieve_batch(
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
litellm_params = get_litellm_params(
custom_llm_provider=custom_llm_provider,
litellm_call_id=kwargs.get("litellm_call_id", None),
litellm_trace_id=kwargs.get("litellm_trace_id"),
litellm_metadata=kwargs.get("litellm_metadata"),
)
litellm_logging_obj.update_environment_variables(
model=None,
user=None,
optional_params=optional_params.model_dump(),
litellm_params=litellm_params,
custom_llm_provider=custom_llm_provider,
)
if (
timeout is not None

View file

@ -13,26 +13,14 @@ import json
import time
import traceback
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Union
from typing import Any, Dict, List, Optional, Union
from openai.types.audio.transcription_create_params import TranscriptionCreateParams
from openai.types.chat.completion_create_params import (
CompletionCreateParamsNonStreaming,
CompletionCreateParamsStreaming,
)
from openai.types.completion_create_params import (
CompletionCreateParamsNonStreaming as TextCompletionCreateParamsNonStreaming,
)
from openai.types.completion_create_params import (
CompletionCreateParamsStreaming as TextCompletionCreateParamsStreaming,
)
from openai.types.embedding_create_params import EmbeddingCreateParams
from pydantic import BaseModel
import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.types.caching import *
from litellm.types.rerank import RerankRequest
from litellm.types.utils import all_litellm_params
from .base_cache import BaseCache
@ -257,7 +245,7 @@ class Cache:
verbose_logger.debug("\nReturning preset cache key: %s", preset_cache_key)
return preset_cache_key
combined_kwargs = self._get_relevant_args_to_use_for_cache_key()
combined_kwargs = ModelParamHelper._get_all_llm_api_params()
litellm_param_kwargs = all_litellm_params
for param in kwargs:
if param in combined_kwargs:
@ -364,76 +352,6 @@ class Cache:
if "litellm_params" in kwargs:
kwargs["litellm_params"]["preset_cache_key"] = preset_cache_key
def _get_relevant_args_to_use_for_cache_key(self) -> Set[str]:
"""
Gets the supported kwargs for each call type and combines them
"""
chat_completion_kwargs = self._get_litellm_supported_chat_completion_kwargs()
text_completion_kwargs = self._get_litellm_supported_text_completion_kwargs()
embedding_kwargs = self._get_litellm_supported_embedding_kwargs()
transcription_kwargs = self._get_litellm_supported_transcription_kwargs()
rerank_kwargs = self._get_litellm_supported_rerank_kwargs()
exclude_kwargs = self._get_kwargs_to_exclude_from_cache_key()
combined_kwargs = chat_completion_kwargs.union(
text_completion_kwargs,
embedding_kwargs,
transcription_kwargs,
rerank_kwargs,
)
combined_kwargs = combined_kwargs.difference(exclude_kwargs)
return combined_kwargs
def _get_litellm_supported_chat_completion_kwargs(self) -> Set[str]:
"""
Get the litellm supported chat completion kwargs
This follows the OpenAI API Spec
"""
all_chat_completion_kwargs = set(
CompletionCreateParamsNonStreaming.__annotations__.keys()
).union(set(CompletionCreateParamsStreaming.__annotations__.keys()))
return all_chat_completion_kwargs
def _get_litellm_supported_text_completion_kwargs(self) -> Set[str]:
"""
Get the litellm supported text completion kwargs
This follows the OpenAI API Spec
"""
all_text_completion_kwargs = set(
TextCompletionCreateParamsNonStreaming.__annotations__.keys()
).union(set(TextCompletionCreateParamsStreaming.__annotations__.keys()))
return all_text_completion_kwargs
def _get_litellm_supported_rerank_kwargs(self) -> Set[str]:
"""
Get the litellm supported rerank kwargs
"""
return set(RerankRequest.model_fields.keys())
def _get_litellm_supported_embedding_kwargs(self) -> Set[str]:
"""
Get the litellm supported embedding kwargs
This follows the OpenAI API Spec
"""
return set(EmbeddingCreateParams.__annotations__.keys())
def _get_litellm_supported_transcription_kwargs(self) -> Set[str]:
"""
Get the litellm supported transcription kwargs
This follows the OpenAI API Spec
"""
return set(TranscriptionCreateParams.__annotations__.keys())
def _get_kwargs_to_exclude_from_cache_key(self) -> Set[str]:
"""
Get the kwargs to exclude from the cache key
"""
return set(["metadata"])
@staticmethod
def _get_hashed_cache_key(cache_key: str) -> str:
"""

View file

@ -247,7 +247,6 @@ class LLMCachingHandler:
pass
else:
call_type = original_function.__name__
cached_result = self._convert_cached_result_to_model_response(
cached_result=cached_result,
call_type=call_type,
@ -725,6 +724,7 @@ class LLMCachingHandler:
"""
Sync internal method to add the result to the cache
"""
new_kwargs = kwargs.copy()
new_kwargs.update(
convert_args_to_kwargs(
@ -738,6 +738,7 @@ class LLMCachingHandler:
if self._should_store_result_in_cache(
original_function=self.original_function, kwargs=new_kwargs
):
litellm.cache.add_cache(result, **new_kwargs)
return

View file

@ -543,6 +543,7 @@ class RedisCache(BaseCache):
_redis_client: Redis = self.init_async_client() # type: ignore
start_time = time.time()
_used_ttl = self.get_ttl(ttl=ttl)
key = self.check_and_fix_namespace(key=key)
try:
result = await _redis_client.incrbyfloat(name=key, amount=value)
if _used_ttl is not None:

View file

@ -1,4 +1,4 @@
from typing import List
from typing import List, Literal
ROUTER_MAX_FALLBACKS = 5
DEFAULT_BATCH_SIZE = 512
@ -120,6 +120,7 @@ OPENAI_CHAT_COMPLETION_PARAMS = [
"top_logprobs",
"reasoning_effort",
"extra_headers",
"thinking",
]
openai_compatible_endpoints: List = [
@ -319,6 +320,17 @@ baseten_models: List = [
"31dxrj3",
] # FALCON 7B # WizardLM # Mosaic ML
BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
"cohere",
"anthropic",
"mistral",
"amazon",
"meta",
"llama",
"ai21",
"nova",
"deepseek_r1",
]
open_ai_embedding_models: List = ["text-embedding-ada-002"]
cohere_embedding_models: List = [

View file

@ -239,6 +239,15 @@ def cost_per_token( # noqa: PLR0915
custom_llm_provider=custom_llm_provider,
billed_units=rerank_billed_units,
)
elif (
call_type == "aretrieve_batch"
or call_type == "retrieve_batch"
or call_type == CallTypes.aretrieve_batch
or call_type == CallTypes.retrieve_batch
):
return batch_cost_calculator(
usage=usage_block, model=model, custom_llm_provider=custom_llm_provider
)
elif call_type == "atranscription" or call_type == "transcription":
return openai_cost_per_second(
model=model,
@ -399,9 +408,12 @@ def _select_model_name_for_cost_calc(
if base_model is not None:
return_model = base_model
completion_response_model: Optional[str] = getattr(
completion_response, "model", None
)
completion_response_model: Optional[str] = None
if completion_response is not None:
if isinstance(completion_response, BaseModel):
completion_response_model = getattr(completion_response, "model", None)
elif isinstance(completion_response, dict):
completion_response_model = completion_response.get("model", None)
hidden_params: Optional[dict] = getattr(completion_response, "_hidden_params", None)
if completion_response_model is None and hidden_params is not None:
if (
@ -957,3 +969,54 @@ def default_image_cost_calculator(
)
return cost_info["input_cost_per_pixel"] * height * width * n
def batch_cost_calculator(
usage: Usage,
model: str,
custom_llm_provider: Optional[str] = None,
) -> Tuple[float, float]:
"""
Calculate the cost of a batch job
"""
_, custom_llm_provider, _, _ = litellm.get_llm_provider(
model=model, custom_llm_provider=custom_llm_provider
)
verbose_logger.info(
"Calculating batch cost per token. model=%s, custom_llm_provider=%s",
model,
custom_llm_provider,
)
try:
model_info: Optional[ModelInfo] = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
except Exception:
model_info = None
if not model_info:
return 0.0, 0.0
input_cost_per_token_batches = model_info.get("input_cost_per_token_batches")
input_cost_per_token = model_info.get("input_cost_per_token")
output_cost_per_token_batches = model_info.get("output_cost_per_token_batches")
output_cost_per_token = model_info.get("output_cost_per_token")
total_prompt_cost = 0.0
total_completion_cost = 0.0
if input_cost_per_token_batches:
total_prompt_cost = usage.prompt_tokens * input_cost_per_token_batches
elif input_cost_per_token:
total_prompt_cost = (
usage.prompt_tokens * (input_cost_per_token) / 2
) # batch cost is usually half of the regular token cost
if output_cost_per_token_batches:
total_completion_cost = usage.completion_tokens * output_cost_per_token_batches
elif output_cost_per_token:
total_completion_cost = (
usage.completion_tokens * (output_cost_per_token) / 2
) # batch cost is usually half of the regular token cost
return total_prompt_cost, total_completion_cost

View file

@ -816,7 +816,7 @@ def file_content(
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'file_content'. Only 'openai' and 'azure' are supported.".format(
message="LiteLLM doesn't support {} for 'custom_llm_provider'. Supported providers are 'openai', 'azure', 'vertex_ai'.".format(
custom_llm_provider
),
model="n/a",

View file

@ -577,6 +577,4 @@ class DataDogLogger(
start_time_utc: Optional[datetimeObj],
end_time_utc: Optional[datetimeObj],
) -> Optional[dict]:
raise NotImplementedError(
"Datdog Integration for getting request/response payloads not implemented as yet"
)
pass

View file

@ -40,6 +40,7 @@ in_memory_dynamic_logger_cache = DynamicLoggingCache()
def langfuse_client_init(
langfuse_public_key=None,
langfuse_secret=None,
langfuse_secret_key=None,
langfuse_host=None,
flush_interval=1,
) -> LangfuseClass:
@ -67,7 +68,10 @@ def langfuse_client_init(
)
# Instance variables
secret_key = langfuse_secret or os.getenv("LANGFUSE_SECRET_KEY")
secret_key = (
langfuse_secret or langfuse_secret_key or os.getenv("LANGFUSE_SECRET_KEY")
)
public_key = langfuse_public_key or os.getenv("LANGFUSE_PUBLIC_KEY")
langfuse_host = langfuse_host or os.getenv(
"LANGFUSE_HOST", "https://cloud.langfuse.com"
@ -190,6 +194,7 @@ class LangfusePromptManagement(LangFuseLogger, PromptManagementBase, CustomLogge
langfuse_client = langfuse_client_init(
langfuse_public_key=dynamic_callback_params.get("langfuse_public_key"),
langfuse_secret=dynamic_callback_params.get("langfuse_secret"),
langfuse_secret_key=dynamic_callback_params.get("langfuse_secret_key"),
langfuse_host=dynamic_callback_params.get("langfuse_host"),
)
langfuse_prompt_client = self._get_prompt_from_id(
@ -206,6 +211,7 @@ class LangfusePromptManagement(LangFuseLogger, PromptManagementBase, CustomLogge
langfuse_client = langfuse_client_init(
langfuse_public_key=dynamic_callback_params.get("langfuse_public_key"),
langfuse_secret=dynamic_callback_params.get("langfuse_secret"),
langfuse_secret_key=dynamic_callback_params.get("langfuse_secret_key"),
langfuse_host=dynamic_callback_params.get("langfuse_host"),
)
langfuse_prompt_client = self._get_prompt_from_id(

View file

@ -1560,10 +1560,18 @@ class PrometheusLogger(CustomLogger):
- Max Budget
- Budget Reset At
"""
self.litellm_remaining_team_budget_metric.labels(
team.team_id,
team.team_alias or "",
).set(
enum_values = UserAPIKeyLabelValues(
team=team.team_id,
team_alias=team.team_alias or "",
)
_labels = prometheus_label_factory(
supported_enum_labels=PrometheusMetricLabels.get_labels(
label_name="litellm_remaining_team_budget_metric"
),
enum_values=enum_values,
)
self.litellm_remaining_team_budget_metric.labels(**_labels).set(
self._safe_get_remaining_budget(
max_budget=team.max_budget,
spend=team.spend,
@ -1571,16 +1579,22 @@ class PrometheusLogger(CustomLogger):
)
if team.max_budget is not None:
self.litellm_team_max_budget_metric.labels(
team.team_id,
team.team_alias or "",
).set(team.max_budget)
_labels = prometheus_label_factory(
supported_enum_labels=PrometheusMetricLabels.get_labels(
label_name="litellm_team_max_budget_metric"
),
enum_values=enum_values,
)
self.litellm_team_max_budget_metric.labels(**_labels).set(team.max_budget)
if team.budget_reset_at is not None:
self.litellm_team_budget_remaining_hours_metric.labels(
team.team_id,
team.team_alias or "",
).set(
_labels = prometheus_label_factory(
supported_enum_labels=PrometheusMetricLabels.get_labels(
label_name="litellm_team_budget_remaining_hours_metric"
),
enum_values=enum_values,
)
self.litellm_team_budget_remaining_hours_metric.labels(**_labels).set(
self._get_remaining_hours_for_budget_reset(
budget_reset_at=team.budget_reset_at
)

View file

@ -73,8 +73,19 @@ def remove_index_from_tool_calls(
def get_litellm_metadata_from_kwargs(kwargs: dict):
"""
Helper to get litellm metadata from all litellm request kwargs
Return `litellm_metadata` if it exists, otherwise return `metadata`
"""
return kwargs.get("litellm_params", {}).get("metadata", {})
litellm_params = kwargs.get("litellm_params", {})
if litellm_params:
metadata = litellm_params.get("metadata", {})
litellm_metadata = litellm_params.get("litellm_metadata", {})
if litellm_metadata:
return litellm_metadata
elif metadata:
return metadata
return {}
# Helper functions used for OTEL logging

View file

@ -5,61 +5,69 @@ If the ddtrace package is not installed, the tracer will be a no-op.
"""
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Union
from litellm.secret_managers.main import get_secret_bool
if TYPE_CHECKING:
from ddtrace.tracer import Tracer as DD_TRACER
else:
DD_TRACER = Any
class NullSpan:
"""A no-op span implementation."""
def __enter__(self):
return self
def __exit__(self, *args):
pass
def finish(self):
pass
@contextmanager
def null_tracer(name, **kwargs):
"""Context manager that yields a no-op span."""
yield NullSpan()
class NullTracer:
"""A no-op tracer implementation."""
def trace(self, name, **kwargs):
return NullSpan()
def wrap(self, name=None, **kwargs):
# If called with no arguments (as @tracer.wrap())
if callable(name):
return name
# If called with arguments (as @tracer.wrap(name="something"))
def decorator(f):
return f
return decorator
def _should_use_dd_tracer():
"""
Returns True if `USE_DDTRACE` is set to True in .env
"""
"""Returns True if `USE_DDTRACE` is set to True in .env"""
return get_secret_bool("USE_DDTRACE", False) is True
has_ddtrace = False
try:
from ddtrace import tracer as dd_tracer
# Initialize tracer
should_use_dd_tracer = _should_use_dd_tracer()
tracer: Union[NullTracer, DD_TRACER] = NullTracer()
# We need to ensure tracer is never None and always has the required methods
if should_use_dd_tracer:
try:
from ddtrace import tracer as dd_tracer
if _should_use_dd_tracer():
has_ddtrace = True
except ImportError:
has_ddtrace = False
@contextmanager
def null_tracer(name, **kwargs):
class NullSpan:
def __enter__(self):
return self
def __exit__(self, *args):
pass
def finish(self):
pass
yield NullSpan()
class NullTracer:
def trace(self, name, **kwargs):
class NullSpan:
def __enter__(self):
return self
def __exit__(self, *args):
pass
def finish(self):
pass
return NullSpan()
def wrap(self, name=None, **kwargs):
def decorator(f):
return f
return decorator
dd_tracer = NullTracer()
# Export the tracer instance
tracer = dd_tracer
# Define the type to match what's expected by the code using this module
tracer = dd_tracer
except ImportError:
tracer = NullTracer()
else:
tracer = NullTracer()

View file

@ -278,6 +278,7 @@ def exception_type( # type: ignore # noqa: PLR0915
"This model's maximum context length is" in error_str
or "string too long. Expected a string with maximum length"
in error_str
or "model's maximum context limit" in error_str
):
exception_mapping_worked = True
raise ContextWindowExceededError(
@ -692,6 +693,13 @@ def exception_type( # type: ignore # noqa: PLR0915
response=getattr(original_exception, "response", None),
litellm_debug_info=extra_information,
)
elif "model's maximum context limit" in error_str:
exception_mapping_worked = True
raise ContextWindowExceededError(
message=f"{custom_llm_provider}Exception: Context Window Error - {error_str}",
model=model,
llm_provider=custom_llm_provider,
)
elif "token_quota_reached" in error_str:
exception_mapping_worked = True
raise RateLimitError(

View file

@ -57,6 +57,7 @@ def get_litellm_params(
prompt_variables: Optional[dict] = None,
async_call: Optional[bool] = None,
ssl_verify: Optional[bool] = None,
merge_reasoning_content_in_choices: Optional[bool] = None,
**kwargs,
) -> dict:
litellm_params = {
@ -75,7 +76,7 @@ def get_litellm_params(
"model_info": model_info,
"proxy_server_request": proxy_server_request,
"preset_cache_key": preset_cache_key,
"no-log": no_log,
"no-log": no_log or kwargs.get("no-log"),
"stream_response": {}, # litellm_call_id: ModelResponse Dict
"input_cost_per_token": input_cost_per_token,
"input_cost_per_second": input_cost_per_second,
@ -97,5 +98,6 @@ def get_litellm_params(
"prompt_variables": prompt_variables,
"async_call": async_call,
"ssl_verify": ssl_verify,
"merge_reasoning_content_in_choices": merge_reasoning_content_in_choices,
}
return litellm_params

View file

@ -3,7 +3,6 @@
# Logging function -> log the exact model details + what's being sent | Non-Blocking
import copy
import datetime
from functools import lru_cache
import json
import os
import re
@ -13,6 +12,7 @@ import time
import traceback
import uuid
from datetime import datetime as dt_object
from functools import lru_cache
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, cast
from pydantic import BaseModel
@ -25,6 +25,7 @@ from litellm import (
turn_off_message_logging,
)
from litellm._logging import _is_debugging_on, verbose_logger
from litellm.batches.batch_utils import _handle_completed_batch
from litellm.caching.caching import DualCache, InMemoryCache
from litellm.caching.caching_handler import LLMCachingHandler
from litellm.cost_calculator import _select_model_name_for_cost_calc
@ -33,6 +34,7 @@ from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.mlflow import MlflowLogger
from litellm.integrations.pagerduty.pagerduty import PagerDutyAlerting
from litellm.litellm_core_utils.get_litellm_params import get_litellm_params
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.litellm_core_utils.redact_messages import (
redact_message_input_output_from_custom_logger,
redact_message_input_output_from_logging,
@ -49,9 +51,11 @@ from litellm.types.utils import (
CallTypes,
EmbeddingResponse,
ImageResponse,
LiteLLMBatch,
LiteLLMLoggingBaseClass,
ModelResponse,
ModelResponseStream,
RawRequestTypedDict,
StandardCallbackDynamicParams,
StandardLoggingAdditionalHeaders,
StandardLoggingHiddenParams,
@ -202,6 +206,7 @@ class Logging(LiteLLMLoggingBaseClass):
] = None,
applied_guardrails: Optional[List[str]] = None,
kwargs: Optional[Dict] = None,
log_raw_request_response: bool = False,
):
_input: Optional[str] = messages # save original value of messages
if messages is not None:
@ -230,6 +235,7 @@ class Logging(LiteLLMLoggingBaseClass):
self.sync_streaming_chunks: List[Any] = (
[]
) # for generating complete stream response
self.log_raw_request_response = log_raw_request_response
# Initialize dynamic callbacks
self.dynamic_input_callbacks: Optional[
@ -450,6 +456,18 @@ class Logging(LiteLLMLoggingBaseClass):
return model, messages, non_default_params
def _get_raw_request_body(self, data: Optional[Union[dict, str]]) -> dict:
if data is None:
return {"error": "Received empty dictionary for raw request body"}
if isinstance(data, str):
try:
return json.loads(data)
except Exception:
return {
"error": "Unable to parse raw request body. Got - {}".format(data)
}
return data
def _pre_call(self, input, api_key, model=None, additional_args={}):
"""
Common helper function across the sync + async pre-call function
@ -465,6 +483,7 @@ class Logging(LiteLLMLoggingBaseClass):
self.model_call_details["model"] = model
def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915
# Log the exact input to the LLM API
litellm.error_logs["PRE_CALL"] = locals()
try:
@ -482,28 +501,54 @@ class Logging(LiteLLMLoggingBaseClass):
additional_args=additional_args,
)
# log raw request to provider (like LangFuse) -- if opted in.
if log_raw_request_response is True:
if (
self.log_raw_request_response is True
or log_raw_request_response is True
):
_litellm_params = self.model_call_details.get("litellm_params", {})
_metadata = _litellm_params.get("metadata", {}) or {}
try:
# [Non-blocking Extra Debug Information in metadata]
if (
turn_off_message_logging is not None
and turn_off_message_logging is True
):
if turn_off_message_logging is True:
_metadata["raw_request"] = (
"redacted by litellm. \
'litellm.turn_off_message_logging=True'"
)
else:
curl_command = self._get_request_curl_command(
api_base=additional_args.get("api_base", ""),
headers=additional_args.get("headers", {}),
additional_args=additional_args,
data=additional_args.get("complete_input_dict", {}),
)
_metadata["raw_request"] = str(curl_command)
# split up, so it's easier to parse in the UI
self.model_call_details["raw_request_typed_dict"] = (
RawRequestTypedDict(
raw_request_api_base=str(
additional_args.get("api_base") or ""
),
raw_request_body=self._get_raw_request_body(
additional_args.get("complete_input_dict", {})
),
raw_request_headers=self._get_masked_headers(
additional_args.get("headers", {}) or {},
ignore_sensitive_headers=True,
),
error=None,
)
)
except Exception as e:
self.model_call_details["raw_request_typed_dict"] = (
RawRequestTypedDict(
error=str(e),
)
)
traceback.print_exc()
_metadata["raw_request"] = (
"Unable to Log \
raw request: {}".format(
@ -636,9 +681,14 @@ class Logging(LiteLLMLoggingBaseClass):
)
verbose_logger.debug(f"\033[92m{curl_command}\033[0m\n")
def _get_request_body(self, data: dict) -> str:
return str(data)
def _get_request_curl_command(
self, api_base: str, headers: dict, additional_args: dict, data: dict
self, api_base: str, headers: Optional[dict], additional_args: dict, data: dict
) -> str:
if headers is None:
headers = {}
curl_command = "\n\nPOST Request Sent from LiteLLM:\n"
curl_command += "curl -X POST \\\n"
curl_command += f"{api_base} \\\n"
@ -646,11 +696,10 @@ class Logging(LiteLLMLoggingBaseClass):
formatted_headers = " ".join(
[f"-H '{k}: {v}'" for k, v in masked_headers.items()]
)
curl_command += (
f"{formatted_headers} \\\n" if formatted_headers.strip() != "" else ""
)
curl_command += f"-d '{str(data)}'\n"
curl_command += f"-d '{self._get_request_body(data)}'\n"
if additional_args.get("request_str", None) is not None:
# print the sagemaker / bedrock client request
curl_command = "\nRequest Sent from LiteLLM:\n"
@ -659,12 +708,20 @@ class Logging(LiteLLMLoggingBaseClass):
curl_command = str(self.model_call_details)
return curl_command
def _get_masked_headers(self, headers: dict):
def _get_masked_headers(
self, headers: dict, ignore_sensitive_headers: bool = False
) -> dict:
"""
Internal debugging helper function
Masks the headers of the request sent from LiteLLM
"""
sensitive_keywords = [
"authorization",
"token",
"key",
"secret",
]
return {
k: (
(v[:-44] + "*" * 44)
@ -672,6 +729,11 @@ class Logging(LiteLLMLoggingBaseClass):
else "*****"
)
for k, v in headers.items()
if not ignore_sensitive_headers
or not any(
sensitive_keyword in k.lower()
for sensitive_keyword in sensitive_keywords
)
}
def post_call(
@ -870,6 +932,24 @@ class Logging(LiteLLMLoggingBaseClass):
return None
async def _response_cost_calculator_async(
self,
result: Union[
ModelResponse,
ModelResponseStream,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
TextCompletionResponse,
HttpxBinaryResponseContent,
RerankResponse,
Batch,
FineTuningJob,
],
cache_hit: Optional[bool] = None,
) -> Optional[float]:
return self._response_cost_calculator(result=result, cache_hit=cache_hit)
def should_run_callback(
self, callback: litellm.CALLBACK_TYPES, litellm_params: dict, event_hook: str
) -> bool:
@ -911,6 +991,9 @@ class Logging(LiteLLMLoggingBaseClass):
self.model_call_details["log_event_type"] = "successful_api_call"
self.model_call_details["end_time"] = end_time
self.model_call_details["cache_hit"] = cache_hit
if self.call_type == CallTypes.anthropic_messages.value:
result = self._handle_anthropic_messages_response_logging(result=result)
## if model in model cost map - log the response cost
## else set cost to None
if (
@ -927,8 +1010,8 @@ class Logging(LiteLLMLoggingBaseClass):
or isinstance(result, TextCompletionResponse)
or isinstance(result, HttpxBinaryResponseContent) # tts
or isinstance(result, RerankResponse)
or isinstance(result, Batch)
or isinstance(result, FineTuningJob)
or isinstance(result, LiteLLMBatch)
):
## HIDDEN PARAMS ##
hidden_params = getattr(result, "_hidden_params", {})
@ -1524,6 +1607,20 @@ class Logging(LiteLLMLoggingBaseClass):
print_verbose(
"Logging Details LiteLLM-Async Success Call, cache_hit={}".format(cache_hit)
)
## CALCULATE COST FOR BATCH JOBS
if self.call_type == CallTypes.aretrieve_batch.value and isinstance(
result, LiteLLMBatch
):
response_cost, batch_usage, batch_models = await _handle_completed_batch(
batch=result, custom_llm_provider=self.custom_llm_provider
)
result._hidden_params["response_cost"] = response_cost
result._hidden_params["batch_models"] = batch_models
result.usage = batch_usage
start_time, end_time, result = self._success_handler_helper_fn(
start_time=start_time,
end_time=end_time,
@ -1531,6 +1628,7 @@ class Logging(LiteLLMLoggingBaseClass):
cache_hit=cache_hit,
standard_logging_object=kwargs.get("standard_logging_object", None),
)
## BUILD COMPLETE STREAMED RESPONSE
if "async_complete_streaming_response" in self.model_call_details:
return # break out of this.
@ -2269,6 +2367,37 @@ class Logging(LiteLLMLoggingBaseClass):
return complete_streaming_response
return None
def _handle_anthropic_messages_response_logging(self, result: Any) -> ModelResponse:
"""
Handles logging for Anthropic messages responses.
Args:
result: The response object from the model call
Returns:
The response object from the model call
- For non-streaming responses, we need to transform the response to a ModelResponse object.
- For streaming responses, the anthropic_messages handler calls success_handler with an assembled ModelResponse.
"""
if self.stream and isinstance(result, ModelResponse):
return result
result = litellm.AnthropicConfig().transform_response(
raw_response=self.model_call_details["httpx_response"],
model_response=litellm.ModelResponse(),
model=self.model,
messages=[],
logging_obj=self,
optional_params={},
api_key="",
request_data={},
encoding=litellm.encoding,
json_mode=False,
litellm_params={},
)
return result
def set_callbacks(callback_list, function_id=None): # noqa: PLR0915
"""
@ -2513,15 +2642,19 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
# auth can be disabled on local deployments of arize phoenix
if arize_phoenix_config.otlp_auth_headers is not None:
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = arize_phoenix_config.otlp_auth_headers
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
arize_phoenix_config.otlp_auth_headers
)
for callback in _in_memory_loggers:
if (
isinstance(callback, OpenTelemetry)
and callback.callback_name == "arize_phoenix"
):
return callback # type: ignore
_otel_logger = OpenTelemetry(config=otel_config, callback_name="arize_phoenix")
_otel_logger = OpenTelemetry(
config=otel_config, callback_name="arize_phoenix"
)
_in_memory_loggers.append(_otel_logger)
return _otel_logger # type: ignore
elif logging_integration == "otel":
@ -3081,6 +3214,7 @@ class StandardLoggingPayloadSetup:
response_cost=None,
additional_headers=None,
litellm_overhead_time_ms=None,
batch_models=None,
)
if hidden_params is not None:
for key in StandardLoggingHiddenParams.__annotations__.keys():
@ -3110,10 +3244,26 @@ class StandardLoggingPayloadSetup:
str(original_exception.__class__.__name__) if original_exception else ""
)
_llm_provider_in_exception = getattr(original_exception, "llm_provider", "")
# Get traceback information (first 100 lines)
traceback_info = ""
if original_exception:
tb = getattr(original_exception, "__traceback__", None)
if tb:
import traceback
tb_lines = traceback.format_tb(tb)
traceback_info = "".join(tb_lines[:100]) # Limit to first 100 lines
# Get additional error details
error_message = str(original_exception)
return StandardLoggingPayloadErrorInformation(
error_code=error_status,
error_class=error_class,
llm_provider=_llm_provider_in_exception,
traceback=traceback_info,
error_message=error_message if original_exception else "",
)
@staticmethod
@ -3178,6 +3328,7 @@ def get_standard_logging_object_payload(
api_base=None,
response_cost=None,
litellm_overhead_time_ms=None,
batch_models=None,
)
)
@ -3310,7 +3461,9 @@ def get_standard_logging_object_payload(
requester_ip_address=clean_metadata.get("requester_ip_address", None),
messages=kwargs.get("messages"),
response=final_response_obj,
model_parameters=kwargs.get("optional_params", None),
model_parameters=ModelParamHelper.get_standard_logging_model_parameters(
kwargs.get("optional_params", None) or {}
),
hidden_params=clean_hidden_params,
model_map_information=model_cost_information,
error_str=error_str,
@ -3460,6 +3613,7 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload:
response_cost=None,
additional_headers=None,
litellm_overhead_time_ms=None,
batch_models=None,
)
# Convert numeric values to appropriate types

View file

@ -9,6 +9,7 @@ from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union
import litellm
from litellm._logging import verbose_logger
from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
from litellm.types.llms.openai import ChatCompletionThinkingBlock
from litellm.types.utils import (
ChatCompletionDeltaToolCall,
ChatCompletionMessageToolCall,
@ -128,12 +129,7 @@ def convert_to_streaming_response(response_object: Optional[dict] = None):
model_response_object = ModelResponse(stream=True)
choice_list = []
for idx, choice in enumerate(response_object["choices"]):
delta = Delta(
content=choice["message"].get("content", None),
role=choice["message"]["role"],
function_call=choice["message"].get("function_call", None),
tool_calls=choice["message"].get("tool_calls", None),
)
delta = Delta(**choice["message"])
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
# gpt-4 vision can return 'finish_reason' or 'finish_details'
@ -243,6 +239,24 @@ def _parse_content_for_reasoning(
return None, message_text
def _extract_reasoning_content(message: dict) -> Tuple[Optional[str], Optional[str]]:
"""
Extract reasoning content and main content from a message.
Args:
message (dict): The message dictionary that may contain reasoning_content
Returns:
tuple[Optional[str], Optional[str]]: A tuple of (reasoning_content, content)
"""
if "reasoning_content" in message:
return message["reasoning_content"], message["content"]
elif "reasoning" in message:
return message["reasoning"], message["content"]
else:
return _parse_content_for_reasoning(message.get("content"))
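A rough usage sketch of the new helper (values illustrative):

# message with an explicit `reasoning_content` key
msg = {"role": "assistant", "reasoning_content": "step 1, step 2...", "content": "42"}
assert _extract_reasoning_content(msg) == ("step 1, step 2...", "42")

# message with a `reasoning` key (some OpenAI-compatible providers)
msg = {"role": "assistant", "reasoning": "thinking out loud...", "content": "done"}
assert _extract_reasoning_content(msg) == ("thinking out loud...", "done")

# neither key present -> falls back to _parse_content_for_reasoning on the content string
msg = {"role": "assistant", "content": "plain answer"}
reasoning, content = _extract_reasoning_content(msg)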
class LiteLLMResponseObjectHandler:
@staticmethod
@ -456,11 +470,16 @@ def convert_to_model_response_object( # noqa: PLR0915
provider_specific_fields[field] = choice["message"][field]
# Handle reasoning models that display `reasoning_content` within `content`
reasoning_content, content = _parse_content_for_reasoning(
choice["message"].get("content")
reasoning_content, content = _extract_reasoning_content(
choice["message"]
)
# Handle thinking models that display `thinking_blocks` within `content`
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
if "thinking_blocks" in choice["message"]:
thinking_blocks = choice["message"]["thinking_blocks"]
provider_specific_fields["thinking_blocks"] = thinking_blocks
if reasoning_content:
provider_specific_fields["reasoning_content"] = (
reasoning_content
@ -473,6 +492,8 @@ def convert_to_model_response_object( # noqa: PLR0915
tool_calls=tool_calls,
audio=choice["message"].get("audio", None),
provider_specific_fields=provider_specific_fields,
reasoning_content=reasoning_content,
thinking_blocks=thinking_blocks,
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:

View file

@ -0,0 +1,133 @@
from typing import Set
from openai.types.audio.transcription_create_params import TranscriptionCreateParams
from openai.types.chat.completion_create_params import (
CompletionCreateParamsNonStreaming,
CompletionCreateParamsStreaming,
)
from openai.types.completion_create_params import (
CompletionCreateParamsNonStreaming as TextCompletionCreateParamsNonStreaming,
)
from openai.types.completion_create_params import (
CompletionCreateParamsStreaming as TextCompletionCreateParamsStreaming,
)
from openai.types.embedding_create_params import EmbeddingCreateParams
from litellm.types.rerank import RerankRequest
class ModelParamHelper:
@staticmethod
def get_standard_logging_model_parameters(
model_parameters: dict,
) -> dict:
"""Filter model_parameters down to the supported, loggable subset (excludes prompt content and metadata)."""
standard_logging_model_parameters: dict = {}
supported_model_parameters = (
ModelParamHelper._get_relevant_args_to_use_for_logging()
)
for key, value in model_parameters.items():
if key in supported_model_parameters:
standard_logging_model_parameters[key] = value
return standard_logging_model_parameters
@staticmethod
def get_exclude_params_for_model_parameters() -> Set[str]:
return set(["messages", "prompt", "input"])
@staticmethod
def _get_relevant_args_to_use_for_logging() -> Set[str]:
"""
Gets all relevant LLM API params besides the ones containing prompt content
"""
all_openai_llm_api_params = ModelParamHelper._get_all_llm_api_params()
# Exclude parameters that contain prompt content
combined_kwargs = all_openai_llm_api_params.difference(
set(ModelParamHelper.get_exclude_params_for_model_parameters())
)
return combined_kwargs
@staticmethod
def _get_all_llm_api_params() -> Set[str]:
"""
Gets the supported kwargs for each call type and combines them
"""
chat_completion_kwargs = (
ModelParamHelper._get_litellm_supported_chat_completion_kwargs()
)
text_completion_kwargs = (
ModelParamHelper._get_litellm_supported_text_completion_kwargs()
)
embedding_kwargs = ModelParamHelper._get_litellm_supported_embedding_kwargs()
transcription_kwargs = (
ModelParamHelper._get_litellm_supported_transcription_kwargs()
)
rerank_kwargs = ModelParamHelper._get_litellm_supported_rerank_kwargs()
exclude_kwargs = ModelParamHelper._get_exclude_kwargs()
combined_kwargs = chat_completion_kwargs.union(
text_completion_kwargs,
embedding_kwargs,
transcription_kwargs,
rerank_kwargs,
)
combined_kwargs = combined_kwargs.difference(exclude_kwargs)
return combined_kwargs
@staticmethod
def _get_litellm_supported_chat_completion_kwargs() -> Set[str]:
"""
Get the litellm supported chat completion kwargs
This follows the OpenAI API Spec
"""
all_chat_completion_kwargs = set(
CompletionCreateParamsNonStreaming.__annotations__.keys()
).union(set(CompletionCreateParamsStreaming.__annotations__.keys()))
return all_chat_completion_kwargs
@staticmethod
def _get_litellm_supported_text_completion_kwargs() -> Set[str]:
"""
Get the litellm supported text completion kwargs
This follows the OpenAI API Spec
"""
all_text_completion_kwargs = set(
TextCompletionCreateParamsNonStreaming.__annotations__.keys()
).union(set(TextCompletionCreateParamsStreaming.__annotations__.keys()))
return all_text_completion_kwargs
@staticmethod
def _get_litellm_supported_rerank_kwargs() -> Set[str]:
"""
Get the litellm supported rerank kwargs
"""
return set(RerankRequest.model_fields.keys())
@staticmethod
def _get_litellm_supported_embedding_kwargs() -> Set[str]:
"""
Get the litellm supported embedding kwargs
This follows the OpenAI API Spec
"""
return set(EmbeddingCreateParams.__annotations__.keys())
@staticmethod
def _get_litellm_supported_transcription_kwargs() -> Set[str]:
"""
Get the litellm supported transcription kwargs
This follows the OpenAI API Spec
"""
return set(TranscriptionCreateParams.__annotations__.keys())
@staticmethod
def _get_exclude_kwargs() -> Set[str]:
"""
Get the kwargs to exclude from the cache key
"""
return set(["metadata"])
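A hedged sketch of how the helper is intended to be used when building the standard logging payload: prompt-bearing params (`messages`, `prompt`, `input`) and `metadata` are dropped, while OpenAI-spec params pass through.

# illustrative only
optional_params = {
    "temperature": 0.2,
    "max_tokens": 256,
    "messages": [{"role": "user", "content": "hi"}],  # excluded (prompt content)
    "metadata": {"trace_id": "abc"},  # excluded
}
logged = ModelParamHelper.get_standard_logging_model_parameters(optional_params)
# logged == {"temperature": 0.2, "max_tokens": 256}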

View file

@ -187,53 +187,125 @@ def ollama_pt(
final_prompt_value="### Response:",
messages=messages,
)
elif "llava" in model:
prompt = ""
images = []
for message in messages:
if isinstance(message["content"], str):
prompt += message["content"]
elif isinstance(message["content"], list):
# see https://docs.litellm.ai/docs/providers/openai#openai-vision-models
for element in message["content"]:
if isinstance(element, dict):
if element["type"] == "text":
prompt += element["text"]
elif element["type"] == "image_url":
base64_image = convert_to_ollama_image(
element["image_url"]["url"]
)
images.append(base64_image)
return {"prompt": prompt, "images": images}
else:
user_message_types = {"user", "tool", "function"}
msg_i = 0
images = []
prompt = ""
for message in messages:
role = message["role"]
content = message.get("content", "")
while msg_i < len(messages):
init_msg_i = msg_i
user_content_str = ""
## MERGE CONSECUTIVE USER CONTENT ##
while (
msg_i < len(messages) and messages[msg_i]["role"] in user_message_types
):
msg_content = messages[msg_i].get("content")
if msg_content:
if isinstance(msg_content, list):
for m in msg_content:
if m.get("type", "") == "image_url":
if isinstance(m["image_url"], str):
images.append(m["image_url"])
elif isinstance(m["image_url"], dict):
images.append(m["image_url"]["url"])
elif m.get("type", "") == "text":
user_content_str += m["text"]
else:
# Tool message content will always be a string
user_content_str += msg_content
if "tool_calls" in message:
tool_calls = []
msg_i += 1
for call in message["tool_calls"]:
call_id: str = call["id"]
function_name: str = call["function"]["name"]
arguments = json.loads(call["function"]["arguments"])
if user_content_str:
prompt += f"### User:\n{user_content_str}\n\n"
tool_calls.append(
{
"id": call_id,
"type": "function",
"function": {"name": function_name, "arguments": arguments},
}
assistant_content_str = ""
## MERGE CONSECUTIVE ASSISTANT CONTENT ##
while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
msg_content = messages[msg_i].get("content")
if msg_content:
if isinstance(msg_content, list):
for m in msg_content:
if m.get("type", "") == "text":
assistant_content_str += m["text"]
elif isinstance(msg_content, str):
# Tool message content will always be a string
assistant_content_str += msg_content
tool_calls = messages[msg_i].get("tool_calls")
ollama_tool_calls = []
if tool_calls:
for call in tool_calls:
call_id: str = call["id"]
function_name: str = call["function"]["name"]
arguments = json.loads(call["function"]["arguments"])
ollama_tool_calls.append(
{
"id": call_id,
"type": "function",
"function": {
"name": function_name,
"arguments": arguments,
},
}
)
if ollama_tool_calls:
assistant_content_str += (
f"Tool Calls: {json.dumps(ollama_tool_calls, indent=2)}"
)
prompt += f"### Assistant:\nTool Calls: {json.dumps(tool_calls, indent=2)}\n\n"
msg_i += 1
elif "tool_call_id" in message:
prompt += f"### User:\n{message['content']}\n\n"
if assistant_content_str:
prompt += f"### Assistant:\n{assistant_content_str}\n\n"
elif content:
prompt += f"### {role.capitalize()}:\n{content}\n\n"
if msg_i == init_msg_i: # prevent infinite loops
raise litellm.BadRequestError(
message=BAD_MESSAGE_ERROR_STR + f"passed in {messages[msg_i]}",
model=model,
llm_provider="ollama",
)
# prompt = ""
# images = []
# for message in messages:
# if isinstance(message["content"], str):
# prompt += message["content"]
# elif isinstance(message["content"], list):
# # see https://docs.litellm.ai/docs/providers/openai#openai-vision-models
# for element in message["content"]:
# if isinstance(element, dict):
# if element["type"] == "text":
# prompt += element["text"]
# elif element["type"] == "image_url":
# base64_image = convert_to_ollama_image(
# element["image_url"]["url"]
# )
# images.append(base64_image)
# if "tool_calls" in message:
# tool_calls = []
# for call in message["tool_calls"]:
# call_id: str = call["id"]
# function_name: str = call["function"]["name"]
# arguments = json.loads(call["function"]["arguments"])
# tool_calls.append(
# {
# "id": call_id,
# "type": "function",
# "function": {"name": function_name, "arguments": arguments},
# }
# )
# prompt += f"### Assistant:\nTool Calls: {json.dumps(tool_calls, indent=2)}\n\n"
# elif "tool_call_id" in message:
# prompt += f"### User:\n{message['content']}\n\n"
return {"prompt": prompt, "images": images}
return prompt
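A hedged sketch of the merged-role prompt produced by the rewritten branch (assuming the model name does not match the instruct-template branch; exact whitespace illustrative):

messages = [
    {"role": "user", "content": "What is in this image?"},
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,...."}}
        ],
    },
    {"role": "assistant", "content": "It looks like a cat."},
]
out = ollama_pt(model="llama3", messages=messages)
# out["prompt"] is roughly:
#   "### User:\nWhat is in this image?\n\n### Assistant:\nIt looks like a cat.\n\n"
# out["images"] == ["data:image/png;base64,...."]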
@ -680,12 +752,13 @@ def convert_generic_image_chunk_to_openai_image_obj(
Return:
"data:image/jpeg;base64,{base64_image}"
"""
return "data:{};{},{}".format(
image_chunk["media_type"], image_chunk["type"], image_chunk["data"]
)
media_type = image_chunk["media_type"]
return "data:{};{},{}".format(media_type, image_chunk["type"], image_chunk["data"])
def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsingChunk:
def convert_to_anthropic_image_obj(
openai_image_url: str, format: Optional[str]
) -> GenericImageParsingChunk:
"""
Input:
"image_url": "data:image/jpeg;base64,{base64_image}",
@ -702,7 +775,11 @@ def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsing
openai_image_url = convert_url_to_base64(url=openai_image_url)
# Extract the media type and base64 data
media_type, base64_data = openai_image_url.split("data:")[1].split(";base64,")
media_type = media_type.replace("\\/", "/")
if format:
media_type = format
else:
media_type = media_type.replace("\\/", "/")
return GenericImageParsingChunk(
type="base64",
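A quick behaviour sketch of the new `format` override (values illustrative): when the caller supplies a `format` on the image block, it takes precedence over the media type parsed from the data URL.

chunk = convert_to_anthropic_image_obj("data:image/jpeg;base64,/9j/4AAQ...", format=None)
# chunk["media_type"] == "image/jpeg"  (parsed from the data URL)

chunk = convert_to_anthropic_image_obj("data:image/jpeg;base64,/9j/4AAQ...", format="image/png")
# chunk["media_type"] == "image/png"   (user-supplied format wins)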
@ -820,11 +897,12 @@ def anthropic_messages_pt_xml(messages: list):
if isinstance(messages[msg_i]["content"], list):
for m in messages[msg_i]["content"]:
if m.get("type", "") == "image_url":
format = m["image_url"].get("format")
user_content.append(
{
"type": "image",
"source": convert_to_anthropic_image_obj(
m["image_url"]["url"]
m["image_url"]["url"], format=format
),
}
)
@ -1156,10 +1234,13 @@ def convert_to_anthropic_tool_result(
)
elif content["type"] == "image_url":
if isinstance(content["image_url"], str):
image_chunk = convert_to_anthropic_image_obj(content["image_url"])
else:
image_chunk = convert_to_anthropic_image_obj(
content["image_url"]["url"]
content["image_url"], format=None
)
else:
format = content["image_url"].get("format")
image_chunk = convert_to_anthropic_image_obj(
content["image_url"]["url"], format=format
)
anthropic_content_list.append(
AnthropicMessagesImageParam(
@ -1282,6 +1363,7 @@ def add_cache_control_to_content(
AnthropicMessagesImageParam,
AnthropicMessagesTextParam,
AnthropicMessagesDocumentParam,
ChatCompletionThinkingBlock,
],
orignal_content_element: Union[dict, AllMessageValues],
):
@ -1317,6 +1399,7 @@ def _anthropic_content_element_factory(
data=image_chunk["data"],
),
)
return _anthropic_content_element
@ -1368,13 +1451,16 @@ def anthropic_messages_pt( # noqa: PLR0915
for m in user_message_types_block["content"]:
if m.get("type", "") == "image_url":
m = cast(ChatCompletionImageObject, m)
format: Optional[str] = None
if isinstance(m["image_url"], str):
image_chunk = convert_to_anthropic_image_obj(
openai_image_url=m["image_url"]
openai_image_url=m["image_url"], format=None
)
else:
format = m["image_url"].get("format")
image_chunk = convert_to_anthropic_image_obj(
openai_image_url=m["image_url"]["url"]
openai_image_url=m["image_url"]["url"],
format=format,
)
_anthropic_content_element = (
@ -1454,12 +1540,23 @@ def anthropic_messages_pt( # noqa: PLR0915
assistant_content_block["content"], list
):
for m in assistant_content_block["content"]:
# handle text
# handle thinking blocks
thinking_block = cast(str, m.get("thinking", ""))
text_block = cast(str, m.get("text", ""))
if (
m.get("type", "") == "text" and len(m.get("text", "")) > 0
m.get("type", "") == "thinking" and len(thinking_block) > 0
): # don't pass empty text blocks. anthropic api raises errors.
anthropic_message: Union[
ChatCompletionThinkingBlock,
AnthropicMessagesTextParam,
] = cast(ChatCompletionThinkingBlock, m)
assistant_content.append(anthropic_message)
# handle text
elif (
m.get("type", "") == "text" and len(text_block) > 0
): # don't pass empty text blocks. anthropic api raises errors.
anthropic_message = AnthropicMessagesTextParam(
type="text", text=m.get("text")
type="text", text=text_block
)
_cached_message = add_cache_control_to_content(
anthropic_content_element=anthropic_message,
@ -1512,6 +1609,7 @@ def anthropic_messages_pt( # noqa: PLR0915
msg_i += 1
if assistant_content:
new_messages.append({"role": "assistant", "content": assistant_content})
if msg_i == init_msg_i: # prevent infinite loops
@ -1520,17 +1618,6 @@ def anthropic_messages_pt( # noqa: PLR0915
model=model,
llm_provider=llm_provider,
)
if not new_messages or new_messages[0]["role"] != "user":
if litellm.modify_params:
new_messages.insert(
0, {"role": "user", "content": [{"type": "text", "text": "."}]}
)
else:
raise Exception(
"Invalid first message={}. Should always start with 'role'='user' for Anthropic. System prompt is sent separately for Anthropic. set 'litellm.modify_params = True' or 'litellm_settings:modify_params = True' on proxy, to insert a placeholder user message - '.' as the first message, ".format(
new_messages
)
)
if new_messages[-1]["role"] == "assistant":
if isinstance(new_messages[-1]["content"], str):
@ -2151,6 +2238,10 @@ from email.message import Message
import httpx
from litellm.types.llms.bedrock import (
BedrockConverseReasoningContentBlock,
BedrockConverseReasoningTextBlock,
)
from litellm.types.llms.bedrock import ContentBlock as BedrockContentBlock
from litellm.types.llms.bedrock import DocumentBlock as BedrockDocumentBlock
from litellm.types.llms.bedrock import ImageBlock as BedrockImageBlock
@ -2297,8 +2388,11 @@ class BedrockImageProcessor:
)
@classmethod
def process_image_sync(cls, image_url: str) -> BedrockContentBlock:
def process_image_sync(
cls, image_url: str, format: Optional[str] = None
) -> BedrockContentBlock:
"""Synchronous image processing."""
if "base64" in image_url:
img_bytes, mime_type, image_format = cls._parse_base64_image(image_url)
elif "http://" in image_url or "https://" in image_url:
@ -2309,11 +2403,17 @@ class BedrockImageProcessor:
"Unsupported image type. Expected either image url or base64 encoded string"
)
if format:
mime_type = format
image_format = mime_type.split("/")[1]
image_format = cls._validate_format(mime_type, image_format)
return cls._create_bedrock_block(img_bytes, mime_type, image_format)
@classmethod
async def process_image_async(cls, image_url: str) -> BedrockContentBlock:
async def process_image_async(
cls, image_url: str, format: Optional[str]
) -> BedrockContentBlock:
"""Asynchronous image processing."""
if "base64" in image_url:
@ -2328,6 +2428,10 @@ class BedrockImageProcessor:
"Unsupported image type. Expected either image url or base64 encoded string"
)
if format: # override with user-defined params
mime_type = format
image_format = mime_type.split("/")[1]
image_format = cls._validate_format(mime_type, image_format)
return cls._create_bedrock_block(img_bytes, mime_type, image_format)
@ -2815,12 +2919,14 @@ class BedrockConverseMessagesProcessor:
_part = BedrockContentBlock(text=element["text"])
_parts.append(_part)
elif element["type"] == "image_url":
format: Optional[str] = None
if isinstance(element["image_url"], dict):
image_url = element["image_url"]["url"]
format = element["image_url"].get("format")
else:
image_url = element["image_url"]
_part = await BedrockImageProcessor.process_image_async( # type: ignore
image_url=image_url
image_url=image_url, format=format
)
_parts.append(_part) # type: ignore
_cache_point_block = (
@ -2920,7 +3026,14 @@ class BedrockConverseMessagesProcessor:
assistants_parts: List[BedrockContentBlock] = []
for element in _assistant_content:
if isinstance(element, dict):
if element["type"] == "text":
if element["type"] == "thinking":
thinking_block = BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
thinking_blocks=[
cast(ChatCompletionThinkingBlock, element)
]
)
assistants_parts.extend(thinking_block)
elif element["type"] == "text":
assistants_part = BedrockContentBlock(
text=element["text"]
)
@ -2963,6 +3076,28 @@ class BedrockConverseMessagesProcessor:
return contents
@staticmethod
def translate_thinking_blocks_to_reasoning_content_blocks(
thinking_blocks: List[ChatCompletionThinkingBlock],
) -> List[BedrockContentBlock]:
reasoning_content_blocks: List[BedrockContentBlock] = []
for thinking_block in thinking_blocks:
reasoning_text = thinking_block.get("thinking")
reasoning_signature = thinking_block.get("signature")
text_block = BedrockConverseReasoningTextBlock(
text=reasoning_text or "",
)
if reasoning_signature is not None:
text_block["signature"] = reasoning_signature
reasoning_content_block = BedrockConverseReasoningContentBlock(
reasoningText=text_block,
)
bedrock_content_block = BedrockContentBlock(
reasoningContent=reasoning_content_block
)
reasoning_content_blocks.append(bedrock_content_block)
return reasoning_content_blocks
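Roughly, the translation maps an OpenAI-style thinking block onto Bedrock's `reasoningContent` block (shape per the type definitions above; values illustrative):

thinking_blocks = [
    {"type": "thinking", "thinking": "Let me reason about this...", "signature": "sig-123"}
]
blocks = BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
    thinking_blocks=thinking_blocks
)
# blocks[0] == {
#     "reasoningContent": {
#         "reasoningText": {"text": "Let me reason about this...", "signature": "sig-123"}
#     }
# }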
def _bedrock_converse_messages_pt( # noqa: PLR0915
messages: List,
@ -3024,12 +3159,15 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
_part = BedrockContentBlock(text=element["text"])
_parts.append(_part)
elif element["type"] == "image_url":
format: Optional[str] = None
if isinstance(element["image_url"], dict):
image_url = element["image_url"]["url"]
format = element["image_url"].get("format")
else:
image_url = element["image_url"]
_part = BedrockImageProcessor.process_image_sync( # type: ignore
image_url=image_url
image_url=image_url,
format=format,
)
_parts.append(_part) # type: ignore
_cache_point_block = (
@ -3109,17 +3247,36 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
assistant_content: List[BedrockContentBlock] = []
## MERGE CONSECUTIVE ASSISTANT CONTENT ##
while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
assistant_message_block = get_assistant_message_block_or_continue_message(
message=messages[msg_i],
assistant_continue_message=assistant_continue_message,
)
_assistant_content = assistant_message_block.get("content", None)
thinking_blocks = cast(
Optional[List[ChatCompletionThinkingBlock]],
assistant_message_block.get("thinking_blocks"),
)
if thinking_blocks is not None:
assistant_content.extend(
BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
thinking_blocks
)
)
if _assistant_content is not None and isinstance(_assistant_content, list):
assistants_parts: List[BedrockContentBlock] = []
for element in _assistant_content:
if isinstance(element, dict):
if element["type"] == "text":
if element["type"] == "thinking":
thinking_block = BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
thinking_blocks=[
cast(ChatCompletionThinkingBlock, element)
]
)
assistants_parts.extend(thinking_block)
elif element["type"] == "text":
assistants_part = BedrockContentBlock(text=element["text"])
assistants_parts.append(assistants_part)
elif element["type"] == "image_url":

View file

@ -5,7 +5,7 @@ import threading
import time
import traceback
import uuid
from typing import Any, Callable, Dict, List, Optional, cast
from typing import Any, Callable, Dict, List, Optional, Union, cast
import httpx
from pydantic import BaseModel
@ -14,6 +14,8 @@ import litellm
from litellm import verbose_logger
from litellm.litellm_core_utils.redact_messages import LiteLLMLoggingObject
from litellm.litellm_core_utils.thread_pool_executor import executor
from litellm.types.llms.openai import ChatCompletionChunk
from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import Delta
from litellm.types.utils import GenericStreamingChunk as GChunk
from litellm.types.utils import (
@ -69,6 +71,17 @@ class CustomStreamWrapper:
self.completion_stream = completion_stream
self.sent_first_chunk = False
self.sent_last_chunk = False
litellm_params: GenericLiteLLMParams = GenericLiteLLMParams(
**self.logging_obj.model_call_details.get("litellm_params", {})
)
self.merge_reasoning_content_in_choices: bool = (
litellm_params.merge_reasoning_content_in_choices or False
)
self.sent_first_thinking_block = False
self.sent_last_thinking_block = False
self.thinking_content = ""
self.system_fingerprint: Optional[str] = None
self.received_finish_reason: Optional[str] = None
self.intermittent_finish_reason: Optional[str] = (
@ -86,12 +99,7 @@ class CustomStreamWrapper:
self.holding_chunk = ""
self.complete_response = ""
self.response_uptil_now = ""
_model_info = (
self.logging_obj.model_call_details.get("litellm_params", {}).get(
"model_info", {}
)
or {}
)
_model_info: Dict = litellm_params.model_info or {}
_api_base = get_api_base(
model=model or "",
@ -110,7 +118,7 @@ class CustomStreamWrapper:
) # GUARANTEE OPENAI HEADERS IN RESPONSE
self._response_headers = _response_headers
self.response_id = None
self.response_id: Optional[str] = None
self.logging_loop = None
self.rules = Rules()
self.stream_options = stream_options or getattr(
@ -629,7 +637,10 @@ class CustomStreamWrapper:
if isinstance(chunk, bytes):
chunk = chunk.decode("utf-8")
if "text_output" in chunk:
response = chunk.replace("data: ", "").strip()
response = (
CustomStreamWrapper._strip_sse_data_from_chunk(chunk) or ""
)
response = response.strip()
parsed_response = json.loads(response)
else:
return {
@ -713,7 +724,7 @@ class CustomStreamWrapper:
def is_delta_empty(self, delta: Delta) -> bool:
is_empty = True
if delta.content is not None:
if delta.content:
is_empty = False
elif delta.tool_calls is not None:
is_empty = False
@ -721,16 +732,45 @@ class CustomStreamWrapper:
is_empty = False
return is_empty
def return_processed_chunk_logic( # noqa
def set_model_id(
self, id: str, model_response: ModelResponseStream
) -> ModelResponseStream:
"""
Set the model id and response id to the given id.
Ensure model id is always the same across all chunks.
If the first chunk already set an id, reuse that id for all subsequent chunks.
"""
if self.response_id is None:
self.response_id = id
if self.response_id is not None and isinstance(self.response_id, str):
model_response.id = self.response_id
return model_response
def copy_model_response_level_provider_specific_fields(
self,
original_chunk: Union[ModelResponseStream, ChatCompletionChunk],
model_response: ModelResponseStream,
) -> ModelResponseStream:
"""
Copy provider_specific_fields from original_chunk to model_response.
"""
provider_specific_fields = getattr(
original_chunk, "provider_specific_fields", None
)
if provider_specific_fields is not None:
model_response.provider_specific_fields = provider_specific_fields
for k, v in provider_specific_fields.items():
setattr(model_response, k, v)
return model_response
def is_chunk_non_empty(
self,
completion_obj: Dict[str, Any],
model_response: ModelResponseStream,
response_obj: Dict[str, Any],
):
print_verbose(
f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}"
)
) -> bool:
if (
"content" in completion_obj
and (
@ -746,13 +786,40 @@ class CustomStreamWrapper:
"function_call" in completion_obj
and completion_obj["function_call"] is not None
)
or (
"reasoning_content" in model_response.choices[0].delta
and model_response.choices[0].delta.reasoning_content is not None
)
or (model_response.choices[0].delta.provider_specific_fields is not None)
or (
"provider_specific_fields" in model_response
and model_response.choices[0].delta.provider_specific_fields is not None
)
or (
"provider_specific_fields" in response_obj
and response_obj["provider_specific_fields"] is not None
)
): # cannot set content of an OpenAI Object to be an empty string
):
return True
else:
return False
def return_processed_chunk_logic( # noqa
self,
completion_obj: Dict[str, Any],
model_response: ModelResponseStream,
response_obj: Dict[str, Any],
):
print_verbose(
f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}"
)
is_chunk_non_empty = self.is_chunk_non_empty(
completion_obj, model_response, response_obj
)
if (
is_chunk_non_empty
): # cannot set content of an OpenAI Object to be an empty string
self.safety_checker()
hold, model_response_str = self.check_special_tokens(
chunk=completion_obj["content"],
@ -763,14 +830,12 @@ class CustomStreamWrapper:
## check if openai/azure chunk
original_chunk = response_obj.get("original_chunk", None)
if original_chunk:
model_response.id = original_chunk.id
self.response_id = original_chunk.id
if len(original_chunk.choices) > 0:
choices = []
for choice in original_chunk.choices:
try:
if isinstance(choice, BaseModel):
choice_json = choice.model_dump()
choice_json = choice.model_dump() # type: ignore
choice_json.pop(
"finish_reason", None
) # for mistral etc. which return a value in their last chunk (not-openai compatible).
@ -798,9 +863,10 @@ class CustomStreamWrapper:
model_response.choices[0].delta, "role"
):
_initial_delta = model_response.choices[0].delta.model_dump()
_initial_delta.pop("role", None)
model_response.choices[0].delta = Delta(**_initial_delta)
print_verbose(
verbose_logger.debug(
f"model_response.choices[0].delta: {model_response.choices[0].delta}"
)
else:
@ -817,6 +883,10 @@ class CustomStreamWrapper:
_index: Optional[int] = completion_obj.get("index")
if _index is not None:
model_response.choices[0].index = _index
self._optional_combine_thinking_block_in_choices(
model_response=model_response
)
print_verbose(f"returning model_response: {model_response}")
return model_response
else:
@ -842,6 +912,9 @@ class CustomStreamWrapper:
_is_delta_empty = self.is_delta_empty(delta=model_response.choices[0].delta)
if _is_delta_empty:
model_response.choices[0].delta = Delta(
content=None
) # ensure empty delta chunk returned
# get any function call arguments
model_response.choices[0].finish_reason = map_finish_reason(
finish_reason=self.received_finish_reason
@ -870,7 +943,49 @@ class CustomStreamWrapper:
self.chunks.append(model_response)
return
def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
def _optional_combine_thinking_block_in_choices(
self, model_response: ModelResponseStream
) -> None:
"""
UIs like OpenWebUI expect to get 1 chunk with <think>...</think> tags in the chunk content
Updates the model_response object in place, wrapping reasoning_content in <think>...</think> tags within the content
Enabled when `merge_reasoning_content_in_choices=True` is passed in the request params
"""
if self.merge_reasoning_content_in_choices is True:
reasoning_content = getattr(
model_response.choices[0].delta, "reasoning_content", None
)
if reasoning_content:
if self.sent_first_thinking_block is False:
model_response.choices[0].delta.content += (
"<think>" + reasoning_content
)
self.sent_first_thinking_block = True
elif (
self.sent_first_thinking_block is True
and hasattr(model_response.choices[0].delta, "reasoning_content")
and model_response.choices[0].delta.reasoning_content
):
model_response.choices[0].delta.content = reasoning_content
elif (
self.sent_first_thinking_block is True
and not self.sent_last_thinking_block
and model_response.choices[0].delta.content
):
model_response.choices[0].delta.content = (
"</think>" + model_response.choices[0].delta.content
)
self.sent_last_thinking_block = True
if hasattr(model_response.choices[0].delta, "reasoning_content"):
del model_response.choices[0].delta.reasoning_content
return
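A hedged usage sketch of the new flag (model name illustrative; exact plumbing of the param into litellm_params may differ):

import litellm

response = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "Explain quantum tunneling briefly."}],
    stream=True,
    merge_reasoning_content_in_choices=True,  # reasoning is merged into delta.content
)
for chunk in response:
    # reasoning arrives wrapped as "<think>..." first, then "</think>" + the answer
    print(chunk.choices[0].delta.content or "", end="")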
def chunk_creator(self, chunk: Any): # type: ignore # noqa: PLR0915
model_response = self.model_response_creator()
response_obj: Dict[str, Any] = {}
@ -886,16 +1001,13 @@ class CustomStreamWrapper:
) # check if chunk is a generic streaming chunk
) or (
self.custom_llm_provider
and (
self.custom_llm_provider == "anthropic"
or self.custom_llm_provider in litellm._custom_providers
)
and self.custom_llm_provider in litellm._custom_providers
):
if self.received_finish_reason is not None:
if "provider_specific_fields" not in chunk:
raise StopIteration
anthropic_response_obj: GChunk = chunk
anthropic_response_obj: GChunk = cast(GChunk, chunk)
completion_obj["content"] = anthropic_response_obj["text"]
if anthropic_response_obj["is_finished"]:
self.received_finish_reason = anthropic_response_obj[
@ -927,7 +1039,7 @@ class CustomStreamWrapper:
].items():
setattr(model_response, key, value)
response_obj = anthropic_response_obj
response_obj = cast(Dict[str, Any], anthropic_response_obj)
elif self.model == "replicate" or self.custom_llm_provider == "replicate":
response_obj = self.handle_replicate_chunk(chunk)
completion_obj["content"] = response_obj["text"]
@ -989,6 +1101,7 @@ class CustomStreamWrapper:
try:
completion_obj["content"] = chunk.text
except Exception as e:
original_exception = e
if "Part has no text." in str(e):
## check for function calling
function_call = (
@ -1030,7 +1143,7 @@ class CustomStreamWrapper:
_model_response.choices = [_streaming_response]
response_obj = {"original_chunk": _model_response}
else:
raise e
raise original_exception
if (
hasattr(chunk.candidates[0], "finish_reason")
and chunk.candidates[0].finish_reason.name
@ -1093,8 +1206,9 @@ class CustomStreamWrapper:
total_tokens=response_obj["usage"].total_tokens,
)
elif self.custom_llm_provider == "text-completion-codestral":
response_obj = litellm.CodestralTextCompletionConfig()._chunk_parser(
chunk
response_obj = cast(
Dict[str, Any],
litellm.CodestralTextCompletionConfig()._chunk_parser(chunk),
)
completion_obj["content"] = response_obj["text"]
print_verbose(f"completion obj content: {completion_obj['content']}")
@ -1156,8 +1270,9 @@ class CustomStreamWrapper:
self.received_finish_reason = response_obj["finish_reason"]
if response_obj.get("original_chunk", None) is not None:
if hasattr(response_obj["original_chunk"], "id"):
model_response.id = response_obj["original_chunk"].id
self.response_id = model_response.id
model_response = self.set_model_id(
response_obj["original_chunk"].id, model_response
)
if hasattr(response_obj["original_chunk"], "system_fingerprint"):
model_response.system_fingerprint = response_obj[
"original_chunk"
@ -1206,8 +1321,16 @@ class CustomStreamWrapper:
): # function / tool calling branch - only set for openai/azure compatible endpoints
# enter this branch when no content has been passed in response
original_chunk = response_obj.get("original_chunk", None)
model_response.id = original_chunk.id
self.response_id = original_chunk.id
if hasattr(original_chunk, "id"):
model_response = self.set_model_id(
original_chunk.id, model_response
)
if hasattr(original_chunk, "provider_specific_fields"):
model_response = (
self.copy_model_response_level_provider_specific_fields(
original_chunk, model_response
)
)
if original_chunk.choices and len(original_chunk.choices) > 0:
delta = original_chunk.choices[0].delta
if delta is not None and (
@ -1708,6 +1831,42 @@ class CustomStreamWrapper:
extra_kwargs={},
)
@staticmethod
def _strip_sse_data_from_chunk(chunk: Optional[str]) -> Optional[str]:
"""
Strips the 'data: ' prefix from Server-Sent Events (SSE) chunks.
Some providers, like SageMaker, send it as `data:` (no trailing space), so both forms need to be handled
SSE messages are prefixed with 'data: ' which is part of the protocol,
not the actual content from the LLM. This method removes that prefix
and returns the actual content.
Args:
chunk: The SSE chunk that may contain the 'data: ' prefix (or None)
Returns:
The chunk with the 'data: ' prefix removed, or the original chunk
if no prefix was found. Returns None if input is None.
See OpenAI Python Ref for this: https://github.com/openai/openai-python/blob/041bf5a8ec54da19aad0169671793c2078bd6173/openai/api_requestor.py#L100
"""
if chunk is None:
return None
if isinstance(chunk, str):
# OpenAI sends `data: `
if chunk.startswith("data: "):
# Strip the prefix and any leading whitespace that might follow it
_length_of_sse_data_prefix = len("data: ")
return chunk[_length_of_sse_data_prefix:]
elif chunk.startswith("data:"):
# Sagemaker sends `data:`, no trailing whitespace
_length_of_sse_data_prefix = len("data:")
return chunk[_length_of_sse_data_prefix:]
return chunk
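Behaviour sketch for the two prefix variants handled above:

assert CustomStreamWrapper._strip_sse_data_from_chunk('data: {"text": "hi"}') == '{"text": "hi"}'
assert CustomStreamWrapper._strip_sse_data_from_chunk('data:{"text": "hi"}') == '{"text": "hi"}'
assert CustomStreamWrapper._strip_sse_data_from_chunk("[DONE]") == "[DONE]"
assert CustomStreamWrapper._strip_sse_data_from_chunk(None) is None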
def calculate_total_usage(chunks: List[ModelResponse]) -> Usage:
"""Assume most recent usage chunk has total usage uptil then."""

View file

@ -26,7 +26,7 @@ else:
class AiohttpOpenAIChatConfig(OpenAILikeChatConfig):
def get_complete_url(
self,
api_base: str,
api_base: Optional[str],
model: str,
optional_params: dict,
stream: Optional[bool] = None,
@ -35,6 +35,8 @@ class AiohttpOpenAIChatConfig(OpenAILikeChatConfig):
Ensure - /v1/chat/completions is at the end of the url
"""
if api_base is None:
api_base = "https://api.openai.com"
if not api_base.endswith("/chat/completions"):
api_base += "/chat/completions"

View file

@ -34,7 +34,12 @@ from litellm.types.llms.openai import (
ChatCompletionToolCallChunk,
ChatCompletionUsageBlock,
)
from litellm.types.utils import GenericStreamingChunk
from litellm.types.utils import (
Delta,
GenericStreamingChunk,
ModelResponseStream,
StreamingChoices,
)
from litellm.utils import CustomStreamWrapper, ModelResponse, ProviderConfigManager
from ...base import BaseLLM
@ -469,7 +474,10 @@ class ModelResponseIterator:
if len(self.content_blocks) == 0:
return False
if self.content_blocks[0]["delta"]["type"] == "text_delta":
if (
self.content_blocks[0]["delta"]["type"] == "text_delta"
or self.content_blocks[0]["delta"]["type"] == "thinking_delta"
):
return False
for block in self.content_blocks:
@ -507,7 +515,12 @@ class ModelResponseIterator:
return usage_block
def _content_block_delta_helper(self, chunk: dict):
def _content_block_delta_helper(self, chunk: dict) -> Tuple[
str,
Optional[ChatCompletionToolCallChunk],
List[ChatCompletionThinkingBlock],
Dict[str, Any],
]:
"""
Helper function to handle the content block delta
"""
@ -516,6 +529,8 @@ class ModelResponseIterator:
tool_use: Optional[ChatCompletionToolCallChunk] = None
provider_specific_fields = {}
content_block = ContentBlockDelta(**chunk) # type: ignore
thinking_blocks: List[ChatCompletionThinkingBlock] = []
self.content_blocks.append(content_block)
if "text" in content_block["delta"]:
text = content_block["delta"]["text"]
@ -533,27 +548,43 @@ class ModelResponseIterator:
provider_specific_fields["citation"] = content_block["delta"]["citation"]
elif (
"thinking" in content_block["delta"]
or "signature_delta" == content_block["delta"]
or "signature" in content_block["delta"]
):
provider_specific_fields["thinking_blocks"] = [
thinking_blocks = [
ChatCompletionThinkingBlock(
type="thinking",
thinking=content_block["delta"].get("thinking"),
signature_delta=content_block["delta"].get("signature"),
thinking=content_block["delta"].get("thinking") or "",
signature=content_block["delta"].get("signature"),
)
]
return text, tool_use, provider_specific_fields
provider_specific_fields["thinking_blocks"] = thinking_blocks
return text, tool_use, thinking_blocks, provider_specific_fields
def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
def _handle_reasoning_content(
self, thinking_blocks: List[ChatCompletionThinkingBlock]
) -> Optional[str]:
"""
Handle the reasoning content
"""
reasoning_content = None
for block in thinking_blocks:
if reasoning_content is None:
reasoning_content = ""
if "thinking" in block:
reasoning_content += block["thinking"]
return reasoning_content
def chunk_parser(self, chunk: dict) -> ModelResponseStream:
try:
type_chunk = chunk.get("type", "") or ""
text = ""
tool_use: Optional[ChatCompletionToolCallChunk] = None
is_finished = False
finish_reason = ""
usage: Optional[ChatCompletionUsageBlock] = None
provider_specific_fields: Dict[str, Any] = {}
reasoning_content: Optional[str] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
index = int(chunk.get("index", 0))
if type_chunk == "content_block_delta":
@ -561,9 +592,13 @@ class ModelResponseIterator:
Anthropic content chunk
chunk = {'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': 'Hello'}}
"""
text, tool_use, provider_specific_fields = (
text, tool_use, thinking_blocks, provider_specific_fields = (
self._content_block_delta_helper(chunk=chunk)
)
if thinking_blocks:
reasoning_content = self._handle_reasoning_content(
thinking_blocks=thinking_blocks
)
elif type_chunk == "content_block_start":
"""
event: content_block_start
@ -585,9 +620,11 @@ class ModelResponseIterator:
"index": self.tool_index,
}
elif type_chunk == "content_block_stop":
ContentBlockStop(**chunk) # type: ignore
# check if tool call content block
is_empty = self.check_empty_tool_call_args()
if is_empty:
tool_use = {
"id": None,
@ -610,7 +647,6 @@ class ModelResponseIterator:
or "stop"
)
usage = self._handle_usage(anthropic_usage_chunk=message_delta["usage"])
is_finished = True
elif type_chunk == "message_start":
"""
Anthropic
@ -649,16 +685,27 @@ class ModelResponseIterator:
text, tool_use = self._handle_json_mode_chunk(text=text, tool_use=tool_use)
returned_chunk = GenericStreamingChunk(
text=text,
tool_use=tool_use,
is_finished=is_finished,
finish_reason=finish_reason,
returned_chunk = ModelResponseStream(
choices=[
StreamingChoices(
index=index,
delta=Delta(
content=text,
tool_calls=[tool_use] if tool_use is not None else None,
provider_specific_fields=(
provider_specific_fields
if provider_specific_fields
else None
),
thinking_blocks=(
thinking_blocks if thinking_blocks else None
),
reasoning_content=reasoning_content,
),
finish_reason=finish_reason,
)
],
usage=usage,
index=index,
provider_specific_fields=(
provider_specific_fields if provider_specific_fields else None
),
)
return returned_chunk
@ -769,7 +816,7 @@ class ModelResponseIterator:
except ValueError as e:
raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
def convert_str_chunk_to_generic_chunk(self, chunk: str) -> GenericStreamingChunk:
def convert_str_chunk_to_generic_chunk(self, chunk: str) -> ModelResponseStream:
"""
Convert a string chunk to a GenericStreamingChunk
@ -789,11 +836,4 @@ class ModelResponseIterator:
data_json = json.loads(str_line[5:])
return self.chunk_parser(chunk=data_json)
else:
return GenericStreamingChunk(
text="",
is_finished=False,
finish_reason="",
usage=None,
index=0,
tool_use=None,
)
return ModelResponseStream()

View file

@ -23,6 +23,7 @@ from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionCachedContent,
ChatCompletionSystemMessage,
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionToolParam,
@ -80,7 +81,7 @@ class AnthropicConfig(BaseConfig):
return super().get_config()
def get_supported_openai_params(self, model: str):
return [
params = [
"stream",
"stop",
"temperature",
@ -95,6 +96,11 @@ class AnthropicConfig(BaseConfig):
"user",
]
if "claude-3-7-sonnet" in model:
params.append("thinking")
return params
def get_json_schema_from_pydantic_object(
self, response_format: Union[Any, Dict, None]
) -> Optional[dict]:
@ -117,15 +123,16 @@ class AnthropicConfig(BaseConfig):
prompt_caching_set: bool = False,
pdf_used: bool = False,
is_vertex_request: bool = False,
user_anthropic_beta_headers: Optional[List[str]] = None,
) -> dict:
betas = []
betas = set()
if prompt_caching_set:
betas.append("prompt-caching-2024-07-31")
betas.add("prompt-caching-2024-07-31")
if computer_tool_used:
betas.append("computer-use-2024-10-22")
betas.add("computer-use-2024-10-22")
if pdf_used:
betas.append("pdfs-2024-09-25")
betas.add("pdfs-2024-09-25")
headers = {
"anthropic-version": anthropic_version or "2023-06-01",
"x-api-key": api_key,
@ -133,6 +140,9 @@ class AnthropicConfig(BaseConfig):
"content-type": "application/json",
}
if user_anthropic_beta_headers is not None:
betas.update(user_anthropic_beta_headers)
# Don't send any beta headers to Vertex, Vertex has failed requests when they are sent
if is_vertex_request is True:
pass
@ -283,18 +293,6 @@ class AnthropicConfig(BaseConfig):
new_stop = new_v
return new_stop
def _add_tools_to_optional_params(
self, optional_params: dict, tools: List[AllAnthropicToolsValues]
) -> dict:
if "tools" not in optional_params:
optional_params["tools"] = tools
else:
optional_params["tools"] = [
*optional_params["tools"],
*tools,
]
return optional_params
def map_openai_params(
self,
non_default_params: dict,
@ -335,6 +333,10 @@ class AnthropicConfig(BaseConfig):
optional_params["top_p"] = value
if param == "response_format" and isinstance(value, dict):
ignore_response_format_types = ["text"]
if value["type"] in ignore_response_format_types: # value is a no-op
continue
json_schema: Optional[dict] = None
if "response_schema" in value:
json_schema = value["response_schema"]
@ -358,7 +360,8 @@ class AnthropicConfig(BaseConfig):
optional_params["json_mode"] = True
if param == "user":
optional_params["metadata"] = {"user_id": value}
if param == "thinking":
optional_params["thinking"] = value
return optional_params
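With `thinking` now in the supported params for claude-3-7-sonnet, a request could look roughly like this (budget value illustrative; Anthropic's extended-thinking payload shape assumed):

import litellm

response = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "Prove that sqrt(2) is irrational."}],
    max_tokens=2048,
    thinking={"type": "enabled", "budget_tokens": 1024},  # passed through to Anthropic
)
print(response.choices[0].message.reasoning_content)  # assembled from the thinking blocks
print(response.choices[0].message.content)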
def _create_json_tool_call_for_response_format(
@ -584,12 +587,14 @@ class AnthropicConfig(BaseConfig):
def extract_response_content(self, completion_response: dict) -> Tuple[
str,
Optional[List[Any]],
Optional[List[Dict[str, Any]]],
Optional[List[ChatCompletionThinkingBlock]],
Optional[str],
List[ChatCompletionToolCallChunk],
]:
text_content = ""
citations: Optional[List[Any]] = None
thinking_blocks: Optional[List[Dict[str, Any]]] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
reasoning_content: Optional[str] = None
tool_calls: List[ChatCompletionToolCallChunk] = []
for idx, content in enumerate(completion_response["content"]):
if content["type"] == "text":
@ -615,8 +620,13 @@ class AnthropicConfig(BaseConfig):
if content.get("thinking", None) is not None:
if thinking_blocks is None:
thinking_blocks = []
thinking_blocks.append(content)
return text_content, citations, thinking_blocks, tool_calls
thinking_blocks.append(cast(ChatCompletionThinkingBlock, content))
if thinking_blocks is not None:
reasoning_content = ""
for block in thinking_blocks:
if "thinking" in block:
reasoning_content += block["thinking"]
return text_content, citations, thinking_blocks, reasoning_content, tool_calls
def transform_response(
self,
@ -666,10 +676,11 @@ class AnthropicConfig(BaseConfig):
else:
text_content = ""
citations: Optional[List[Any]] = None
thinking_blocks: Optional[List[Dict[str, Any]]] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
reasoning_content: Optional[str] = None
tool_calls: List[ChatCompletionToolCallChunk] = []
text_content, citations, thinking_blocks, tool_calls = (
text_content, citations, thinking_blocks, reasoning_content, tool_calls = (
self.extract_response_content(completion_response=completion_response)
)
@ -680,6 +691,8 @@ class AnthropicConfig(BaseConfig):
"citations": citations,
"thinking_blocks": thinking_blocks,
},
thinking_blocks=thinking_blocks,
reasoning_content=reasoning_content,
)
## HANDLE JSON MODE - anthropic returns single function call
@ -774,6 +787,13 @@ class AnthropicConfig(BaseConfig):
headers=cast(httpx.Headers, headers),
)
def _get_user_anthropic_beta_headers(
self, anthropic_beta_header: Optional[str]
) -> Optional[List[str]]:
if anthropic_beta_header is None:
return None
return anthropic_beta_header.split(",")
def validate_environment(
self,
headers: dict,
@ -794,13 +814,18 @@ class AnthropicConfig(BaseConfig):
prompt_caching_set = self.is_cache_control_set(messages=messages)
computer_tool_used = self.is_computer_tool_used(tools=tools)
pdf_used = self.is_pdf_used(messages=messages)
user_anthropic_beta_headers = self._get_user_anthropic_beta_headers(
anthropic_beta_header=headers.get("anthropic-beta")
)
anthropic_headers = self.get_anthropic_headers(
computer_tool_used=computer_tool_used,
prompt_caching_set=prompt_caching_set,
pdf_used=pdf_used,
api_key=api_key,
is_vertex_request=optional_params.get("is_vertex_request", False),
user_anthropic_beta_headers=user_anthropic_beta_headers,
)
headers = {**headers, **anthropic_headers}
return headers
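A hedged sketch of the new user-supplied beta headers (assuming extra_headers reach validate_environment as in other providers; beta names illustrative): a caller-specified `anthropic-beta` header is split on commas and merged into the beta set alongside the auto-detected ones.

import litellm

response = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "hi"}],
    extra_headers={"anthropic-beta": "output-128k-2025-02-19,prompt-caching-2024-07-31"},
)
# both entries are added to the `betas` set (deduplicated) before the outgoing
# anthropic-beta header is built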

View file

@ -0,0 +1,179 @@
"""
- call /messages on Anthropic API
- Make streaming + non-streaming request - just pass it through direct to Anthropic. No need to do anything special here
- Ensure requests are logged in the DB - stream + non-stream
"""
import json
from typing import Any, AsyncIterator, Dict, Optional, Union, cast
import httpx
import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.base_llm.anthropic_messages.transformation import (
BaseAnthropicMessagesConfig,
)
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
get_async_httpx_client,
)
from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import ProviderSpecificHeader
from litellm.utils import ProviderConfigManager, client
class AnthropicMessagesHandler:
@staticmethod
async def _handle_anthropic_streaming(
response: httpx.Response,
request_body: dict,
litellm_logging_obj: LiteLLMLoggingObj,
) -> AsyncIterator:
"""Helper function to handle Anthropic streaming responses using the existing logging handlers"""
from datetime import datetime
from litellm.proxy.pass_through_endpoints.streaming_handler import (
PassThroughStreamingHandler,
)
from litellm.proxy.pass_through_endpoints.success_handler import (
PassThroughEndpointLogging,
)
from litellm.proxy.pass_through_endpoints.types import EndpointType
# Create success handler object
passthrough_success_handler_obj = PassThroughEndpointLogging()
# Use the existing streaming handler for Anthropic
start_time = datetime.now()
return PassThroughStreamingHandler.chunk_processor(
response=response,
request_body=request_body,
litellm_logging_obj=litellm_logging_obj,
endpoint_type=EndpointType.ANTHROPIC,
start_time=start_time,
passthrough_success_handler_obj=passthrough_success_handler_obj,
url_route="/v1/messages",
)
@client
async def anthropic_messages(
api_key: str,
model: str,
stream: bool = False,
api_base: Optional[str] = None,
client: Optional[AsyncHTTPHandler] = None,
custom_llm_provider: Optional[str] = None,
**kwargs,
) -> Union[Dict[str, Any], AsyncIterator]:
"""
Makes Anthropic `/v1/messages` API calls, following the Anthropic API spec.
"""
# Use provided client or create a new one
optional_params = GenericLiteLLMParams(**kwargs)
model, _custom_llm_provider, dynamic_api_key, dynamic_api_base = (
litellm.get_llm_provider(
model=model,
custom_llm_provider=custom_llm_provider,
api_base=optional_params.api_base,
api_key=optional_params.api_key,
)
)
anthropic_messages_provider_config: Optional[BaseAnthropicMessagesConfig] = (
ProviderConfigManager.get_provider_anthropic_messages_config(
model=model,
provider=litellm.LlmProviders(_custom_llm_provider),
)
)
if anthropic_messages_provider_config is None:
raise ValueError(
f"Anthropic messages provider config not found for model: {model}"
)
if client is None or not isinstance(client, AsyncHTTPHandler):
async_httpx_client = get_async_httpx_client(
llm_provider=litellm.LlmProviders.ANTHROPIC
)
else:
async_httpx_client = client
litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
# Prepare headers
provider_specific_header = cast(
Optional[ProviderSpecificHeader], kwargs.get("provider_specific_header", None)
)
extra_headers = (
provider_specific_header.get("extra_headers", {})
if provider_specific_header
else {}
)
headers = anthropic_messages_provider_config.validate_environment(
headers=extra_headers or {},
model=model,
api_key=api_key,
)
litellm_logging_obj.update_environment_variables(
model=model,
optional_params=dict(optional_params),
litellm_params={
"metadata": kwargs.get("metadata", {}),
"preset_cache_key": None,
"stream_response": {},
**optional_params.model_dump(exclude_unset=True),
},
custom_llm_provider=_custom_llm_provider,
)
litellm_logging_obj.model_call_details.update(kwargs)
# Prepare request body
request_body = kwargs.copy()
request_body = {
k: v
for k, v in request_body.items()
if k
in anthropic_messages_provider_config.get_supported_anthropic_messages_params(
model=model
)
}
request_body["stream"] = stream
request_body["model"] = model
litellm_logging_obj.stream = stream
# Make the request
request_url = anthropic_messages_provider_config.get_complete_url(
api_base=api_base, model=model
)
litellm_logging_obj.pre_call(
input=[{"role": "user", "content": json.dumps(request_body)}],
api_key="",
additional_args={
"complete_input_dict": request_body,
"api_base": str(request_url),
"headers": headers,
},
)
response = await async_httpx_client.post(
url=request_url,
headers=headers,
data=json.dumps(request_body),
stream=stream,
)
response.raise_for_status()
# used for logging + cost tracking
litellm_logging_obj.model_call_details["httpx_response"] = response
if stream:
return await AnthropicMessagesHandler._handle_anthropic_streaming(
response=response,
request_body=request_body,
litellm_logging_obj=litellm_logging_obj,
)
else:
return response.json()
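A rough usage sketch of the new passthrough handler (the handler's module path is not shown in this excerpt; values illustrative):

import asyncio

async def main():
    # `anthropic_messages` is the handler defined above
    resp = await anthropic_messages(
        api_key="sk-ant-...",
        model="claude-3-7-sonnet-20250219",
        messages=[{"role": "user", "content": "Hello"}],
        max_tokens=256,
    )
    # non-streaming: the raw Anthropic /v1/messages JSON is returned
    print(resp["content"][0]["text"])

asyncio.run(main())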

View file

@ -0,0 +1,47 @@
from typing import Optional
from litellm.llms.base_llm.anthropic_messages.transformation import (
BaseAnthropicMessagesConfig,
)
DEFAULT_ANTHROPIC_API_BASE = "https://api.anthropic.com"
DEFAULT_ANTHROPIC_API_VERSION = "2023-06-01"
class AnthropicMessagesConfig(BaseAnthropicMessagesConfig):
def get_supported_anthropic_messages_params(self, model: str) -> list:
return [
"messages",
"model",
"system",
"max_tokens",
"stop_sequences",
"temperature",
"top_p",
"top_k",
"tools",
"tool_choice",
"thinking",
# TODO: Add Anthropic `metadata` support
# "metadata",
]
def get_complete_url(self, api_base: Optional[str], model: str) -> str:
api_base = api_base or DEFAULT_ANTHROPIC_API_BASE
if not api_base.endswith("/v1/messages"):
api_base = f"{api_base}/v1/messages"
return api_base
def validate_environment(
self,
headers: dict,
model: str,
api_key: Optional[str] = None,
) -> dict:
if "x-api-key" not in headers:
headers["x-api-key"] = api_key
if "anthropic-version" not in headers:
headers["anthropic-version"] = DEFAULT_ANTHROPIC_API_VERSION
if "content-type" not in headers:
headers["content-type"] = "application/json"
return headers
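Behaviour sketch of the URL helper:

cfg = AnthropicMessagesConfig()

cfg.get_complete_url(api_base=None, model="claude-3-7-sonnet-20250219")
# -> "https://api.anthropic.com/v1/messages"

cfg.get_complete_url(api_base="https://my-anthropic-proxy.internal", model="claude-3-7-sonnet-20250219")
# -> "https://my-anthropic-proxy.internal/v1/messages"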

View file

@ -1,412 +0,0 @@
import json
from typing import List, Literal, Optional, Tuple, Union
from openai.types.chat.chat_completion_chunk import Choice as OpenAIStreamingChoice
from litellm.types.llms.anthropic import (
AllAnthropicToolsValues,
AnthopicMessagesAssistantMessageParam,
AnthropicFinishReason,
AnthropicMessagesRequest,
AnthropicMessagesToolChoice,
AnthropicMessagesUserMessageParam,
AnthropicResponse,
AnthropicResponseContentBlockText,
AnthropicResponseContentBlockToolUse,
AnthropicResponseUsageBlock,
ContentBlockDelta,
ContentJsonBlockDelta,
ContentTextBlockDelta,
MessageBlockDelta,
MessageDelta,
UsageDelta,
)
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionAssistantMessage,
ChatCompletionAssistantToolCall,
ChatCompletionImageObject,
ChatCompletionImageUrlObject,
ChatCompletionRequest,
ChatCompletionSystemMessage,
ChatCompletionTextObject,
ChatCompletionToolCallFunctionChunk,
ChatCompletionToolChoiceFunctionParam,
ChatCompletionToolChoiceObjectParam,
ChatCompletionToolChoiceValues,
ChatCompletionToolMessage,
ChatCompletionToolParam,
ChatCompletionToolParamFunctionChunk,
ChatCompletionUserMessage,
)
from litellm.types.utils import Choices, ModelResponse, Usage
class AnthropicExperimentalPassThroughConfig:
def __init__(self):
pass
### FOR [BETA] `/v1/messages` endpoint support
def translatable_anthropic_params(self) -> List:
"""
Which anthropic params, we need to translate to the openai format.
"""
return ["messages", "metadata", "system", "tool_choice", "tools"]
def translate_anthropic_messages_to_openai( # noqa: PLR0915
self,
messages: List[
Union[
AnthropicMessagesUserMessageParam,
AnthopicMessagesAssistantMessageParam,
]
],
) -> List:
new_messages: List[AllMessageValues] = []
for m in messages:
user_message: Optional[ChatCompletionUserMessage] = None
tool_message_list: List[ChatCompletionToolMessage] = []
new_user_content_list: List[
Union[ChatCompletionTextObject, ChatCompletionImageObject]
] = []
## USER MESSAGE ##
if m["role"] == "user":
## translate user message
message_content = m.get("content")
if message_content and isinstance(message_content, str):
user_message = ChatCompletionUserMessage(
role="user", content=message_content
)
elif message_content and isinstance(message_content, list):
for content in message_content:
if content["type"] == "text":
text_obj = ChatCompletionTextObject(
type="text", text=content["text"]
)
new_user_content_list.append(text_obj)
elif content["type"] == "image":
image_url = ChatCompletionImageUrlObject(
url=f"data:{content['type']};base64,{content['source']}"
)
image_obj = ChatCompletionImageObject(
type="image_url", image_url=image_url
)
new_user_content_list.append(image_obj)
elif content["type"] == "tool_result":
if "content" not in content:
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content="",
)
tool_message_list.append(tool_result)
elif isinstance(content["content"], str):
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content=content["content"],
)
tool_message_list.append(tool_result)
elif isinstance(content["content"], list):
for c in content["content"]:
if c["type"] == "text":
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content=c["text"],
)
tool_message_list.append(tool_result)
elif c["type"] == "image":
image_str = (
f"data:{c['type']};base64,{c['source']}"
)
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content=image_str,
)
tool_message_list.append(tool_result)
if user_message is not None:
new_messages.append(user_message)
if len(new_user_content_list) > 0:
new_messages.append({"role": "user", "content": new_user_content_list}) # type: ignore
if len(tool_message_list) > 0:
new_messages.extend(tool_message_list)
## ASSISTANT MESSAGE ##
assistant_message_str: Optional[str] = None
tool_calls: List[ChatCompletionAssistantToolCall] = []
if m["role"] == "assistant":
if isinstance(m["content"], str):
assistant_message_str = m["content"]
elif isinstance(m["content"], list):
for content in m["content"]:
if content["type"] == "text":
if assistant_message_str is None:
assistant_message_str = content["text"]
else:
assistant_message_str += content["text"]
elif content["type"] == "tool_use":
function_chunk = ChatCompletionToolCallFunctionChunk(
name=content["name"],
arguments=json.dumps(content["input"]),
)
tool_calls.append(
ChatCompletionAssistantToolCall(
id=content["id"],
type="function",
function=function_chunk,
)
)
if assistant_message_str is not None or len(tool_calls) > 0:
assistant_message = ChatCompletionAssistantMessage(
role="assistant",
content=assistant_message_str,
)
if len(tool_calls) > 0:
assistant_message["tool_calls"] = tool_calls
new_messages.append(assistant_message)
return new_messages
def translate_anthropic_tool_choice_to_openai(
self, tool_choice: AnthropicMessagesToolChoice
) -> ChatCompletionToolChoiceValues:
if tool_choice["type"] == "any":
return "required"
elif tool_choice["type"] == "auto":
return "auto"
elif tool_choice["type"] == "tool":
tc_function_param = ChatCompletionToolChoiceFunctionParam(
name=tool_choice.get("name", "")
)
return ChatCompletionToolChoiceObjectParam(
type="function", function=tc_function_param
)
else:
raise ValueError(
"Incompatible tool choice param submitted - {}".format(tool_choice)
)
def translate_anthropic_tools_to_openai(
self, tools: List[AllAnthropicToolsValues]
) -> List[ChatCompletionToolParam]:
new_tools: List[ChatCompletionToolParam] = []
mapped_tool_params = ["name", "input_schema", "description"]
for tool in tools:
function_chunk = ChatCompletionToolParamFunctionChunk(
name=tool["name"],
)
if "input_schema" in tool:
function_chunk["parameters"] = tool["input_schema"] # type: ignore
if "description" in tool:
function_chunk["description"] = tool["description"] # type: ignore
for k, v in tool.items():
if k not in mapped_tool_params: # pass additional computer kwargs
function_chunk.setdefault("parameters", {}).update({k: v})
new_tools.append(
ChatCompletionToolParam(type="function", function=function_chunk)
)
return new_tools
def translate_anthropic_to_openai(
self, anthropic_message_request: AnthropicMessagesRequest
) -> ChatCompletionRequest:
"""
This is used by the beta Anthropic adapter for translating Anthropic `/v1/messages` requests to the OpenAI format.
"""
new_messages: List[AllMessageValues] = []
## CONVERT ANTHROPIC MESSAGES TO OPENAI
new_messages = self.translate_anthropic_messages_to_openai(
messages=anthropic_message_request["messages"]
)
## ADD SYSTEM MESSAGE TO MESSAGES
if "system" in anthropic_message_request:
new_messages.insert(
0,
ChatCompletionSystemMessage(
role="system", content=anthropic_message_request["system"]
),
)
new_kwargs: ChatCompletionRequest = {
"model": anthropic_message_request["model"],
"messages": new_messages,
}
## CONVERT METADATA (user_id)
if "metadata" in anthropic_message_request:
if "user_id" in anthropic_message_request["metadata"]:
new_kwargs["user"] = anthropic_message_request["metadata"]["user_id"]
# Pass litellm proxy specific metadata
if "litellm_metadata" in anthropic_message_request:
# metadata will be passed to litellm.acompletion(), it's a litellm_param
new_kwargs["metadata"] = anthropic_message_request.pop("litellm_metadata")
## CONVERT TOOL CHOICE
if "tool_choice" in anthropic_message_request:
new_kwargs["tool_choice"] = self.translate_anthropic_tool_choice_to_openai(
tool_choice=anthropic_message_request["tool_choice"]
)
## CONVERT TOOLS
if "tools" in anthropic_message_request:
new_kwargs["tools"] = self.translate_anthropic_tools_to_openai(
tools=anthropic_message_request["tools"]
)
translatable_params = self.translatable_anthropic_params()
for k, v in anthropic_message_request.items():
if k not in translatable_params: # pass remaining params as is
new_kwargs[k] = v # type: ignore
return new_kwargs
def _translate_openai_content_to_anthropic(
self, choices: List[Choices]
) -> List[
Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse]
]:
new_content: List[
Union[
AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse
]
] = []
for choice in choices:
if (
choice.message.tool_calls is not None
and len(choice.message.tool_calls) > 0
):
for tool_call in choice.message.tool_calls:
new_content.append(
AnthropicResponseContentBlockToolUse(
type="tool_use",
id=tool_call.id,
name=tool_call.function.name or "",
input=json.loads(tool_call.function.arguments),
)
)
elif choice.message.content is not None:
new_content.append(
AnthropicResponseContentBlockText(
type="text", text=choice.message.content
)
)
return new_content
def _translate_openai_finish_reason_to_anthropic(
self, openai_finish_reason: str
) -> AnthropicFinishReason:
if openai_finish_reason == "stop":
return "end_turn"
elif openai_finish_reason == "length":
return "max_tokens"
elif openai_finish_reason == "tool_calls":
return "tool_use"
return "end_turn"
def translate_openai_response_to_anthropic(
self, response: ModelResponse
) -> AnthropicResponse:
## translate content block
anthropic_content = self._translate_openai_content_to_anthropic(choices=response.choices) # type: ignore
## extract finish reason
anthropic_finish_reason = self._translate_openai_finish_reason_to_anthropic(
openai_finish_reason=response.choices[0].finish_reason # type: ignore
)
# extract usage
usage: Usage = getattr(response, "usage")
anthropic_usage = AnthropicResponseUsageBlock(
input_tokens=usage.prompt_tokens or 0,
output_tokens=usage.completion_tokens or 0,
)
translated_obj = AnthropicResponse(
id=response.id,
type="message",
role="assistant",
model=response.model or "unknown-model",
stop_sequence=None,
usage=anthropic_usage,
content=anthropic_content,
stop_reason=anthropic_finish_reason,
)
return translated_obj
def _translate_streaming_openai_chunk_to_anthropic(
self, choices: List[OpenAIStreamingChoice]
) -> Tuple[
Literal["text_delta", "input_json_delta"],
Union[ContentTextBlockDelta, ContentJsonBlockDelta],
]:
text: str = ""
partial_json: Optional[str] = None
for choice in choices:
if choice.delta.content is not None:
text += choice.delta.content
elif choice.delta.tool_calls is not None:
partial_json = ""
for tool in choice.delta.tool_calls:
if (
tool.function is not None
and tool.function.arguments is not None
):
partial_json += tool.function.arguments
if partial_json is not None:
return "input_json_delta", ContentJsonBlockDelta(
type="input_json_delta", partial_json=partial_json
)
else:
return "text_delta", ContentTextBlockDelta(type="text_delta", text=text)
def translate_streaming_openai_response_to_anthropic(
self, response: ModelResponse
) -> Union[ContentBlockDelta, MessageBlockDelta]:
## base case - final chunk w/ finish reason
if response.choices[0].finish_reason is not None:
delta = MessageDelta(
stop_reason=self._translate_openai_finish_reason_to_anthropic(
response.choices[0].finish_reason
),
)
if getattr(response, "usage", None) is not None:
litellm_usage_chunk: Optional[Usage] = response.usage # type: ignore
elif (
hasattr(response, "_hidden_params")
and "usage" in response._hidden_params
):
litellm_usage_chunk = response._hidden_params["usage"]
else:
litellm_usage_chunk = None
if litellm_usage_chunk is not None:
usage_delta = UsageDelta(
input_tokens=litellm_usage_chunk.prompt_tokens or 0,
output_tokens=litellm_usage_chunk.completion_tokens or 0,
)
else:
usage_delta = UsageDelta(input_tokens=0, output_tokens=0)
return MessageBlockDelta(
type="message_delta", delta=delta, usage=usage_delta
)
(
type_of_content,
content_block_delta,
) = self._translate_streaming_openai_chunk_to_anthropic(
choices=response.choices # type: ignore
)
return ContentBlockDelta(
type="content_block_delta",
index=response.choices[0].index,
delta=content_block_delta,
)
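For reference, a minimal usage sketch of the translation above, assuming AnthropicExperimentalPassThroughConfig (defined in this file) is in scope; the request values are illustrative only:
config = AnthropicExperimentalPassThroughConfig()
anthropic_request = {
    "model": "claude-3-5-sonnet-20240620",
    "max_tokens": 256,
    "system": "You are a helpful assistant.",
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
}
# translate_anthropic_to_openai inserts the system prompt at index 0 and passes
# non-translatable params (model, max_tokens) through unchanged.
openai_request = config.translate_anthropic_to_openai(
    anthropic_message_request=anthropic_request  # type: ignore[arg-type]
)
assert openai_request["messages"][0]["role"] == "system"
assert openai_request["max_tokens"] == 256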

View file

@ -2,7 +2,7 @@
Azure Batches API Handler
"""
from typing import Any, Coroutine, Optional, Union
from typing import Any, Coroutine, Optional, Union, cast
import httpx
@ -14,6 +14,7 @@ from litellm.types.llms.openai import (
CreateBatchRequest,
RetrieveBatchRequest,
)
from litellm.types.utils import LiteLLMBatch
class AzureBatchesAPI:
@ -64,9 +65,9 @@ class AzureBatchesAPI:
self,
create_batch_data: CreateBatchRequest,
azure_client: AsyncAzureOpenAI,
) -> Batch:
) -> LiteLLMBatch:
response = await azure_client.batches.create(**create_batch_data)
return response
return LiteLLMBatch(**response.model_dump())
def create_batch(
self,
@ -78,7 +79,7 @@ class AzureBatchesAPI:
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
) -> Union[Batch, Coroutine[Any, Any, Batch]]:
) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
azure_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
self.get_azure_openai_client(
api_key=api_key,
@ -103,16 +104,16 @@ class AzureBatchesAPI:
return self.acreate_batch( # type: ignore
create_batch_data=create_batch_data, azure_client=azure_client
)
response = azure_client.batches.create(**create_batch_data)
return response
response = cast(AzureOpenAI, azure_client).batches.create(**create_batch_data)
return LiteLLMBatch(**response.model_dump())
async def aretrieve_batch(
self,
retrieve_batch_data: RetrieveBatchRequest,
client: AsyncAzureOpenAI,
) -> Batch:
) -> LiteLLMBatch:
response = await client.batches.retrieve(**retrieve_batch_data)
return response
return LiteLLMBatch(**response.model_dump())
def retrieve_batch(
self,
@ -149,8 +150,10 @@ class AzureBatchesAPI:
return self.aretrieve_batch( # type: ignore
retrieve_batch_data=retrieve_batch_data, client=azure_client
)
response = azure_client.batches.retrieve(**retrieve_batch_data)
return response
response = cast(AzureOpenAI, azure_client).batches.retrieve(
**retrieve_batch_data
)
return LiteLLMBatch(**response.model_dump())
async def acancel_batch(
self,

View file

@ -1,4 +1,5 @@
from typing import Any, List, Optional, Tuple, cast
from urllib.parse import urlparse
import httpx
from httpx import Response
@ -28,16 +29,29 @@ class AzureAIStudioConfig(OpenAIConfig):
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
if api_base and "services.ai.azure.com" in api_base:
if api_base and self._should_use_api_key_header(api_base):
headers["api-key"] = api_key
else:
headers["Authorization"] = f"Bearer {api_key}"
return headers
def _should_use_api_key_header(self, api_base: str) -> bool:
"""
Returns True if the request should use `api-key` header for authentication.
"""
parsed_url = urlparse(api_base)
host = parsed_url.hostname
if host and (
host.endswith(".services.ai.azure.com")
or host.endswith(".openai.azure.com")
):
return True
return False
def get_complete_url(
self,
api_base: str,
api_base: Optional[str],
model: str,
optional_params: dict,
stream: Optional[bool] = None,
@ -58,6 +72,10 @@ class AzureAIStudioConfig(OpenAIConfig):
- A complete URL string, e.g.,
"https://litellm8397336933.services.ai.azure.com/models/chat/completions?api-version=2024-05-01-preview"
"""
if api_base is None:
raise ValueError(
f"api_base is required for Azure AI Studio. Please set the api_base parameter. Passed `api_base={api_base}`"
)
original_url = httpx.URL(api_base)
# Extract api_version or use default
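A standalone sketch (not the litellm API) of why the header check above moved from a substring test to a parsed-hostname suffix test; the URLs are illustrative:
from urllib.parse import urlparse

def should_use_api_key_header(api_base: str) -> bool:
    # Only genuine Azure-managed hostnames get the `api-key` header;
    # everything else falls back to a Bearer token.
    host = urlparse(api_base).hostname or ""
    return host.endswith(".services.ai.azure.com") or host.endswith(".openai.azure.com")

assert should_use_api_key_header("https://litellm8397336933.services.ai.azure.com/models") is True
# a bare substring check would be fooled by the path of a non-Azure host:
assert should_use_api_key_header("https://example.com/services.ai.azure.com") is False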

View file

@ -0,0 +1,35 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Optional
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
LiteLLMLoggingObj = _LiteLLMLoggingObj
else:
LiteLLMLoggingObj = Any
class BaseAnthropicMessagesConfig(ABC):
@abstractmethod
def validate_environment(
self,
headers: dict,
model: str,
api_key: Optional[str] = None,
) -> dict:
pass
@abstractmethod
def get_complete_url(self, api_base: Optional[str], model: str) -> str:
"""
OPTIONAL
Get the complete url for the request
Some providers need `model` in `api_base`
"""
return api_base or ""
@abstractmethod
def get_supported_anthropic_messages_params(self, model: str) -> list:
pass
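A minimal sketch of a concrete implementation of this ABC, assuming BaseAnthropicMessagesConfig above is in scope; the class name, default base URL, and header values are illustrative, not an existing litellm config:
from typing import Optional

class ExampleAnthropicMessagesConfig(BaseAnthropicMessagesConfig):
    def validate_environment(self, headers: dict, model: str, api_key: Optional[str] = None) -> dict:
        if api_key is None:
            raise ValueError("api_key is required for the /v1/messages endpoint")
        headers["x-api-key"] = api_key
        headers["anthropic-version"] = "2023-06-01"  # illustrative version string
        return headers

    def get_complete_url(self, api_base: Optional[str], model: str) -> str:
        return (api_base or "https://api.anthropic.com") + "/v1/messages"

    def get_supported_anthropic_messages_params(self, model: str) -> list:
        return ["messages", "max_tokens", "system", "tools", "tool_choice", "stream", "metadata"]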

View file

@ -111,6 +111,19 @@ class BaseConfig(ABC):
"""
return False
def _add_tools_to_optional_params(self, optional_params: dict, tools: List) -> dict:
"""
Helper util to add tools to optional_params.
"""
if "tools" not in optional_params:
optional_params["tools"] = tools
else:
optional_params["tools"] = [
*optional_params["tools"],
*tools,
]
return optional_params
def translate_developer_role_to_system_role(
self,
messages: List[AllMessageValues],
@ -158,6 +171,7 @@ class BaseConfig(ABC):
optional_params: dict,
value: dict,
is_response_format_supported: bool,
enforce_tool_choice: bool = True,
) -> dict:
"""
Follow similar approach to anthropic - translate to a single tool call.
@ -195,9 +209,11 @@ class BaseConfig(ABC):
optional_params.setdefault("tools", [])
optional_params["tools"].append(_tool)
optional_params["tool_choice"] = _tool_choice
if enforce_tool_choice:
optional_params["tool_choice"] = _tool_choice
optional_params["json_mode"] = True
else:
elif is_response_format_supported:
optional_params["response_format"] = value
return optional_params
@ -249,7 +265,7 @@ class BaseConfig(ABC):
def get_complete_url(
self,
api_base: str,
api_base: Optional[str],
model: str,
optional_params: dict,
stream: Optional[bool] = None,
@ -261,6 +277,8 @@ class BaseConfig(ABC):
Some providers need `model` in `api_base`
"""
if api_base is None:
raise ValueError("api_base is required")
return api_base
@abstractmethod
@ -315,6 +333,7 @@ class BaseConfig(ABC):
data: dict,
messages: list,
client: Optional[AsyncHTTPHandler] = None,
json_mode: Optional[bool] = None,
) -> CustomStreamWrapper:
raise NotImplementedError
@ -328,6 +347,7 @@ class BaseConfig(ABC):
data: dict,
messages: list,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
json_mode: Optional[bool] = None,
) -> CustomStreamWrapper:
raise NotImplementedError
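A standalone sketch of the merge behaviour `_add_tools_to_optional_params` introduces: pre-existing tools are kept and new ones appended, so a synthetic tool (e.g. a json_tool_call generated for response_format) no longer clobbers user-supplied tools. Tool names are illustrative:
def add_tools(optional_params: dict, tools: list) -> dict:
    if "tools" not in optional_params:
        optional_params["tools"] = tools
    else:
        optional_params["tools"] = [*optional_params["tools"], *tools]
    return optional_params

params = {"tools": [{"type": "function", "function": {"name": "get_weather"}}]}
add_tools(params, [{"type": "function", "function": {"name": "json_tool_call"}}])
assert [t["function"]["name"] for t in params["tools"]] == ["get_weather", "json_tool_call"]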

View file

@ -2,13 +2,14 @@ import hashlib
import json
import os
from datetime import datetime
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, cast, get_args
import httpx
from pydantic import BaseModel
from litellm._logging import verbose_logger
from litellm.caching.caching import DualCache
from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL
from litellm.litellm_core_utils.dd_tracing import tracer
from litellm.secret_managers.main import get_secret
@ -223,6 +224,60 @@ class BaseAWSLLM:
# Catch any unexpected errors and return None
return None
@staticmethod
def _get_provider_from_model_path(
model_path: str,
) -> Optional[BEDROCK_INVOKE_PROVIDERS_LITERAL]:
"""
Helper function to get the provider from a model path with format: provider/model-name
Args:
model_path (str): The model path (e.g., 'llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n' or 'anthropic/model-name')
Returns:
Optional[str]: The provider name, or None if no valid provider found
"""
parts = model_path.split("/")
if len(parts) >= 1:
provider = parts[0]
if provider in get_args(BEDROCK_INVOKE_PROVIDERS_LITERAL):
return cast(BEDROCK_INVOKE_PROVIDERS_LITERAL, provider)
return None
@staticmethod
def get_bedrock_invoke_provider(
model: str,
) -> Optional[BEDROCK_INVOKE_PROVIDERS_LITERAL]:
"""
Helper function to get the bedrock provider from the model
handles 4 scenarios:
1. model=invoke/anthropic.claude-3-5-sonnet-20240620-v1:0 -> Returns `anthropic`
2. model=anthropic.claude-3-5-sonnet-20240620-v1:0 -> Returns `anthropic`
3. model=llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n -> Returns `llama`
4. model=us.amazon.nova-pro-v1:0 -> Returns `nova`
"""
if model.startswith("invoke/"):
model = model.replace("invoke/", "", 1)
_split_model = model.split(".")[0]
if _split_model in get_args(BEDROCK_INVOKE_PROVIDERS_LITERAL):
return cast(BEDROCK_INVOKE_PROVIDERS_LITERAL, _split_model)
# If not a known provider, check for pattern with two slashes
provider = BaseAWSLLM._get_provider_from_model_path(model)
if provider is not None:
return provider
# check if provider == "nova"
if "nova" in model:
return "nova"
else:
for provider in get_args(BEDROCK_INVOKE_PROVIDERS_LITERAL):
if provider in model:
return provider
return None
def _get_aws_region_name(
self, optional_params: dict, model: Optional[str] = None
) -> str:
@ -499,6 +554,7 @@ class BaseAWSLLM:
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
aws_session_token = optional_params.pop("aws_session_token", None)
aws_region_name = self._get_aws_region_name(optional_params, model)
optional_params.pop("aws_region_name", None)
aws_role_name = optional_params.pop("aws_role_name", None)
aws_session_name = optional_params.pop("aws_session_name", None)
aws_profile_name = optional_params.pop("aws_profile_name", None)
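A standalone sketch (mirroring `get_bedrock_invoke_provider` above, not the litellm API) of how the provider is recovered from the supported model string formats; the provider tuple is an illustrative subset of BEDROCK_INVOKE_PROVIDERS_LITERAL:
from typing import Optional

KNOWN_PROVIDERS = ("anthropic", "llama", "mistral", "cohere", "nova")  # illustrative subset

def invoke_provider(model: str) -> Optional[str]:
    if model.startswith("invoke/"):
        model = model.replace("invoke/", "", 1)
    if model.split(".")[0] in KNOWN_PROVIDERS:    # e.g. anthropic.claude-3-5-sonnet-...
        return model.split(".")[0]
    if model.split("/")[0] in KNOWN_PROVIDERS:    # e.g. llama/arn:aws:bedrock:...
        return model.split("/")[0]
    if "nova" in model:                           # e.g. us.amazon.nova-pro-v1:0
        return "nova"
    return next((p for p in KNOWN_PROVIDERS if p in model), None)

assert invoke_provider("invoke/anthropic.claude-3-5-sonnet-20240620-v1:0") == "anthropic"
assert invoke_provider("llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n") == "llama"
assert invoke_provider("us.amazon.nova-pro-v1:0") == "nova"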

View file

@ -23,6 +23,7 @@ from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionResponseMessage,
ChatCompletionSystemMessage,
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionToolParam,
@ -116,6 +117,10 @@ class AmazonConverseConfig(BaseConfig):
# only anthropic and mistral support tool choice config. otherwise (E.g. cohere) will fail the call - https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_ToolChoice.html
supported_params.append("tool_choice")
if (
"claude-3-7" in model
): # [TODO]: move to a 'supports_reasoning_content' param from model cost map
supported_params.append("thinking")
return supported_params
def map_tool_choice_values(
@ -162,6 +167,7 @@ class AmazonConverseConfig(BaseConfig):
self,
json_schema: Optional[dict] = None,
schema_name: str = "json_tool_call",
description: Optional[str] = None,
) -> ChatCompletionToolParam:
"""
Handles creating a tool call for getting responses in JSON format.
@ -184,11 +190,15 @@ class AmazonConverseConfig(BaseConfig):
else:
_input_schema = json_schema
tool_param_function_chunk = ChatCompletionToolParamFunctionChunk(
name=schema_name, parameters=_input_schema
)
if description:
tool_param_function_chunk["description"] = description
_tool = ChatCompletionToolParam(
type="function",
function=ChatCompletionToolParamFunctionChunk(
name=schema_name, parameters=_input_schema
),
function=tool_param_function_chunk,
)
return _tool
@ -201,15 +211,26 @@ class AmazonConverseConfig(BaseConfig):
messages: Optional[List[AllMessageValues]] = None,
) -> dict:
for param, value in non_default_params.items():
if param == "response_format":
if param == "response_format" and isinstance(value, dict):
ignore_response_format_types = ["text"]
if value["type"] in ignore_response_format_types: # value is a no-op
continue
json_schema: Optional[dict] = None
schema_name: str = ""
description: Optional[str] = None
if "response_schema" in value:
json_schema = value["response_schema"]
schema_name = "json_tool_call"
elif "json_schema" in value:
json_schema = value["json_schema"]["schema"]
schema_name = value["json_schema"]["name"]
description = value["json_schema"].get("description")
if "type" in value and value["type"] == "text":
continue
"""
Follow similar approach to anthropic - translate to a single tool call.
@ -218,12 +239,14 @@ class AmazonConverseConfig(BaseConfig):
- You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool
- Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model's perspective.
"""
_tool_choice = {"name": schema_name, "type": "tool"}
_tool = self._create_json_tool_call_for_response_format(
json_schema=json_schema,
schema_name=schema_name if schema_name != "" else "json_tool_call",
description=description,
)
optional_params = self._add_tools_to_optional_params(
optional_params=optional_params, tools=[_tool]
)
optional_params["tools"] = [_tool]
if litellm.utils.supports_tool_choice(
model=model, custom_llm_provider=self.custom_llm_provider
):
@ -249,15 +272,18 @@ class AmazonConverseConfig(BaseConfig):
optional_params["temperature"] = value
if param == "top_p":
optional_params["topP"] = value
if param == "tools":
optional_params["tools"] = value
if param == "tools" and isinstance(value, list):
optional_params = self._add_tools_to_optional_params(
optional_params=optional_params, tools=value
)
if param == "tool_choice":
_tool_choice_value = self.map_tool_choice_values(
model=model, tool_choice=value, drop_params=drop_params # type: ignore
)
if _tool_choice_value is not None:
optional_params["tool_choice"] = _tool_choice_value
if param == "thinking":
optional_params["thinking"] = value
return optional_params
@overload
@ -545,6 +571,37 @@ class AmazonConverseConfig(BaseConfig):
encoding=encoding,
)
def _transform_reasoning_content(
self, reasoning_content_blocks: List[BedrockConverseReasoningContentBlock]
) -> str:
"""
Extract the reasoning text from the reasoning content blocks
Ensures the output is compatible with deepseek-style `reasoning_content`.
"""
reasoning_content_str = ""
for block in reasoning_content_blocks:
if "reasoningText" in block:
reasoning_content_str += block["reasoningText"]["text"]
return reasoning_content_str
def _transform_thinking_blocks(
self, thinking_blocks: List[BedrockConverseReasoningContentBlock]
) -> List[ChatCompletionThinkingBlock]:
"""Return a consistent format for thinking blocks between Anthropic and Bedrock."""
thinking_blocks_list: List[ChatCompletionThinkingBlock] = []
for block in thinking_blocks:
if "reasoningText" in block:
_thinking_block = ChatCompletionThinkingBlock(type="thinking")
_text = block["reasoningText"].get("text")
_signature = block["reasoningText"].get("signature")
if _text is not None:
_thinking_block["thinking"] = _text
if _signature is not None:
_thinking_block["signature"] = _signature
thinking_blocks_list.append(_thinking_block)
return thinking_blocks_list
def _transform_response(
self,
model: str,
@ -618,6 +675,10 @@ class AmazonConverseConfig(BaseConfig):
chat_completion_message: ChatCompletionResponseMessage = {"role": "assistant"}
content_str = ""
tools: List[ChatCompletionToolCallChunk] = []
reasoningContentBlocks: Optional[List[BedrockConverseReasoningContentBlock]] = (
None
)
if message is not None:
for idx, content in enumerate(message["content"]):
"""
@ -644,8 +705,22 @@ class AmazonConverseConfig(BaseConfig):
index=idx,
)
tools.append(_tool_response_chunk)
chat_completion_message["content"] = content_str
if "reasoningContent" in content:
if reasoningContentBlocks is None:
reasoningContentBlocks = []
reasoningContentBlocks.append(content["reasoningContent"])
if reasoningContentBlocks is not None:
chat_completion_message["provider_specific_fields"] = {
"reasoningContentBlocks": reasoningContentBlocks,
}
chat_completion_message["reasoning_content"] = (
self._transform_reasoning_content(reasoningContentBlocks)
)
chat_completion_message["thinking_blocks"] = (
self._transform_thinking_blocks(reasoningContentBlocks)
)
chat_completion_message["content"] = content_str
if json_mode is True and tools is not None and len(tools) == 1:
# to support 'json_schema' logic on bedrock models
json_mode_content_str: Optional[str] = tools[0]["function"].get("arguments")
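A standalone sketch of what `_transform_reasoning_content` and `_transform_thinking_blocks` above produce from Bedrock Converse `reasoningContent` blocks; the block contents are illustrative:
blocks = [
    {"reasoningText": {"text": "Comparing the two options. ", "signature": "sig-abc"}},
    {"reasoningText": {"text": "Option A is cheaper."}},
]

# deepseek-style flat string
reasoning_content = "".join(
    b["reasoningText"]["text"] for b in blocks if "reasoningText" in b
)

# Anthropic-style thinking blocks
thinking_blocks = []
for b in blocks:
    if "reasoningText" in b:
        block = {"type": "thinking", "thinking": b["reasoningText"].get("text", "")}
        if "signature" in b["reasoningText"]:
            block["signature"] = b["reasoningText"]["signature"]
        thinking_blocks.append(block)

assert reasoning_content == "Comparing the two options. Option A is cheaper."
assert thinking_blocks[0]["signature"] == "sig-abc"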

View file

@ -26,7 +26,6 @@ import httpx # type: ignore
import litellm
from litellm import verbose_logger
from litellm._logging import print_verbose
from litellm.caching.caching import InMemoryCache
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.litellm_core_utils.litellm_logging import Logging
@ -51,13 +50,19 @@ from litellm.llms.custom_httpx.http_handler import (
)
from litellm.types.llms.bedrock import *
from litellm.types.llms.openai import (
ChatCompletionThinkingBlock,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionUsageBlock,
)
from litellm.types.utils import ChatCompletionMessageToolCall, Choices
from litellm.types.utils import ChatCompletionMessageToolCall, Choices, Delta
from litellm.types.utils import GenericStreamingChunk as GChunk
from litellm.types.utils import ModelResponse, ModelResponseStream, Usage
from litellm.types.utils import (
ModelResponse,
ModelResponseStream,
StreamingChoices,
Usage,
)
from litellm.utils import CustomStreamWrapper, get_secret
from ..base_aws_llm import BaseAWSLLM
@ -212,7 +217,6 @@ async def make_call(
api_key="",
data=data,
messages=messages,
print_verbose=print_verbose,
encoding=litellm.encoding,
) # type: ignore
completion_stream: Any = MockResponseIterator(
@ -222,6 +226,7 @@ async def make_call(
decoder: AWSEventStreamDecoder = AmazonAnthropicClaudeStreamDecoder(
model=model,
sync_stream=False,
json_mode=json_mode,
)
completion_stream = decoder.aiter_bytes(
response.aiter_bytes(chunk_size=1024)
@ -298,7 +303,6 @@ def make_sync_call(
api_key="",
data=data,
messages=messages,
print_verbose=print_verbose,
encoding=litellm.encoding,
) # type: ignore
completion_stream: Any = MockResponseIterator(
@ -308,6 +312,7 @@ def make_sync_call(
decoder: AWSEventStreamDecoder = AmazonAnthropicClaudeStreamDecoder(
model=model,
sync_stream=True,
json_mode=json_mode,
)
completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024))
elif bedrock_invoke_provider == "deepseek_r1":
@ -525,7 +530,7 @@ class BedrockLLM(BaseAWSLLM):
].message.tool_calls:
_tool_call = {**tool_call.dict(), "index": 0}
_tool_calls.append(_tool_call)
delta_obj = litellm.utils.Delta(
delta_obj = Delta(
content=getattr(
model_response.choices[0].message, "content", None
),
@ -1146,27 +1151,6 @@ class BedrockLLM(BaseAWSLLM):
)
return streaming_response
@staticmethod
def get_bedrock_invoke_provider(
model: str,
) -> Optional[litellm.BEDROCK_INVOKE_PROVIDERS_LITERAL]:
"""
Helper function to get the bedrock provider from the model
handles 2 scenarios:
1. model=anthropic.claude-3-5-sonnet-20240620-v1:0 -> Returns `anthropic`
2. model=llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n -> Returns `llama`
"""
_split_model = model.split(".")[0]
if _split_model in get_args(litellm.BEDROCK_INVOKE_PROVIDERS_LITERAL):
return cast(litellm.BEDROCK_INVOKE_PROVIDERS_LITERAL, _split_model)
# If not a known provider, check for pattern with two slashes
provider = BedrockLLM._get_provider_from_model_path(model)
if provider is not None:
return provider
return None
@staticmethod
def _get_provider_from_model_path(
model_path: str,
@ -1258,14 +1242,40 @@ class AWSEventStreamDecoder:
return True
return False
def converse_chunk_parser(self, chunk_data: dict) -> GChunk:
def extract_reasoning_content_str(
self, reasoning_content_block: BedrockConverseReasoningContentBlockDelta
) -> Optional[str]:
if "text" in reasoning_content_block:
return reasoning_content_block["text"]
return None
def translate_thinking_blocks(
self, thinking_block: BedrockConverseReasoningContentBlockDelta
) -> Optional[List[ChatCompletionThinkingBlock]]:
"""
Translate a Bedrock reasoningContent delta into Anthropic-style thinking blocks
"""
thinking_blocks_list: List[ChatCompletionThinkingBlock] = []
_thinking_block = ChatCompletionThinkingBlock(type="thinking")
if "text" in thinking_block:
_thinking_block["thinking"] = thinking_block["text"]
elif "signature" in thinking_block:
_thinking_block["signature"] = thinking_block["signature"]
_thinking_block["thinking"] = "" # consistent with anthropic response
thinking_blocks_list.append(_thinking_block)
return thinking_blocks_list
def converse_chunk_parser(self, chunk_data: dict) -> ModelResponseStream:
try:
verbose_logger.debug("\n\nRaw Chunk: {}\n\n".format(chunk_data))
text = ""
tool_use: Optional[ChatCompletionToolCallChunk] = None
is_finished = False
finish_reason = ""
usage: Optional[ChatCompletionUsageBlock] = None
provider_specific_fields: dict = {}
reasoning_content: Optional[str] = None
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
index = int(chunk_data.get("contentBlockIndex", 0))
if "start" in chunk_data:
@ -1305,6 +1315,22 @@ class AWSEventStreamDecoder:
},
"index": index,
}
elif "reasoningContent" in delta_obj:
provider_specific_fields = {
"reasoningContent": delta_obj["reasoningContent"],
}
reasoning_content = self.extract_reasoning_content_str(
delta_obj["reasoningContent"]
)
thinking_blocks = self.translate_thinking_blocks(
delta_obj["reasoningContent"]
)
if (
thinking_blocks
and len(thinking_blocks) > 0
and reasoning_content is None
):
reasoning_content = ""  # set to an empty string so the field is present, consistent with Anthropic
elif (
"contentBlockIndex" in chunk_data
): # stop block, no 'start' or 'delta' object
@ -1321,7 +1347,6 @@ class AWSEventStreamDecoder:
}
elif "stopReason" in chunk_data:
finish_reason = map_finish_reason(chunk_data.get("stopReason", "stop"))
is_finished = True
elif "usage" in chunk_data:
usage = ChatCompletionUsageBlock(
prompt_tokens=chunk_data.get("inputTokens", 0),
@ -1329,18 +1354,33 @@ class AWSEventStreamDecoder:
total_tokens=chunk_data.get("totalTokens", 0),
)
response = GChunk(
text=text,
tool_use=tool_use,
is_finished=is_finished,
finish_reason=finish_reason,
usage=usage,
index=index,
)
model_response_provider_specific_fields = {}
if "trace" in chunk_data:
trace = chunk_data.get("trace")
response["provider_specific_fields"] = {"trace": trace}
model_response_provider_specific_fields["trace"] = trace
response = ModelResponseStream(
choices=[
StreamingChoices(
finish_reason=finish_reason,
index=index,
delta=Delta(
content=text,
role="assistant",
tool_calls=[tool_use] if tool_use else None,
provider_specific_fields=(
provider_specific_fields
if provider_specific_fields
else None
),
thinking_blocks=thinking_blocks,
reasoning_content=reasoning_content,
),
)
],
usage=usage,
provider_specific_fields=model_response_provider_specific_fields,
)
return response
except Exception as e:
raise Exception("Received streaming error - {}".format(str(e)))
@ -1474,6 +1514,7 @@ class AmazonAnthropicClaudeStreamDecoder(AWSEventStreamDecoder):
self,
model: str,
sync_stream: bool,
json_mode: Optional[bool] = None,
) -> None:
"""
Child class of AWSEventStreamDecoder that handles the streaming response from the Anthropic family of models
@ -1484,9 +1525,10 @@ class AmazonAnthropicClaudeStreamDecoder(AWSEventStreamDecoder):
self.anthropic_model_response_iterator = AnthropicModelResponseIterator(
streaming_response=None,
sync_stream=sync_stream,
json_mode=json_mode,
)
def _chunk_parser(self, chunk_data: dict) -> GChunk:
def _chunk_parser(self, chunk_data: dict) -> ModelResponseStream:
return self.anthropic_model_response_iterator.chunk_parser(chunk=chunk_data)

View file

@ -3,8 +3,10 @@ from typing import Optional
import litellm
from .base_invoke_transformation import AmazonInvokeConfig
class AmazonAnthropicConfig:
class AmazonAnthropicConfig(AmazonInvokeConfig):
"""
Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=claude
@ -57,9 +59,7 @@ class AmazonAnthropicConfig:
and v is not None
}
def get_supported_openai_params(
self,
):
def get_supported_openai_params(self, model: str):
return [
"max_tokens",
"max_completion_tokens",
@ -69,7 +69,13 @@ class AmazonAnthropicConfig:
"stream",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
def map_openai_params(
self,
non_default_params: dict,
optional_params: dict,
model: str,
drop_params: bool,
):
for param, value in non_default_params.items():
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["max_tokens_to_sample"] = value

View file

@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Any, List, Optional
import httpx
import litellm
from litellm.llms.anthropic.chat.transformation import AnthropicConfig
from litellm.llms.bedrock.chat.invoke_transformations.base_invoke_transformation import (
AmazonInvokeConfig,
)
@ -17,7 +17,7 @@ else:
LiteLLMLoggingObj = Any
class AmazonAnthropicClaude3Config(AmazonInvokeConfig):
class AmazonAnthropicClaude3Config(AmazonInvokeConfig, AnthropicConfig):
"""
Reference:
https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=claude
@ -28,18 +28,8 @@ class AmazonAnthropicClaude3Config(AmazonInvokeConfig):
anthropic_version: str = "bedrock-2023-05-31"
def get_supported_openai_params(self, model: str):
return [
"max_tokens",
"max_completion_tokens",
"tools",
"tool_choice",
"stream",
"stop",
"temperature",
"top_p",
"extra_headers",
]
def get_supported_openai_params(self, model: str) -> List[str]:
return AnthropicConfig.get_supported_openai_params(self, model)
def map_openai_params(
self,
@ -47,21 +37,14 @@ class AmazonAnthropicClaude3Config(AmazonInvokeConfig):
optional_params: dict,
model: str,
drop_params: bool,
):
for param, value in non_default_params.items():
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["max_tokens"] = value
if param == "tools":
optional_params["tools"] = value
if param == "stream":
optional_params["stream"] = value
if param == "stop":
optional_params["stop_sequences"] = value
if param == "temperature":
optional_params["temperature"] = value
if param == "top_p":
optional_params["top_p"] = value
return optional_params
) -> dict:
return AnthropicConfig.map_openai_params(
self,
non_default_params,
optional_params,
model,
drop_params,
)
def transform_request(
self,
@ -71,7 +54,8 @@ class AmazonAnthropicClaude3Config(AmazonInvokeConfig):
litellm_params: dict,
headers: dict,
) -> dict:
_anthropic_request = litellm.AnthropicConfig().transform_request(
_anthropic_request = AnthropicConfig.transform_request(
self,
model=model,
messages=messages,
optional_params=optional_params,
@ -80,6 +64,7 @@ class AmazonAnthropicClaude3Config(AmazonInvokeConfig):
)
_anthropic_request.pop("model", None)
_anthropic_request.pop("stream", None)
if "anthropic_version" not in _anthropic_request:
_anthropic_request["anthropic_version"] = self.anthropic_version
@ -99,7 +84,8 @@ class AmazonAnthropicClaude3Config(AmazonInvokeConfig):
api_key: Optional[str] = None,
json_mode: Optional[bool] = None,
) -> ModelResponse:
return litellm.AnthropicConfig().transform_response(
return AnthropicConfig.transform_response(
self,
model=model,
raw_response=raw_response,
model_response=model_response,

View file

@ -73,7 +73,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
def get_complete_url(
self,
api_base: str,
api_base: Optional[str],
model: str,
optional_params: dict,
stream: Optional[bool] = None,
@ -461,6 +461,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
data: dict,
messages: list,
client: Optional[AsyncHTTPHandler] = None,
json_mode: Optional[bool] = None,
) -> CustomStreamWrapper:
streaming_response = CustomStreamWrapper(
completion_stream=None,
@ -475,6 +476,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
logging_obj=logging_obj,
fake_stream=True if "ai21" in api_base else False,
bedrock_invoke_provider=self.get_bedrock_invoke_provider(model),
json_mode=json_mode,
),
model=model,
custom_llm_provider="bedrock",
@ -493,6 +495,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
data: dict,
messages: list,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
json_mode: Optional[bool] = None,
) -> CustomStreamWrapper:
if client is None or isinstance(client, AsyncHTTPHandler):
client = _get_httpx_client(params={})
@ -509,6 +512,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
logging_obj=logging_obj,
fake_stream=True if "ai21" in api_base else False,
bedrock_invoke_provider=self.get_bedrock_invoke_provider(model),
json_mode=json_mode,
),
model=model,
custom_llm_provider="bedrock",
@ -534,7 +538,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
"""
Helper function to get the bedrock provider from the model
handles 3 scenarions:
handles 4 scenarios:
1. model=invoke/anthropic.claude-3-5-sonnet-20240620-v1:0 -> Returns `anthropic`
2. model=anthropic.claude-3-5-sonnet-20240620-v1:0 -> Returns `anthropic`
3. model=llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n -> Returns `llama`
@ -555,6 +559,10 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
# check if provider == "nova"
if "nova" in model:
return "nova"
for provider in get_args(litellm.BEDROCK_INVOKE_PROVIDERS_LITERAL):
if provider in model:
return provider
return None
@staticmethod

View file

@ -10,6 +10,8 @@ import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.litellm_logging import Logging as LitellmLogging
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
_get_httpx_client,
get_async_httpx_client,
)
@ -51,6 +53,7 @@ class BedrockImageGeneration(BaseAWSLLM):
aimg_generation: bool = False,
api_base: Optional[str] = None,
extra_headers: Optional[dict] = None,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
):
prepared_request = self._prepare_request(
model=model,
@ -69,9 +72,15 @@ class BedrockImageGeneration(BaseAWSLLM):
logging_obj=logging_obj,
prompt=prompt,
model_response=model_response,
client=(
client
if client is not None and isinstance(client, AsyncHTTPHandler)
else None
),
)
client = _get_httpx_client()
if client is None or not isinstance(client, HTTPHandler):
client = _get_httpx_client()
try:
response = client.post(url=prepared_request.endpoint_url, headers=prepared_request.prepped.headers, data=prepared_request.body) # type: ignore
response.raise_for_status()
@ -99,13 +108,14 @@ class BedrockImageGeneration(BaseAWSLLM):
logging_obj: LitellmLogging,
prompt: str,
model_response: ImageResponse,
client: Optional[AsyncHTTPHandler] = None,
) -> ImageResponse:
"""
Asynchronous handler for bedrock image generation
Awaits the response from the bedrock image generation endpoint
"""
async_client = get_async_httpx_client(
async_client = client or get_async_httpx_client(
llm_provider=litellm.LlmProviders.BEDROCK,
params={"timeout": timeout},
)

View file

@ -11,6 +11,7 @@ from litellm.llms.base_llm.chat.transformation import (
BaseLLMException,
LiteLLMLoggingObj,
)
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import AllMessageValues
from litellm.types.utils import (
ChatCompletionToolCallChunk,
@ -75,11 +76,16 @@ class CloudflareChatConfig(BaseConfig):
def get_complete_url(
self,
api_base: str,
api_base: Optional[str],
model: str,
optional_params: dict,
stream: Optional[bool] = None,
) -> str:
if api_base is None:
account_id = get_secret_str("CLOUDFLARE_ACCOUNT_ID")
api_base = (
f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/"
)
return api_base + model
def get_supported_openai_params(self, model: str) -> List[str]:
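A standalone sketch of the fallback above: when no api_base is supplied, the Cloudflare Workers AI run endpoint is built from CLOUDFLARE_ACCOUNT_ID and the model name is appended. The account id and model are illustrative placeholders:
import os

def cloudflare_complete_url(api_base, model):
    if api_base is None:
        account_id = os.getenv("CLOUDFLARE_ACCOUNT_ID", "<account-id>")
        api_base = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/"
    return api_base + model

print(cloudflare_complete_url(None, "@cf/meta/llama-3.1-8b-instruct"))
# -> https://api.cloudflare.com/client/v4/accounts/<account-id>/ai/run/@cf/meta/llama-3.1-8b-instruct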

View file

@ -84,7 +84,9 @@ class CodestralTextCompletionConfig(OpenAITextCompletionConfig):
finish_reason = None
logprobs = None
chunk_data = chunk_data.replace("data:", "")
chunk_data = (
litellm.CustomStreamWrapper._strip_sse_data_from_chunk(chunk_data) or ""
)
chunk_data = chunk_data.strip()
if len(chunk_data) == 0 or chunk_data == "[DONE]":
return {

View file

@ -159,6 +159,7 @@ class BaseLLMHTTPHandler:
encoding: Any,
api_key: Optional[str] = None,
client: Optional[AsyncHTTPHandler] = None,
json_mode: bool = False,
):
if client is None:
async_httpx_client = get_async_httpx_client(
@ -190,6 +191,7 @@ class BaseLLMHTTPHandler:
optional_params=optional_params,
litellm_params=litellm_params,
encoding=encoding,
json_mode=json_mode,
)
def completion(
@ -211,6 +213,7 @@ class BaseLLMHTTPHandler:
headers: Optional[dict] = {},
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
):
json_mode: bool = optional_params.pop("json_mode", False)
provider_config = ProviderConfigManager.get_provider_chat_config(
model=model, provider=litellm.LlmProviders(custom_llm_provider)
@ -286,6 +289,7 @@ class BaseLLMHTTPHandler:
else None
),
litellm_params=litellm_params,
json_mode=json_mode,
)
else:
@ -309,6 +313,7 @@ class BaseLLMHTTPHandler:
if client is not None and isinstance(client, AsyncHTTPHandler)
else None
),
json_mode=json_mode,
)
if stream is True:
@ -327,6 +332,7 @@ class BaseLLMHTTPHandler:
data=data,
messages=messages,
client=client,
json_mode=json_mode,
)
completion_stream, headers = self.make_sync_call(
provider_config=provider_config,
@ -380,6 +386,7 @@ class BaseLLMHTTPHandler:
optional_params=optional_params,
litellm_params=litellm_params,
encoding=encoding,
json_mode=json_mode,
)
def make_sync_call(
@ -453,6 +460,7 @@ class BaseLLMHTTPHandler:
litellm_params: dict,
fake_stream: bool = False,
client: Optional[AsyncHTTPHandler] = None,
json_mode: Optional[bool] = None,
):
if provider_config.has_custom_stream_wrapper is True:
return provider_config.get_async_custom_stream_wrapper(
@ -464,6 +472,7 @@ class BaseLLMHTTPHandler:
data=data,
messages=messages,
client=client,
json_mode=json_mode,
)
completion_stream, _response_headers = await self.make_async_call_stream_helper(
@ -720,7 +729,7 @@ class BaseLLMHTTPHandler:
api_base: Optional[str] = None,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
) -> RerankResponse:
# get config from model, custom llm provider
headers = provider_config.validate_environment(
api_key=api_key,
@ -864,7 +873,7 @@ class BaseLLMHTTPHandler:
elif isinstance(audio_file, bytes):
# Assume it's already binary data
binary_data = audio_file
elif isinstance(audio_file, io.BufferedReader):
elif isinstance(audio_file, io.BufferedReader) or isinstance(audio_file, io.BytesIO):
# Handle file-like objects
binary_data = audio_file.read()

View file

@ -89,7 +89,7 @@ class ModelResponseIterator:
raise RuntimeError(f"Error receiving chunk from stream: {e}")
try:
chunk = chunk.replace("data:", "")
chunk = litellm.CustomStreamWrapper._strip_sse_data_from_chunk(chunk) or ""
chunk = chunk.strip()
if len(chunk) > 0:
json_chunk = json.loads(chunk)
@ -134,7 +134,7 @@ class ModelResponseIterator:
raise RuntimeError(f"Error receiving chunk from stream: {e}")
try:
chunk = chunk.replace("data:", "")
chunk = litellm.CustomStreamWrapper._strip_sse_data_from_chunk(chunk) or ""
chunk = chunk.strip()
if chunk == "[DONE]":
raise StopAsyncIteration

View file

@ -34,3 +34,21 @@ class DeepSeekChatConfig(OpenAIGPTConfig):
) # type: ignore
dynamic_api_key = api_key or get_secret_str("DEEPSEEK_API_KEY")
return api_base, dynamic_api_key
def get_complete_url(
self,
api_base: Optional[str],
model: str,
optional_params: dict,
stream: Optional[bool] = None,
) -> str:
"""
If api_base is not provided, default to the DeepSeek beta base URL; in either case, ensure the path ends with /chat/completions.
"""
if not api_base:
api_base = "https://api.deepseek.com/beta"
if not api_base.endswith("/chat/completions"):
api_base = f"{api_base}/chat/completions"
return api_base
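A standalone sketch of the URL resolution above: with no api_base set, requests go to the beta base URL, and an explicit api_base only gains the /chat/completions suffix when it is missing:
def deepseek_complete_url(api_base):
    if not api_base:
        api_base = "https://api.deepseek.com/beta"
    if not api_base.endswith("/chat/completions"):
        api_base = f"{api_base}/chat/completions"
    return api_base

assert deepseek_complete_url(None) == "https://api.deepseek.com/beta/chat/completions"
assert deepseek_complete_url("https://api.deepseek.com") == "https://api.deepseek.com/chat/completions"
assert deepseek_complete_url("https://api.deepseek.com/chat/completions") == "https://api.deepseek.com/chat/completions"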

View file

@ -90,6 +90,11 @@ class FireworksAIConfig(OpenAIGPTConfig):
) -> dict:
supported_openai_params = self.get_supported_openai_params(model=model)
is_tools_set = any(
param == "tools" and value is not None
for param, value in non_default_params.items()
)
for param, value in non_default_params.items():
if param == "tool_choice":
if value == "required":
@ -98,18 +103,30 @@ class FireworksAIConfig(OpenAIGPTConfig):
else:
# pass through the value of tool choice
optional_params["tool_choice"] = value
elif (
param == "response_format" and value.get("type", None) == "json_schema"
):
optional_params["response_format"] = {
"type": "json_object",
"schema": value["json_schema"]["schema"],
}
elif param == "response_format":
if (
is_tools_set
): # fireworks ai doesn't support tools and response_format together
optional_params = self._add_response_format_to_tools(
optional_params=optional_params,
value=value,
is_response_format_supported=False,
enforce_tool_choice=False, # tools and response_format are both set, don't enforce tool_choice
)
elif "json_schema" in value:
optional_params["response_format"] = {
"type": "json_object",
"schema": value["json_schema"]["schema"],
}
else:
optional_params["response_format"] = value
elif param == "max_completion_tokens":
optional_params["max_tokens"] = value
elif param in supported_openai_params:
if value is not None:
optional_params[param] = value
return optional_params
def _add_transform_inline_image_block(

View file

@ -114,12 +114,16 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
if element.get("type") == "image_url":
img_element = element
_image_url: Optional[str] = None
format: Optional[str] = None
if isinstance(img_element.get("image_url"), dict):
_image_url = img_element["image_url"].get("url") # type: ignore
format = img_element["image_url"].get("format") # type: ignore
else:
_image_url = img_element.get("image_url") # type: ignore
if _image_url and "https://" in _image_url:
image_obj = convert_to_anthropic_image_obj(_image_url)
image_obj = convert_to_anthropic_image_obj(
_image_url, format=format
)
img_element["image_url"] = ( # type: ignore
convert_generic_image_chunk_to_openai_image_obj(
image_obj

View file

@ -353,7 +353,7 @@ class OllamaConfig(BaseConfig):
def get_complete_url(
self,
api_base: str,
api_base: Optional[str],
model: str,
optional_params: dict,
stream: Optional[bool] = None,
@ -365,6 +365,8 @@ class OllamaConfig(BaseConfig):
Some providers need `model` in `api_base`
"""
if api_base is None:
api_base = "http://localhost:11434"
if api_base.endswith("/api/generate"):
url = api_base
else:

View file

@ -1,7 +1,7 @@
import json
import time
import uuid
from typing import Any, List, Optional
from typing import Any, List, Optional, Union
import aiohttp
import httpx
@ -9,7 +9,11 @@ from pydantic import BaseModel
import litellm
from litellm import verbose_logger
from litellm.llms.custom_httpx.http_handler import get_async_httpx_client
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
get_async_httpx_client,
)
from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig
from litellm.types.llms.ollama import OllamaToolCall, OllamaToolCallFunction
from litellm.types.llms.openai import ChatCompletionAssistantToolCall
@ -205,6 +209,7 @@ def get_ollama_response( # noqa: PLR0915
api_key: Optional[str] = None,
acompletion: bool = False,
encoding=None,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
):
if api_base.endswith("/api/chat"):
url = api_base
@ -301,7 +306,11 @@ def get_ollama_response( # noqa: PLR0915
headers: Optional[dict] = None
if api_key is not None:
headers = {"Authorization": "Bearer {}".format(api_key)}
response = litellm.module_level_client.post(
sync_client = litellm.module_level_client
if client is not None and isinstance(client, HTTPHandler):
sync_client = client
response = sync_client.post(
url=url,
json=data,
headers=headers,
@ -508,6 +517,7 @@ async def ollama_async_streaming(
verbose_logger.exception(
    "LiteLLM.ollama(): Exception occurred - {}".format(str(e))
)
raise e
async def ollama_acompletion(

View file

@ -20,7 +20,11 @@ from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator
from litellm.llms.base_llm.base_utils import BaseLLMModelInfo
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import AllMessageValues
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionImageObject,
ChatCompletionImageUrlObject,
)
from litellm.types.utils import ModelResponse, ModelResponseStream
from litellm.utils import convert_to_model_response_object
@ -178,6 +182,27 @@ class OpenAIGPTConfig(BaseLLMModelInfo, BaseConfig):
def _transform_messages(
self, messages: List[AllMessageValues], model: str
) -> List[AllMessageValues]:
"""OpenAI no longer supports image_url as a string, so we need to convert it to a dict"""
for message in messages:
message_content = message.get("content")
if message_content and isinstance(message_content, list):
for content_item in message_content:
if content_item.get("type") == "image_url":
content_item = cast(ChatCompletionImageObject, content_item)
if isinstance(content_item["image_url"], str):
content_item["image_url"] = {
"url": content_item["image_url"],
}
elif isinstance(content_item["image_url"], dict):
litellm_specific_params = {"format"}
new_image_url_obj = ChatCompletionImageUrlObject(
**{ # type: ignore
k: v
for k, v in content_item["image_url"].items()
if k not in litellm_specific_params
}
)
content_item["image_url"] = new_image_url_obj
return messages
def transform_request(
@ -263,7 +288,7 @@ class OpenAIGPTConfig(BaseLLMModelInfo, BaseConfig):
def get_complete_url(
self,
api_base: str,
api_base: Optional[str],
model: str,
optional_params: dict,
stream: Optional[bool] = None,
@ -274,6 +299,8 @@ class OpenAIGPTConfig(BaseLLMModelInfo, BaseConfig):
Returns:
str: The complete URL for the API call.
"""
if api_base is None:
api_base = "https://api.openai.com"
endpoint = "chat/completions"
# Remove trailing slash from api_base if present
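A standalone sketch of the normalization done in `_transform_messages` above: string image_urls become `{"url": ...}` objects and litellm-only keys such as `format` are stripped before the request is sent. Values are illustrative:
LITELLM_SPECIFIC_PARAMS = {"format"}

def normalize_image_url(content_item: dict) -> dict:
    if content_item.get("type") == "image_url":
        image_url = content_item["image_url"]
        if isinstance(image_url, str):
            content_item["image_url"] = {"url": image_url}
        elif isinstance(image_url, dict):
            content_item["image_url"] = {
                k: v for k, v in image_url.items() if k not in LITELLM_SPECIFIC_PARAMS
            }
    return content_item

assert normalize_image_url({"type": "image_url", "image_url": "https://example.com/cat.png"}) == {
    "type": "image_url",
    "image_url": {"url": "https://example.com/cat.png"},
}
assert normalize_image_url(
    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png", "format": "image/png"}}
)["image_url"] == {"url": "https://example.com/cat.png"}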

View file

@ -19,6 +19,7 @@ from litellm.litellm_core_utils.get_llm_provider_logic import get_llm_provider
from litellm.types.llms.openai import AllMessageValues, ChatCompletionUserMessage
from litellm.utils import (
supports_function_calling,
supports_parallel_function_calling,
supports_response_schema,
supports_system_messages,
)
@ -76,14 +77,19 @@ class OpenAIOSeriesConfig(OpenAIGPTConfig):
model, custom_llm_provider
)
_supports_response_schema = supports_response_schema(model, custom_llm_provider)
_supports_parallel_tool_calls = supports_parallel_function_calling(
model, custom_llm_provider
)
if not _supports_function_calling:
non_supported_params.append("tools")
non_supported_params.append("tool_choice")
non_supported_params.append("parallel_tool_calls")
non_supported_params.append("function_call")
non_supported_params.append("functions")
if not _supports_parallel_tool_calls:
non_supported_params.append("parallel_tool_calls")
if not _supports_response_schema:
non_supported_params.append("response_format")
@ -146,4 +152,5 @@ class OpenAIOSeriesConfig(OpenAIGPTConfig):
)
messages[i] = new_message # Replace the old message with the new one
messages = super()._transform_messages(messages, model)
return messages

View file

@ -37,6 +37,7 @@ from litellm.llms.custom_httpx.http_handler import _DEFAULT_TTL_FOR_HTTPX_CLIENT
from litellm.types.utils import (
EmbeddingResponse,
ImageResponse,
LiteLLMBatch,
ModelResponse,
ModelResponseStream,
)
@ -1755,9 +1756,9 @@ class OpenAIBatchesAPI(BaseLLM):
self,
create_batch_data: CreateBatchRequest,
openai_client: AsyncOpenAI,
) -> Batch:
) -> LiteLLMBatch:
response = await openai_client.batches.create(**create_batch_data)
return response
return LiteLLMBatch(**response.model_dump())
def create_batch(
self,
@ -1769,7 +1770,7 @@ class OpenAIBatchesAPI(BaseLLM):
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
) -> Union[Batch, Coroutine[Any, Any, Batch]]:
) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
@ -1792,17 +1793,18 @@ class OpenAIBatchesAPI(BaseLLM):
return self.acreate_batch( # type: ignore
create_batch_data=create_batch_data, openai_client=openai_client
)
response = openai_client.batches.create(**create_batch_data)
return response
response = cast(OpenAI, openai_client).batches.create(**create_batch_data)
return LiteLLMBatch(**response.model_dump())
async def aretrieve_batch(
self,
retrieve_batch_data: RetrieveBatchRequest,
openai_client: AsyncOpenAI,
) -> Batch:
) -> LiteLLMBatch:
verbose_logger.debug("retrieving batch, args= %s", retrieve_batch_data)
response = await openai_client.batches.retrieve(**retrieve_batch_data)
return response
return LiteLLMBatch(**response.model_dump())
def retrieve_batch(
self,
@ -1837,8 +1839,8 @@ class OpenAIBatchesAPI(BaseLLM):
return self.aretrieve_batch( # type: ignore
retrieve_batch_data=retrieve_batch_data, openai_client=openai_client
)
response = openai_client.batches.retrieve(**retrieve_batch_data)
return response
response = cast(OpenAI, openai_client).batches.retrieve(**retrieve_batch_data)
return LiteLLMBatch(**response.model_dump())
async def acancel_batch(
self,

View file

@ -6,7 +6,16 @@ Calls done in OpenAI/openai.py as OpenRouter is openai-compatible.
Docs: https://openrouter.ai/docs/parameters
"""
from typing import Any, AsyncIterator, Iterator, Optional, Union
import httpx
from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.types.utils import ModelResponse, ModelResponseStream
from ...openai.chat.gpt_transformation import OpenAIGPTConfig
from ..common_utils import OpenRouterException
class OpenrouterConfig(OpenAIGPTConfig):
@ -37,3 +46,43 @@ class OpenrouterConfig(OpenAIGPTConfig):
extra_body # openai client supports `extra_body` param
)
return mapped_openai_params
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:
return OpenRouterException(
message=error_message,
status_code=status_code,
headers=headers,
)
def get_model_response_iterator(
self,
streaming_response: Union[Iterator[str], AsyncIterator[str], ModelResponse],
sync_stream: bool,
json_mode: Optional[bool] = False,
) -> Any:
return OpenRouterChatCompletionStreamingHandler(
streaming_response=streaming_response,
sync_stream=sync_stream,
json_mode=json_mode,
)
class OpenRouterChatCompletionStreamingHandler(BaseModelResponseIterator):
def chunk_parser(self, chunk: dict) -> ModelResponseStream:
try:
new_choices = []
for choice in chunk["choices"]:
choice["delta"]["reasoning_content"] = choice["delta"].get("reasoning")
new_choices.append(choice)
return ModelResponseStream(
id=chunk["id"],
object="chat.completion.chunk",
created=chunk["created"],
model=chunk["model"],
choices=new_choices,
)
except Exception as e:
raise e
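A standalone sketch of the field mapping performed in `chunk_parser` above: OpenRouter streams reasoning text under `delta.reasoning`, which is copied into `reasoning_content` so consumers see the same field as other providers. The chunk values are illustrative:
chunk = {
    "id": "gen-123",
    "created": 1710000000,
    "model": "deepseek/deepseek-r1",
    "choices": [{"index": 0, "delta": {"content": None, "reasoning": "Considering the constraints..."}}],
}

for choice in chunk["choices"]:
    choice["delta"]["reasoning_content"] = choice["delta"].get("reasoning")

assert chunk["choices"][0]["delta"]["reasoning_content"] == "Considering the constraints..."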

View file

@ -0,0 +1,5 @@
from litellm.llms.base_llm.chat.transformation import BaseLLMException
class OpenRouterException(BaseLLMException):
pass

View file

@ -138,7 +138,7 @@ class ReplicateConfig(BaseConfig):
def get_complete_url(
self,
api_base: str,
api_base: Optional[str],
model: str,
optional_params: dict,
stream: Optional[bool] = None,

View file

@ -3,6 +3,7 @@ from typing import AsyncIterator, Iterator, List, Optional, Union
import httpx
import litellm
from litellm import verbose_logger
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.types.utils import GenericStreamingChunk as GChunk
@ -78,7 +79,11 @@ class AWSEventStreamDecoder:
message = self._parse_message_from_event(event)
if message:
# remove data: prefix and "\n\n" at the end
message = message.replace("data:", "").replace("\n\n", "")
message = (
litellm.CustomStreamWrapper._strip_sse_data_from_chunk(message)
or ""
)
message = message.replace("\n\n", "")
# Accumulate JSON data
accumulated_json += message
@ -127,7 +132,11 @@ class AWSEventStreamDecoder:
if message:
verbose_logger.debug("sagemaker parsed chunk bytes %s", message)
# remove data: prefix and "\n\n" at the end
message = message.replace("data:", "").replace("\n\n", "")
message = (
litellm.CustomStreamWrapper._strip_sse_data_from_chunk(message)
or ""
)
message = message.replace("\n\n", "")
# Accumulate JSON data
accumulated_json += message
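A standalone sketch (an assumption about what `CustomStreamWrapper._strip_sse_data_from_chunk` is meant to do, not its actual implementation) of why the raw `replace("data:", "")` was swapped out here: replace removes the token anywhere in the payload, while only the leading SSE `data:` field prefix should be stripped:
from typing import Optional

def strip_sse_data_prefix(chunk: Optional[str]) -> Optional[str]:
    if chunk is None:
        return None
    stripped = chunk.lstrip()
    if stripped.startswith("data:"):
        return stripped[len("data:"):].lstrip()
    return chunk

raw = 'data: {"text": "metadata: value"}'
assert strip_sse_data_prefix(raw) == '{"text": "metadata: value"}'
# the old approach mangles the payload itself:
assert raw.replace("data:", "") == ' {"text": "meta value"}'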

View file

@ -433,6 +433,10 @@ class SagemakerLLM(BaseAWSLLM):
"messages": messages,
}
prepared_request = await asyncified_prepare_request(**prepared_request_args)
if model_id is not None: # Fixes https://github.com/BerriAI/litellm/issues/8889
prepared_request.headers.update(
{"X-Amzn-SageMaker-Inference-Component": model_id}
)
completion_stream = await self.make_async_call(
api_base=prepared_request.url,
headers=prepared_request.headers, # type: ignore
@ -511,7 +515,7 @@ class SagemakerLLM(BaseAWSLLM):
# Add model_id as InferenceComponentName header
# boto3 doc: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html
prepared_request.headers.update(
{"X-Amzn-SageMaker-Inference-Componen": model_id}
{"X-Amzn-SageMaker-Inference-Component": model_id}
)
# make async httpx post request here
try:

View file

@ -11,7 +11,7 @@ from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig
class SambanovaConfig(OpenAIGPTConfig):
"""
Reference: https://community.sambanova.ai/t/create-chat-completion-api/
Reference: https://docs.sambanova.ai/cloud/api-reference/
Below are the parameters:
"""

Some files were not shown because too many files have changed in this diff.