Merge branch 'main' into feat/friendliai

This commit is contained in:
Wonseok Lee (Jack) 2024-06-21 10:50:03 +09:00 committed by GitHub
commit c4c7d1b367
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
201 changed files with 22438 additions and 13694 deletions

View file

@ -65,6 +65,7 @@ jobs:
pip install "pydantic==2.7.1"
pip install "diskcache==5.6.1"
pip install "Pillow==10.3.0"
pip install "ijson==3.2.3"
- save_cache:
paths:
- ./venv
@ -126,6 +127,7 @@ jobs:
pip install jinja2
pip install tokenizers
pip install openai
pip install ijson
- run:
name: Run tests
command: |
@ -180,6 +182,7 @@ jobs:
pip install numpydoc
pip install prisma
pip install fastapi
pip install ijson
pip install "httpx==0.24.1"
pip install "gunicorn==21.2.0"
pip install "anyio==3.7.1"

10
.github/dependabot.yaml vendored Normal file
View file

@ -0,0 +1,10 @@
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"
groups:
github-actions:
patterns:
- "*"

View file

@ -25,6 +25,11 @@ jobs:
if: github.repository == 'BerriAI/litellm'
runs-on: ubuntu-latest
steps:
-
name: Checkout
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_hash }}
-
name: Set up QEMU
uses: docker/setup-qemu-action@v3
@ -41,12 +46,14 @@ jobs:
name: Build and push
uses: docker/build-push-action@v5
with:
context: .
push: true
tags: litellm/litellm:${{ github.event.inputs.tag || 'latest' }}
-
name: Build and push litellm-database image
uses: docker/build-push-action@v5
with:
context: .
push: true
file: Dockerfile.database
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
@ -54,6 +61,7 @@ jobs:
name: Build and push litellm-spend-logs image
uses: docker/build-push-action@v5
with:
context: .
push: true
file: ./litellm-js/spend-logs/Dockerfile
tags: litellm/litellm-spend_logs:${{ github.event.inputs.tag || 'latest' }}
@ -68,6 +76,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_hash }}
# Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
@ -92,7 +102,7 @@ jobs:
- name: Build and push Docker image
uses: docker/build-push-action@4976231911ebf5f32aad765192d35f942aa48cb8
with:
context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }} # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
labels: ${{ steps.meta.outputs.labels }}
@ -106,6 +116,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_hash }}
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
@ -128,7 +140,7 @@ jobs:
- name: Build and push Database Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with:
context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
context: .
file: Dockerfile.database
push: true
tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
@ -143,6 +155,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_hash }}
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
@ -165,7 +179,7 @@ jobs:
- name: Build and push Database Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with:
context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
context: .
file: ./litellm-js/spend-logs/Dockerfile
push: true
tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
@ -176,6 +190,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_hash }}
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1

1
.gitignore vendored
View file

@ -60,3 +60,4 @@ litellm/proxy/_experimental/out/404/index.html
litellm/proxy/_experimental/out/model_hub/index.html
litellm/proxy/_experimental/out/onboarding/index.html
litellm/tests/log.txt
litellm/tests/langfuse.log

View file

@ -1,4 +1,19 @@
repos:
- repo: local
hooks:
- id: mypy
name: mypy
entry: python3 -m mypy --ignore-missing-imports
language: system
types: [python]
files: ^litellm/
- id: isort
name: isort
entry: isort
language: system
types: [python]
files: litellm/.*\.py
exclude: ^litellm/__init__.py$
- repo: https://github.com/psf/black
rev: 24.2.0
hooks:
@ -16,11 +31,10 @@ repos:
name: Check if files match
entry: python3 ci_cd/check_files_match.py
language: system
- repo: local
hooks:
- id: mypy
name: mypy
entry: python3 -m mypy --ignore-missing-imports
language: system
types: [python]
files: ^litellm/
# - id: check-file-length
# name: Check file length
# entry: python check_file_length.py
# args: ["10000"] # set your desired maximum number of lines
# language: python
# files: litellm/.*\.py
# exclude: ^litellm/tests/

28
check_file_length.py Normal file
View file

@ -0,0 +1,28 @@
import sys
def check_file_length(max_lines, filenames):
bad_files = []
for filename in filenames:
with open(filename, "r") as file:
lines = file.readlines()
if len(lines) > max_lines:
bad_files.append((filename, len(lines)))
return bad_files
if __name__ == "__main__":
max_lines = int(sys.argv[1])
filenames = sys.argv[2:]
bad_files = check_file_length(max_lines, filenames)
if bad_files:
bad_files.sort(
key=lambda x: x[1], reverse=True
) # Sort files by length in descending order
for filename, length in bad_files:
print(f"{filename}: {length} lines")
sys.exit(1)
else:
sys.exit(0)

View file

@ -0,0 +1,110 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Drop Unsupported Params
Drop OpenAI params that aren't supported by your LLM provider.
## Quick Start
```python
import litellm
import os
# set keys
os.environ["COHERE_API_KEY"] = "co-.."
litellm.drop_params = True # 👈 KEY CHANGE
response = litellm.completion(
model="command-r",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
response_format={"key": "value"},
)
```
LiteLLM maps all supported OpenAI params by provider + model (e.g. function calling is supported by Anthropic on Bedrock but not Titan).
See `litellm.get_supported_openai_params("command-r")` [**Code**](https://github.com/BerriAI/litellm/blob/main/litellm/utils.py#L3584)
If a provider/model doesn't support a particular param, you can drop it.
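For reference, a minimal sketch of checking which OpenAI params LiteLLM supports for a model before relying on `drop_params` (the printed list below is illustrative):
```python
import litellm

# params LiteLLM knows the provider/model supports
supported = litellm.get_supported_openai_params("command-r")
print(supported)  # e.g. ["stream", "temperature", "max_tokens", ...]

# with litellm.drop_params = True, anything outside this list is dropped
if "response_format" not in (supported or []):
    print("response_format would be dropped for command-r")
```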
## OpenAI Proxy Usage
```yaml
litellm_settings:
drop_params: true
```
## Pass drop_params in `completion(..)`
Just pass `drop_params` when calling specific models
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
# set keys
os.environ["COHERE_API_KEY"] = "co-.."
response = litellm.completion(
model="command-r",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
response_format={"key": "value"},
drop_params=True
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
- litellm_params:
api_base: my-base
model: openai/my-model
drop_params: true # 👈 KEY CHANGE
model_name: my-model
```
</TabItem>
</Tabs>
## Specify params to drop
To drop specific params when calling a provider (e.g. `logit_bias` for vllm), use `additional_drop_params`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
# set keys
os.environ["COHERE_API_KEY"] = "co-.."
response = litellm.completion(
model="command-r",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
response_format={"key": "value"},
additional_drop_params=["response_format"]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
- litellm_params:
api_base: my-base
model: openai/my-model
additional_drop_params: ["response_format"] # 👈 KEY CHANGE
model_name: my-model
```
</TabItem>
</Tabs>
**additional_drop_params**: List or null - A list of OpenAI params you want to drop when making a call to the model.

View file

@ -67,6 +67,10 @@ By default, LiteLLM raises an exception if the openai param being passed in isn'
To drop the param instead, set `litellm.drop_params = True` or `completion(..drop_params=True)`.
This **ONLY DROPS UNSUPPORTED OPENAI PARAMS**.
LiteLLM assumes any non-OpenAI param is provider-specific and passes it in as a kwarg in the request body.
:::
## Input Params
@ -162,7 +166,7 @@ def completion(
- `function`: *object* - Required.
- `tool_choice`: *string or object (optional)* - Controls which (if any) function is called by the model. none means the model will not call a function and instead generates a message. auto means the model can pick between generating a message or calling a function. Specifying a particular function via {"type: "function", "function": {"name": "my_function"}} forces the model to call that function.
- `tool_choice`: *string or object (optional)* - Controls which (if any) function is called by the model. none means the model will not call a function and instead generates a message. auto means the model can pick between generating a message or calling a function. Specifying a particular function via `{"type: "function", "function": {"name": "my_function"}}` forces the model to call that function.
- `none` is the default when no functions are present. `auto` is the default if functions are present.

View file

@ -1,90 +0,0 @@
import Image from '@theme/IdealImage';
import QueryParamReader from '../../src/components/queryParamReader.js'
# [Beta] Monitor Logs in Production
:::note
This is in beta. Expect frequent updates, as we improve based on your feedback.
:::
LiteLLM provides an integration to let you monitor logs in production.
👉 Jump to our sample LiteLLM Dashboard: https://admin.litellm.ai/
<Image img={require('../../img/alt_dashboard.png')} alt="Dashboard" />
## Debug your first logs
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_OpenAI.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
### 1. Get your LiteLLM Token
Go to [admin.litellm.ai](https://admin.litellm.ai/) and copy the code snippet with your unique token
<Image img={require('../../img/hosted_debugger_usage_page.png')} alt="Usage" />
### 2. Set up your environment
**Add it to your .env**
```python
import os
os.env["LITELLM_TOKEN"] = "e24c4c06-d027-4c30-9e78-18bc3a50aebb" # replace with your unique token
```
**Turn on LiteLLM Client**
```python
import litellm
litellm.client = True
```
### 3. Make a normal `completion()` call
```python
import litellm
from litellm import completion
import os
# set env variables
os.environ["LITELLM_TOKEN"] = "e24c4c06-d027-4c30-9e78-18bc3a50aebb" # replace with your unique token
os.environ["OPENAI_API_KEY"] = "openai key"
litellm.use_client = True # enable logging dashboard
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = completion(model="gpt-3.5-turbo", messages=messages)
```
Your `completion()` call will print a link to your session dashboard (https://admin.litellm.ai/<your_unique_token>)
In the above case it would be: [`admin.litellm.ai/e24c4c06-d027-4c30-9e78-18bc3a50aebb`](https://admin.litellm.ai/e24c4c06-d027-4c30-9e78-18bc3a50aebb)
Click on your personal dashboard link. Here's how you can find it 👇
<Image img={require('../../img/dash_output.png')} alt="Dashboard" />
[👋 Tell us if you need better privacy controls](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version?month=2023-08)
### 3. Review request log
Oh! Looks like our request was made successfully. Let's click on it and see exactly what got sent to the LLM provider.
Ah! So we can see that this request was made to a **Baseten** (see litellm_params > custom_llm_provider) for a model with ID - **7qQNLDB** (see model). The message sent was - `"Hey, how's it going?"` and the response received was - `"As an AI language model, I don't have feelings or emotions, but I can assist you with your queries. How can I assist you today?"`
<Image img={require('../../img/dashboard_log.png')} alt="Dashboard Log Row" />
:::info
🎉 Congratulations! You've successfully debugged your first log!
:::

View file

@ -2,6 +2,15 @@ import Image from '@theme/IdealImage';
# Athina
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[Athina](https://athina.ai/) is an evaluation framework and production monitoring platform for your LLM-powered app. Athina is designed to enhance the performance and reliability of AI applications through real-time monitoring, granular analytics, and plug-and-play evaluations.
<Image img={require('../../img/athina_dashboard.png')} />

View file

@ -1,5 +1,14 @@
# Greenscale - Track LLM Spend and Responsible Usage
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[Greenscale](https://greenscale.ai/) is a production monitoring platform for your LLM-powered app that provides you granular key insights into your GenAI spending and responsible usage. Greenscale only captures metadata to minimize the exposure risk of personally identifiable information (PII).
## Getting Started

View file

@ -1,4 +1,13 @@
# Helicone Tutorial
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[Helicone](https://helicone.ai/) is an open source observability platform that proxies your OpenAI traffic and provides you key insights into your spend, latency and usage.
## Use Helicone to log requests across all LLM Providers (OpenAI, Azure, Anthropic, Cohere, Replicate, PaLM)

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';
# Langfuse - Logging LLM Input/Output
# 🔥 Langfuse - Logging LLM Input/Output
LangFuse is open-source Observability & Analytics for LLM Apps
Detailed production traces and a granular view on quality, cost and latency
@ -122,10 +122,12 @@ response = completion(
metadata={
"generation_name": "ishaan-test-generation", # set langfuse Generation Name
"generation_id": "gen-id22", # set langfuse Generation ID
"parent_observation_id": "obs-id9" # set langfuse Parent Observation ID
"version": "test-generation-version" # set langfuse Generation Version
"trace_user_id": "user-id2", # set langfuse Trace User ID
"session_id": "session-1", # set langfuse Session ID
"tags": ["tag1", "tag2"], # set langfuse Tags
"trace_name": "new-trace-name" # set langfuse Trace Name
"trace_id": "trace-id22", # set langfuse Trace ID
"trace_metadata": {"key": "value"}, # set langfuse Trace Metadata
"trace_version": "test-trace-version", # set langfuse Trace Version (if not set, defaults to Generation Version)
@ -147,9 +149,10 @@ print(response)
You can also pass `metadata` as part of the request header with a `langfuse_*` prefix:
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'langfuse_trace_id: trace-id22' \
curl --location --request POST 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--header 'langfuse_trace_id: trace-id2' \
--header 'langfuse_trace_user_id: user-id2' \
--header 'langfuse_trace_metadata: {"key":"value"}' \
--data '{
@ -190,9 +193,10 @@ The following parameters can be updated on a continuation of a trace by passing
#### Generation Specific Parameters
* `generation_id` - Identifier for the generation, auto-generated by default
* `generation_name` - Identifier for the generation, auto-generated by default
* `prompt` - Langfuse prompt object used for the generation, defaults to None
* `generation_id` - Identifier for the generation, auto-generated by default
* `generation_name` - Identifier for the generation, auto-generated by default
* `parent_observation_id` - Identifier for the parent observation, defaults to `None`
* `prompt` - Langfuse prompt object used for the generation, defaults to `None`
Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
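For example, a minimal sketch of passing a custom (non-spec) metadata key - `my_custom_field` here is an illustrative name:
```python
import os
import litellm

os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["OPENAI_API_KEY"] = ""

litellm.success_callback = ["langfuse"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋"}],
    metadata={
        "generation_name": "test-generation",  # recognized Langfuse field
        "my_custom_field": "my-value",  # not in the spec above -> logged as generation metadata
    },
)
```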

View file

@ -1,6 +1,16 @@
import Image from '@theme/IdealImage';
# Langsmith - Logging LLM Input/Output
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
An all-in-one developer platform for every step of the application lifecycle
https://smith.langchain.com/

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';
# Logfire - Logging LLM Input/Output
# 🔥 Logfire - Logging LLM Input/Output
Logfire is open-source Observability & Analytics for LLM Apps
Detailed production traces and a granular view on quality, cost and latency
@ -14,10 +14,14 @@ join our [discord](https://discord.gg/wuPM9dRgDw)
## Pre-Requisites
Ensure you have run `pip install logfire` for this integration
Ensure you have installed the following packages to use this integration
```shell
pip install logfire litellm
pip install litellm
pip install opentelemetry-api==1.25.0
pip install opentelemetry-sdk==1.25.0
pip install opentelemetry-exporter-otlp==1.25.0
```
## Quick Start
@ -25,8 +29,7 @@ pip install logfire litellm
Get your Logfire token from [Logfire](https://logfire.pydantic.dev/)
```python
litellm.success_callback = ["logfire"]
litellm.failure_callback = ["logfire"] # logs errors to logfire
litellm.callbacks = ["logfire"]
```
```python

View file

@ -1,5 +1,13 @@
# Lunary - Logging and tracing LLM input/output
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[Lunary](https://lunary.ai/) is an open-source AI developer platform providing observability, prompt management, and evaluation tools for AI developers.
<video controls width='900' >

View file

@ -1,5 +1,16 @@
import Image from '@theme/IdealImage';
# Promptlayer Tutorial
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
Promptlayer is a platform for prompt engineers. Log OpenAI requests. Search usage history. Track performance. Visually manage prompt templates.
<Image img={require('../../img/promptlayer.png')} />

View file

@ -1,5 +1,14 @@
import Image from '@theme/IdealImage';
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
# Sentry - Log LLM Exceptions
[Sentry](https://sentry.io/) provides error monitoring for production. LiteLLM can add breadcrumbs and send exceptions to Sentry with this integration

View file

@ -1,4 +1,12 @@
# Supabase Tutorial
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[Supabase](https://supabase.com/) is an open source Firebase alternative.
Start your project with a Postgres database, Authentication, instant APIs, Edge Functions, Realtime subscriptions, Storage, and Vector embeddings.

View file

@ -1,6 +1,16 @@
import Image from '@theme/IdealImage';
# Weights & Biases - Logging LLM Input/Output
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
Weights & Biases helps AI developers build better models faster https://wandb.ai
<Image img={require('../../img/wandb.png')} />

View file

@ -4,6 +4,7 @@ import TabItem from '@theme/TabItem';
# Anthropic
LiteLLM supports
- `claude-3.5`
- `claude-3` (`claude-3-haiku-20240307`, `claude-3-opus-20240229`, `claude-3-sonnet-20240229`)
- `claude-2`
- `claude-2.1`
@ -171,6 +172,7 @@ print(response)
|------------------|--------------------------------------------|
| claude-3-haiku | `completion('claude-3-haiku-20240307', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-3-opus | `completion('claude-3-opus-20240229', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-3-5-sonnet | `completion('claude-3-5-sonnet-20240620', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-3-sonnet | `completion('claude-3-sonnet-20240229', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-2.1 | `completion('claude-2.1', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-2 | `completion('claude-2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |

View file

@ -68,6 +68,7 @@ response = litellm.completion(
| Model Name | Function Call |
|------------------|----------------------------------------|
| gpt-4o | `completion('azure/<your deployment name>', messages)` |
| gpt-4 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-0314 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-0613 | `completion('azure/<your deployment name>', messages)` |
@ -85,7 +86,8 @@ response = litellm.completion(
## Azure OpenAI Vision Models
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4-vision | `response = completion(model="azure/<your deployment name>", messages=messages)` |
| gpt-4-vision | `completion(model="azure/<your deployment name>", messages=messages)` |
| gpt-4o | `completion('azure/<your deployment name>', messages)` |
#### Usage
```python

View file

@ -623,6 +623,7 @@ Here's an example of using a bedrock model with LiteLLM
| Model Name | Command |
|----------------------------|------------------------------------------------------------------|
| Anthropic Claude-V3.5 Sonnet | `completion(model='bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 sonnet | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 Haiku | `completion(model='bedrock/anthropic.claude-3-haiku-20240307-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 Opus | `completion(model='bedrock/anthropic.claude-3-opus-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |

View file

@ -0,0 +1,255 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Codestral API [Mistral AI]
Codestral is available in select code-completion plugins but can also be queried directly. See the documentation for more details.
## API Key
```python
# env variable
os.environ['CODESTRAL_API_KEY']
```
## FIM / Completions
:::info
Official Mistral API Docs: https://docs.mistral.ai/api/#operation/createFIMCompletion
:::
<Tabs>
<TabItem value="no-streaming" label="No Streaming">
#### Sample Usage
```python
import os
import litellm
os.environ['CODESTRAL_API_KEY']
response = await litellm.atext_completion(
model="text-completion-codestral/codestral-2405",
prompt="def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
suffix="return True", # optional
temperature=0, # optional
top_p=1, # optional
max_tokens=10, # optional
min_tokens=10, # optional
seed=10, # optional
stop=["return"], # optional
)
```
#### Expected Response
```json
{
"id": "b41e0df599f94bc1a46ea9fcdbc2aabe",
"object": "text_completion",
"created": 1589478378,
"model": "codestral-latest",
"choices": [
{
"text": "\n assert is_odd(1)\n assert",
"index": 0,
"logprobs": null,
"finish_reason": "length"
}
],
"usage": {
"prompt_tokens": 5,
"completion_tokens": 7,
"total_tokens": 12
}
}
```
</TabItem>
<TabItem value="stream" label="Streaming">
#### Sample Usage - Streaming
```python
import os
import litellm
os.environ['CODESTRAL_API_KEY']
response = await litellm.atext_completion(
model="text-completion-codestral/codestral-2405",
prompt="def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
suffix="return True", # optional
temperature=0, # optional
top_p=1, # optional
stream=True,
seed=10, # optional
stop=["return"], # optional
)
async for chunk in response:
print(chunk)
```
#### Expected Response
```json
{
"id": "726025d3e2d645d09d475bb0d29e3640",
"object": "text_completion",
"created": 1718659669,
"choices": [
{
"text": "This",
"index": 0,
"logprobs": null,
"finish_reason": null
}
],
"model": "codestral-2405",
}
```
</TabItem>
</Tabs>
### Supported Models
All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
| Model Name | Function Call |
|----------------|--------------------------------------------------------------|
| Codestral Latest | `completion(model="text-completion-codestral/codestral-latest", messages)` |
| Codestral 2405 | `completion(model="text-completion-codestral/codestral-2405", messages)`|
## Chat Completions
:::info
Official Mistral API Docs: https://docs.mistral.ai/api/#operation/createChatCompletion
:::
<Tabs>
<TabItem value="no-streaming" label="No Streaming">
#### Sample Usage
```python
import os
import litellm
os.environ['CODESTRAL_API_KEY']
response = await litellm.acompletion(
model="codestral/codestral-latest",
messages=[
{
"role": "user",
"content": "Hey, how's it going?",
}
],
temperature=0.0, # optional
top_p=1, # optional
max_tokens=10, # optional
safe_prompt=False, # optional
seed=12, # optional
)
```
#### Expected Response
```json
{
"id": "chatcmpl-123",
"object": "chat.completion",
"created": 1677652288,
"model": "codestral/codestral-latest",
"system_fingerprint": None,
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": "\n\nHello there, how may I assist you today?",
},
"logprobs": null,
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": 9,
"completion_tokens": 12,
"total_tokens": 21
}
}
```
</TabItem>
<TabItem value="stream" label="Streaming">
#### Sample Usage - Streaming
```python
import os
import litellm
os.environ['CODESTRAL_API_KEY']
response = await litellm.acompletion(
model="codestral/codestral-latest",
messages=[
{
"role": "user",
"content": "Hey, how's it going?",
}
],
stream=True, # optional
temperature=0.0, # optional
top_p=1, # optional
max_tokens=10, # optional
safe_prompt=False, # optional
seed=12, # optional
)
async for chunk in response:
print(chunk)
```
#### Expected Response
```json
{
"id":"chatcmpl-123",
"object":"chat.completion.chunk",
"created":1694268190,
"model": "codestral/codestral-latest",
"system_fingerprint": None,
"choices":[
{
"index":0,
"delta":{"role":"assistant","content":"gm"},
"logprobs":null,
" finish_reason":null
}
]
}
```
</TabItem>
</Tabs>
### Supported Models
All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
| Model Name | Function Call |
|----------------|--------------------------------------------------------------|
| Codestral Latest | `completion(model="codestral/codestral-latest", messages)` |
| Codestral 2405 | `completion(model="codestral/codestral-2405", messages)`|

View file

@ -1,6 +1,13 @@
# DeepInfra
https://deepinfra.com/
:::tip
**We support ALL DeepInfra models, just set `model=deepinfra/<any-model-on-deepinfra>` as a prefix when sending litellm requests**
:::
## API Key
```python
# env variable
@ -38,13 +45,11 @@ for chunk in response:
## Chat Models
| Model Name | Function Call |
|------------------|--------------------------------------|
| meta-llama/Meta-Llama-3-8B-Instruct | `completion(model="deepinfra/meta-llama/Meta-Llama-3-8B-Instruct", messages)` |
| meta-llama/Meta-Llama-3-70B-Instruct | `completion(model="deepinfra/meta-llama/Meta-Llama-3-70B-Instruct", messages)` |
| meta-llama/Llama-2-70b-chat-hf | `completion(model="deepinfra/meta-llama/Llama-2-70b-chat-hf", messages)` |
| meta-llama/Llama-2-7b-chat-hf | `completion(model="deepinfra/meta-llama/Llama-2-7b-chat-hf", messages)` |
| meta-llama/Llama-2-13b-chat-hf | `completion(model="deepinfra/meta-llama/Llama-2-13b-chat-hf", messages)` |
| codellama/CodeLlama-34b-Instruct-hf | `completion(model="deepinfra/codellama/CodeLlama-34b-Instruct-hf", messages)` |
| mistralai/Mistral-7B-Instruct-v0.1 | `completion(model="deepinfra/mistralai/Mistral-7B-Instruct-v0.1", messages)` |
| jondurbin/airoboros-l2-70b-gpt4-1.4.1 | `completion(model="deepinfra/jondurbin/airoboros-l2-70b-gpt4-1.4.1", messages)` |

View file

@ -49,6 +49,6 @@ We support ALL Deepseek models, just set `deepseek/` as a prefix when sending co
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| deepseek-chat | `completion(model="deepseek/deepseek-chat", messages)` |
| deepseek-coder | `completion(model="deepseek/deepseek-chat", messages)` |
| deepseek-coder | `completion(model="deepseek/deepseek-coder", messages)` |

View file

@ -45,6 +45,52 @@ response = completion(
)
```
## Tool Calling
```python
from litellm import completion
import os
# set env
os.environ["GEMINI_API_KEY"] = ".."
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="gemini/gemini-1.5-flash",
messages=messages,
tools=tools,
)
# Add any assertions here to check the response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
```
# Gemini-Pro-Vision
LiteLLM Supports the following image types passed in `url`
- Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
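For example, a minimal sketch (prompt text is illustrative) of passing a direct image link using the OpenAI-style content format:
```python
import os
import litellm

os.environ["GEMINI_API_KEY"] = ".."

response = litellm.completion(
    model="gemini/gemini-pro-vision",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What landmark is in this picture?"},
                {
                    "type": "image_url",
                    "image_url": "https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg",
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)
```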

View file

@ -1,7 +1,11 @@
# Groq
https://groq.com/
**We support ALL Groq models, just set `groq/` as a prefix when sending completion requests**
:::tip
**We support ALL Groq models, just set `model=groq/<any-model-on-groq>` as a prefix when sending litellm requests**
:::
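For example, a minimal sketch of the prefix convention (the model name shown is one example of a Groq-hosted model):
```python
import os
from litellm import completion

os.environ["GROQ_API_KEY"] = ".."

# any Groq-hosted model works - just prefix its name with groq/
response = completion(
    model="groq/llama3-8b-8192",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response.choices[0].message.content)
```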
## API Key
```python

View file

@ -223,6 +223,17 @@ response = completion(
```
## OpenAI Fine Tuned Models
| Model Name | Function Call |
|---------------------------|-----------------------------------------------------------------|
| fine tuned `gpt-4-0613` | `response = completion(model="ft:gpt-4-0613", messages=messages)` |
| fine tuned `gpt-4o-2024-05-13` | `response = completion(model="ft:gpt-4o-2024-05-13", messages=messages)` |
| fine tuned `gpt-3.5-turbo-0125` | `response = completion(model="ft:gpt-3.5-turbo-0125", messages=messages)` |
| fine tuned `gpt-3.5-turbo-1106` | `response = completion(model="ft:gpt-3.5-turbo-1106", messages=messages)` |
| fine tuned `gpt-3.5-turbo-0613` | `response = completion(model="ft:gpt-3.5-turbo-0613", messages=messages)` |
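A minimal sketch - the fine-tuned model ID below is a placeholder; use the full `ft:...` ID returned by your fine-tuning job:
```python
import os
from litellm import completion

os.environ["OPENAI_API_KEY"] = "your-api-key"

# placeholder - replace with the ID from your fine-tuning job
fine_tuned_model = "ft:gpt-3.5-turbo-0125:my-org::abc123"

response = completion(
    model=fine_tuned_model,
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response.choices[0].message.content)
```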
## Advanced
### Parallel Function calling

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# OpenAI (Text Completion)
LiteLLM supports OpenAI text completion models

View file

@ -208,7 +208,7 @@ print(response)
Instead of using the `custom_llm_provider` arg to specify which provider you're using (e.g. together ai), you can just pass the provider name as part of the model name, and LiteLLM will parse it out.
Expected format: <custom_llm_provider>/<model_name>
Expected format: `<custom_llm_provider>/<model_name>`
e.g. completion(model="together_ai/togethercomputer/Llama-2-7B-32K-Instruct", ...)

View file

@ -8,6 +8,152 @@ import TabItem from '@theme/TabItem';
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
## 🆕 `vertex_ai_beta/` route
New `vertex_ai_beta/` route. Adds support for system messages, tool_choice params, etc., by moving to the httpx client (instead of the Vertex SDK).
```python
from litellm import completion
import json
## GET CREDENTIALS
file_path = 'path/to/vertex_ai_service_account.json'
# Load the JSON file
with open(file_path, 'r') as file:
vertex_credentials = json.load(file)
# Convert to JSON string
vertex_credentials_json = json.dumps(vertex_credentials)
## COMPLETION CALL
response = completion(
model="vertex_ai_beta/gemini-pro",
messages=[{ "content": "Hello, how are you?","role": "user"}],
vertex_credentials=vertex_credentials_json
)
```
### **System Message**
```python
from litellm import completion
import json
## GET CREDENTIALS
file_path = 'path/to/vertex_ai_service_account.json'
# Load the JSON file
with open(file_path, 'r') as file:
vertex_credentials = json.load(file)
# Convert to JSON string
vertex_credentials_json = json.dumps(vertex_credentials)
response = completion(
model="vertex_ai_beta/gemini-pro",
messages=[{"content": "You are a good bot.","role": "system"}, {"content": "Hello, how are you?","role": "user"}],
vertex_credentials=vertex_credentials_json
)
```
### **Function Calling**
Force Gemini to make tool calls with `tool_choice="required"`.
```python
from litellm import completion
import json
## GET CREDENTIALS
file_path = 'path/to/vertex_ai_service_account.json'
# Load the JSON file
with open(file_path, 'r') as file:
vertex_credentials = json.load(file)
# Convert to JSON string
vertex_credentials_json = json.dumps(vertex_credentials)
messages = [
{
"role": "system",
"content": "Your name is Litellm Bot, you are a helpful assistant",
},
# User asks for their name and weather in San Francisco
{
"role": "user",
"content": "Hello, what is your name and can you tell me the weather?",
},
]
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
}
},
"required": ["location"],
},
},
}
]
data = {
"model": "vertex_ai_beta/gemini-1.5-pro-preview-0514"),
"messages": messages,
"tools": tools,
"tool_choice": "required",
"vertex_credentials": vertex_credentials_json
}
## COMPLETION CALL
print(completion(**data))
```
### **JSON Schema**
```python
from litellm import completion
import json
## GET CREDENTIALS
file_path = 'path/to/vertex_ai_service_account.json'
# Load the JSON file
with open(file_path, 'r') as file:
vertex_credentials = json.load(file)
# Convert to JSON string
vertex_credentials_json = json.dumps(vertex_credentials)
messages = [
{
"role": "user",
"content": """
List 5 popular cookie recipes.
Using this JSON schema:
Recipe = {"recipe_name": str}
Return a `list[Recipe]`
"""
}
]
completion(model="vertex_ai_beta/gemini-1.5-flash-preview-0514", messages=messages, response_format={ "type": "json_object" })
```
## Pre-requisites
* `pip install google-cloud-aiplatform` (pre-installed on proxy docker image)
* Authentication:
@ -140,7 +286,7 @@ In certain use-cases you may need to make calls to the models and pass [safety s
```python
response = completion(
model="gemini/gemini-pro",
model="vertex_ai/gemini-pro",
messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}]
safety_settings=[
{
@ -254,6 +400,7 @@ litellm.vertex_location = "us-central1 # Your Location
| Model Name | Function Call |
|------------------|--------------------------------------|
| claude-3-opus@20240229 | `completion('vertex_ai/claude-3-opus@20240229', messages)` |
| claude-3-5-sonnet@20240620 | `completion('vertex_ai/claude-3-5-sonnet@20240620', messages)` |
| claude-3-sonnet@20240229 | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
| claude-3-haiku@20240307 | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |
@ -363,8 +510,8 @@ response = completion(
## Gemini 1.5 Pro (and Vision)
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-1.5-pro | `completion('gemini-1.5-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
| gemini-1.5-flash-preview-0514 | `completion('gemini-1.5-flash-preview-0514', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
| gemini-1.5-pro | `completion('gemini-1.5-pro', messages)`, `completion('vertex_ai/gemini-1.5-pro', messages)` |
| gemini-1.5-flash-preview-0514 | `completion('gemini-1.5-flash-preview-0514', messages)`, `completion('vertex_ai/gemini-1.5-flash-preview-0514', messages)` |
| gemini-1.5-pro-preview-0514 | `completion('gemini-1.5-pro-preview-0514', messages)`, `completion('vertex_ai/gemini-1.5-pro-preview-0514', messages)` |
@ -680,6 +827,3 @@ s/o @[Darien Kindlund](https://www.linkedin.com/in/kindlund/) for this tutorial

View file

@ -1,3 +1,5 @@
import Image from '@theme/IdealImage';
# 🚨 Alerting / Webhooks
Get alerts for:
@ -15,6 +17,11 @@ Get alerts for:
- **Spend** Weekly & Monthly spend per Team, Tag
Works across:
- [Slack](#quick-start)
- [Discord](#advanced---using-discord-webhooks)
- [Microsoft Teams](#advanced---using-ms-teams-webhooks)
## Quick Start
Set up a slack alert channel to receive alerts from proxy.
@ -25,41 +32,33 @@ Get a slack webhook url from https://api.slack.com/messaging/webhooks
You can also use Discord Webhooks, see [here](#using-discord-webhooks)
### Step 2: Update config.yaml
- Set `SLACK_WEBHOOK_URL` in your proxy env to enable Slack alerts.
- Just for testing purposes, let's save a bad key to our proxy.
Set `SLACK_WEBHOOK_URL` in your proxy env to enable Slack alerts.
```bash
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/<>/<>/<>"
```
### Step 2: Setup Proxy
```yaml
model_list:
model_name: "azure-model"
litellm_params:
model: "azure/gpt-35-turbo"
api_key: "my-bad-key" # 👈 bad key
general_settings:
alerting: ["slack"]
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
environment_variables:
SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"
SLACK_DAILY_REPORT_FREQUENCY: "86400" # 24 hours; Optional: defaults to 12 hours
```
### Step 3: Start proxy
Start proxy
```bash
$ litellm --config /path/to/config.yaml
```
## Testing Alerting is Setup Correctly
Make a GET request to `/health/services`, expect to see a test slack alert in your provided webhook slack channel
### Step 3: Test it!
```shell
curl -X GET 'http://localhost:4000/health/services?service=slack' \
-H 'Authorization: Bearer sk-1234'
```bash
curl -X GET 'http://0.0.0.0:4000/health/services?service=slack' \
-H 'Authorization: Bearer sk-1234'
```
## Advanced - Redacting Messages from Alerts
@ -77,7 +76,34 @@ litellm_settings:
```
## Advanced - Add Metadata to alerts
Add alerting metadata to proxy calls for debugging.
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [],
extra_body={
"metadata": {
"alerting_metadata": {
"hello": "world"
}
}
}
)
```
**Expected Response**
<Image img={require('../../img/alerting_metadata.png')}/>
## Advanced - Opting into specific alert types
@ -108,6 +134,48 @@ AlertType = Literal[
```
## Advanced - Using MS Teams Webhooks
MS Teams provides a Slack-compatible webhook URL that you can use for alerting
##### Quick Start
1. [Get a webhook url](https://learn.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook?tabs=newteams%2Cdotnet#create-an-incoming-webhook) for your Microsoft Teams channel
2. Add it to your .env
```bash
SLACK_WEBHOOK_URL="https://berriai.webhook.office.com/webhookb2/...6901/IncomingWebhook/b55fa0c2a48647be8e6effedcd540266/e04b1092-4a3e-44a2-ab6b-29a0a4854d1d"
```
3. Add it to your litellm config
```yaml
model_list:
model_name: "azure-model"
litellm_params:
model: "azure/gpt-35-turbo"
api_key: "my-bad-key" # 👈 bad key
general_settings:
alerting: ["slack"]
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
```
4. Run health check!
Call the proxy `/health/services` endpoint to test if your alerting connection is correctly setup.
```bash
curl --location 'http://0.0.0.0:4000/health/services?service=slack' \
--header 'Authorization: Bearer sk-1234'
```
**Expected Response**
<Image img={require('../../img/ms_teams_alerting.png')}/>
## Advanced - Using Discord Webhooks
Discord provides a Slack-compatible webhook URL that you can use for alerting
@ -139,7 +207,6 @@ environment_variables:
SLACK_WEBHOOK_URL: "https://discord.com/api/webhooks/1240030362193760286/cTLWt5ATn1gKmcy_982rl5xmYHsrM1IWJdmCL1AyOmU9JdQXazrp8L1_PYgUtgxj8x4f/slack"
```
That's it! You're ready to go!
## Advanced - [BETA] Webhooks for Budget Alerts

View file

@ -252,6 +252,31 @@ $ litellm --config /path/to/config.yaml
```
## Multiple OpenAI Organizations
Add all openai models across all OpenAI organizations with just 1 model definition
```yaml
- model_name: "*"
litellm_params:
model: openai/*
api_key: os.environ/OPENAI_API_KEY
organization:
- org-1
- org-2
- org-3
```
LiteLLM will automatically create separate deployments for each org.
Confirm this via
```bash
curl --location 'http://0.0.0.0:4000/v1/model/info' \
--header 'Authorization: Bearer ${LITELLM_KEY}' \
--data ''
```
## Load Balancing
:::info

View file

@ -27,7 +27,7 @@ docker-compose up
<Tabs>
<TabItem value="basic" label="Basic">
<TabItem value="basic" label="Basic (No DB)">
### Step 1. CREATE config.yaml
@ -98,7 +98,13 @@ docker run ghcr.io/berriai/litellm:main-latest --port 8002 --num_workers 8
```
</TabItem>
<TabItem value="terraform" label="Terraform">
s/o [Nicholas Cecere](https://www.linkedin.com/in/nicholas-cecere-24243549/) for his LiteLLM User Management Terraform
👉 [Go here for Terraform](https://github.com/ncecere/terraform-litellm-user-mgmt)
</TabItem>
<TabItem value="base-image" label="use litellm as a base image">
```shell
@ -380,6 +386,7 @@ kubectl port-forward service/litellm-service 4000:4000
Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
</TabItem>
<TabItem value="helm-deploy" label="Helm">
@ -425,7 +432,6 @@ If you need to set your litellm proxy config.yaml, you can find this in [values.
</TabItem>
<TabItem value="helm-oci" label="Helm OCI Registry (GHCR)">
:::info
@ -669,7 +675,7 @@ Once the stack is created, get the DatabaseURL of the Database resource, copy th
#### 3. Connect to the EC2 Instance and deploy litellm on the EC2 container
From the EC2 console, connect to the instance created by the stack (e.g., using SSH).
Run the following command, replacing <database_url> with the value you copied in step 2
Run the following command, replacing `<database_url>` with the value you copied in step 2
```shell
docker run --name litellm-proxy \

View file

@ -5,6 +5,7 @@ import Image from '@theme/IdealImage';
Send an Email to your users when:
- A Proxy API Key is created for them
- Their API Key crosses its Budget
- All Team members of a LiteLLM Team -> when the team crosses its budget
<Image img={require('../../img/email_notifs.png')} style={{ width: '500px' }}/>

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Model Management
Add new models + Get model info without restarting proxy.

View file

@ -1,3 +1,5 @@
import Image from '@theme/IdealImage';
# LiteLLM Proxy Performance
### Throughput - 30% Increase

View file

@ -1,4 +1,4 @@
# Grafana, Prometheus metrics [BETA]
# 📈 Prometheus metrics [BETA]
LiteLLM exposes a `/metrics` endpoint for Prometheus to poll
@ -54,6 +54,13 @@ http://localhost:4000/metrics
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
| `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
### Budget Metrics
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_remaining_team_budget_metric` | Remaining Budget for Team (A team created on LiteLLM) |
| `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)|
## Monitor System Health
To monitor the health of litellm adjacent services (redis / postgres), do:

View file

@ -409,6 +409,28 @@ print(response)
</Tabs>
### Content Policy Fallbacks
Fallback across providers (e.g. from Azure OpenAI to Anthropic) if you hit content policy violation errors.
```yaml
model_list:
- model_name: gpt-3.5-turbo-small
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
- model_name: claude-opus
litellm_params:
model: claude-3-opus-20240229
api_key: os.environ/ANTHROPIC_API_KEY
litellm_settings:
content_policy_fallbacks: [{"gpt-3.5-turbo-small": ["claude-opus"]}]
```
### EU-Region Filtering (Pre-Call Checks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.

View file

@ -123,4 +123,18 @@ LiteLLM Enterprise: Enable [SSO login](./ui.md#setup-ssoauth-for-ui)
4. User can now create their own keys
<Image img={require('../../img/ui_self_serve_create_key.png')} style={{ width: '800px', height: 'auto' }} />
## Advanced
### Setting custom logout URLs
Set `PROXY_LOGOUT_URL` in your .env if you want users to get redirected to a specific URL when they click logout
```
export PROXY_LOGOUT_URL="https://www.google.com"
```
<Image img={require('../../img/ui_logout.png')} style={{ width: '400px', height: 'auto' }} />

View file

@ -0,0 +1,154 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 💰 Setting Team Budgets
Track spend, set budgets for your Internal Team
## Setting Monthly Team Budgets
### 1. Create a team
- Set `max_budget=0.000000001` (the $ value the team is allowed to spend)
- Set `budget_duration="1d"` (How frequently the budget should update)
<Tabs>
<TabItem value="API" label="API">
Create a new team and set `max_budget` and `budget_duration`
```shell
curl -X POST 'http://0.0.0.0:4000/team/new' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"team_alias": "QA Prod Bot",
"max_budget": 0.000000001,
"budget_duration": "1d"
}'
```
Response
```shell
{
"team_alias": "QA Prod Bot",
"team_id": "de35b29e-6ca8-4f47-b804-2b79d07aa99a",
"max_budget": 0.0001,
"budget_duration": "1d",
"budget_reset_at": "2024-06-14T22:48:36.594000Z"
}
```
</TabItem>
<TabItem value="UI" label="Admin UI">
<Image img={require('../../img/create_team_gif_good.gif')} />
</TabItem>
</Tabs>
Possible values for `budget_duration`
| `budget_duration` | When Budget will reset |
| --- | --- |
| `budget_duration="1s"` | every 1 second |
| `budget_duration="1m"` | every 1 min |
| `budget_duration="1h"` | every 1 hour |
| `budget_duration="1d"` | every 1 day |
| `budget_duration="1mo"` | every 1 month |
### 2. Create a key for the `team`
Create a key for Team=`QA Prod Bot` and `team_id="de35b29e-6ca8-4f47-b804-2b79d07aa99a"` from Step 1
<Tabs>
<TabItem value="api" label="API">
💡 **The budget for Team="QA Prod Bot" will apply to this key**
```shell
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "de35b29e-6ca8-4f47-b804-2b79d07aa99a"}'
```
Response
```shell
{"team_id":"de35b29e-6ca8-4f47-b804-2b79d07aa99a", "key":"sk-5qtncoYjzRcxMM4bDRktNQ"}
```
</TabItem>
<TabItem value="UI" label="Admin UI">
<Image img={require('../../img/create_key_in_team.gif')} />
</TabItem>
</Tabs>
### 3. Test It
Use the key from step 2 and run this Request twice
<Tabs>
<TabItem value="api" label="API">
```shell
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Authorization: Bearer sk-mso-JSykEGri86KyOvgxBw' \
-H 'Content-Type: application/json' \
-d ' {
"model": "llama3",
"messages": [
{
"role": "user",
"content": "hi"
}
]
}'
```
On the 2nd response - expect to see the following exception
```shell
{
"error": {
"message": "Budget has been exceeded! Current cost: 3.5e-06, Max budget: 1e-09",
"type": "auth_error",
"param": null,
"code": 400
}
}
```
</TabItem>
<TabItem value="UI" label="Admin UI">
<Image img={require('../../img/test_key_budget.gif')} />
</TabItem>
</Tabs>
## Advanced
### Prometheus metrics for `remaining_budget`
[More info about Prometheus metrics here](https://docs.litellm.ai/docs/proxy/prometheus)
You'll need the following in your proxy config.yaml
```yaml
litellm_settings:
success_callback: ["prometheus"]
failure_callback: ["prometheus"]
```
Expect to see this metric on prometheus to track the Remaining Budget for the team
```shell
litellm_remaining_team_budget_metric{team_alias="QA Prod Bot",team_id="de35b29e-6ca8-4f47-b804-2b79d07aa99a"} 9.699999999999992e-06
```

View file

@ -62,6 +62,14 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
You can:
- Add budgets to Teams
:::info
**Step-by step tutorial on setting, resetting budgets on Teams here (API or using Admin UI)**
👉 [https://docs.litellm.ai/docs/proxy/team_budgets](https://docs.litellm.ai/docs/proxy/team_budgets)
:::
#### **Add budgets to teams**
```shell
@ -413,6 +421,63 @@ curl 'http://0.0.0.0:4000/key/generate' \
</TabItem>
</Tabs>
### Reset Budgets
Reset budgets across keys/internal users/teams/customers
`budget_duration`: Budget is reset at the end of the specified duration. If not set, the budget is never reset. You can set the duration in seconds ("30s"), minutes ("30m"), hours ("30h"), or days ("30d").
<Tabs>
<TabItem value="users" label="Internal Users">
```bash
curl 'http://0.0.0.0:4000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"max_budget": 10,
"budget_duration": 10s, # 👈 KEY CHANGE
}'
```
</TabItem>
<TabItem value="keys" label="Keys">
```bash
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"max_budget": 10,
"budget_duration": 10s, # 👈 KEY CHANGE
}'
```
</TabItem>
<TabItem value="teams" label="Teams">
```bash
curl 'http://0.0.0.0:4000/team/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"max_budget": 10,
"budget_duration": 10s, # 👈 KEY CHANGE
}'
```
</TabItem>
</Tabs>
**Note:** By default, the server checks for resets every 10 minutes, to minimize DB calls.
To change this, set `proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time`
E.g.: check every 1 second
```yaml
general_settings:
proxy_budget_rescheduler_min_time: 1
proxy_budget_rescheduler_max_time: 1
```
## Set Rate Limits
You can set:

View file

@ -95,7 +95,7 @@ print(response)
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
- `router.aimage_generation()` - async image generation calls
## Advanced - Routing Strategies
## Advanced - Routing Strategies ⭐️
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based
Router provides 4 strategies for routing your calls across multiple deployments:
@ -262,7 +262,7 @@ if response is not None:
)
```
### Set Time Window
#### Set Time Window
Set time window for how far back to consider when averaging latency for a deployment.
@ -278,7 +278,7 @@ router_settings:
routing_strategy_args: {"ttl": 10}
```
### Set Lowest Latency Buffer
#### Set Lowest Latency Buffer
Set a buffer within which deployments are candidates for making calls to.
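A minimal Router sketch combining the time window and buffer settings (assuming a single `OPENAI_API_KEY` deployment; the `ttl` and `lowest_latency_buffer` values are illustrative):
```python
import os
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {
                "model": "gpt-3.5-turbo",
                "api_key": os.getenv("OPENAI_API_KEY"),
            },
        },
    ],
    routing_strategy="latency-based-routing",
    routing_strategy_args={
        "ttl": 10,  # only average latency over the last 10s
        "lowest_latency_buffer": 0.5,  # deployments within 50% of the lowest latency are candidates
    },
)
```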
@ -468,6 +468,122 @@ asyncio.run(router_acompletion())
```
</TabItem>
<TabItem value="custom" label="Custom Routing Strategy">
**Plugin a custom routing strategy to select deployments**
Step 1. Define your custom routing strategy
```python
from litellm.router import CustomRoutingStrategyBase
class CustomRoutingStrategy(CustomRoutingStrategyBase):
async def async_get_available_deployment(
self,
model: str,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
specific_deployment: Optional[bool] = False,
request_kwargs: Optional[Dict] = None,
):
"""
Asynchronously retrieves the available deployment based on the given parameters.
Args:
model (str): The name of the model.
messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.
input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.
specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.
request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.
Returns:
Returns an element from litellm.router.model_list
"""
print("In CUSTOM async get available deployment")
model_list = router.model_list
print("router model list=", model_list)
for model in model_list:
if isinstance(model, dict):
if model["litellm_params"]["model"] == "openai/very-special-endpoint":
return model
pass
def get_available_deployment(
self,
model: str,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
specific_deployment: Optional[bool] = False,
request_kwargs: Optional[Dict] = None,
):
"""
Synchronously retrieves the available deployment based on the given parameters.
Args:
model (str): The name of the model.
messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.
input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.
specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.
request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.
Returns:
Returns an element from litellm.router.model_list
"""
pass
```
Step 2. Initialize Router with custom routing strategy
```python
from litellm import Router
router = Router(
model_list=[
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/very-special-endpoint",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/", # If you are Krrish, this is OpenAI Endpoint3 on our Railway endpoint :)
"api_key": "fake-key",
},
"model_info": {"id": "very-special-endpoint"},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/fast-endpoint",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"api_key": "fake-key",
},
"model_info": {"id": "fast-endpoint"},
},
],
set_verbose=True,
debug_level="DEBUG",
timeout=1,
) # type: ignore
router.set_custom_routing_strategy(CustomRoutingStrategy()) # 👈 Set your routing strategy here
```
Step 3. Test your routing strategy. Expect your custom routing strategy to be called when running `router.acompletion` requests
```python
for _ in range(10):
response = await router.acompletion(
model="azure-model", messages=[{"role": "user", "content": "hello"}]
)
print(response)
_picked_model_id = response._hidden_params["model_id"]
print("picked model=", _picked_model_id)
```
</TabItem>
<TabItem value="lowest-cost" label="Lowest Cost Routing (Async)">
Picks a deployment based on the lowest cost
@ -563,7 +679,6 @@ asyncio.run(router_acompletion())
```
</TabItem>
</Tabs>
## Basic Reliability
@ -790,85 +905,205 @@ If the error is a context window exceeded error, fall back to a larger model gro
Fallbacks are done in order - `["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"]` will do 'gpt-3.5-turbo' first, then 'gpt-4', etc.
You can also set 'default_fallbacks', in case a specific model group is misconfigured / bad.
You can also set `default_fallbacks`, in case a specific model group is misconfigured / bad.
There are 3 types of fallbacks:
- `content_policy_fallbacks`: For litellm.ContentPolicyViolationError - LiteLLM maps content policy violation errors across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8495C27-L8495C54)
- `context_window_fallbacks`: For litellm.ContextWindowExceededErrors - LiteLLM maps context window error messages across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8469)
- `fallbacks`: For all remaining errors - e.g. litellm.RateLimitError
**Content Policy Violation Fallback**
Key change:
```python
from litellm import Router
model_list = [
{ # list of model deployments
"model_name": "azure/gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": "bad-key",
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 240000,
"rpm": 1800
},
{ # list of model deployments
"model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": "bad-key",
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 240000,
"rpm": 1800
},
{
"model_name": "azure/gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": "bad-key",
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 240000,
"rpm": 1800
},
{
"model_name": "gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
"tpm": 1000000,
"rpm": 9000
},
{
"model_name": "gpt-3.5-turbo-16k", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo-16k",
"api_key": os.getenv("OPENAI_API_KEY"),
},
"tpm": 1000000,
"rpm": 9000
}
]
router = Router(model_list=model_list,
fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}],
default_fallbacks=["gpt-3.5-turbo-16k"],
context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
set_verbose=True)
user_message = "Hello, whats the weather in San Francisco??"
messages = [{"content": user_message, "role": "user"}]
# normal fallback call
response = router.completion(model="azure/gpt-3.5-turbo", messages=messages)
# context window fallback call
response = router.completion(model="azure/gpt-3.5-turbo-context-fallback", messages=messages)
print(f"response: {response}")
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}]
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import Router
router = Router(
model_list=[
{
"model_name": "claude-2",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": Exception("content filtering policy"),
},
},
{
"model_name": "my-fallback-model",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": "This works!",
},
},
],
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
# fallbacks=[..], # [OPTIONAL]
# context_window_fallbacks=[..], # [OPTIONAL]
)
response = router.completion(
model="claude-2",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
In your proxy config.yaml just add this line 👇
```yaml
router_settings:
  content_policy_fallbacks: [{"claude-2": ["my-fallback-model"]}]
```
Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
</TabItem>
</Tabs>
**Context Window Exceeded Fallback**
Key change:
```python
context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}]
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import Router
router = Router(
model_list=[
{
"model_name": "claude-2",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": Exception("prompt is too long"),
},
},
{
"model_name": "my-fallback-model",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": "This works!",
},
},
],
context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
# fallbacks=[..], # [OPTIONAL]
# content_policy_fallbacks=[..], # [OPTIONAL]
)
response = router.completion(
model="claude-2",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
In your proxy config.yaml just add this line 👇
```yaml
router_settings:
  context_window_fallbacks: [{"claude-2": ["my-fallback-model"]}]
```
Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
</TabItem>
</Tabs>
**Regular Fallbacks**
Key change:
```python
fallbacks=[{"claude-2": ["my-fallback-model"]}]
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import Router
router = Router(
model_list=[
{
"model_name": "claude-2",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": Exception("this is a rate limit error"),
},
},
{
"model_name": "my-fallback-model",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": "This works!",
},
},
],
fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
# context_window_fallbacks=[..], # [OPTIONAL]
# content_policy_fallbacks=[..], # [OPTIONAL]
)
response = router.completion(
model="claude-2",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
In your proxy config.yaml just add this line 👇
```yaml
router_settings:
  fallbacks: [{"claude-2": ["my-fallback-model"]}]
```
Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
</TabItem>
</Tabs>
### Caching
In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching.
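A minimal sketch of both setups (assuming the Router accepts `redis_host` / `redis_port` / `redis_password` for the Redis connection and `cache_responses=True` to enable response caching):
```python
import os
from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": os.getenv("OPENAI_API_KEY"),
        },
    }
]

# Production: Redis cache, shared across router instances / workers
router = Router(
    model_list=model_list,
    redis_host=os.getenv("REDIS_HOST"),
    redis_port=os.getenv("REDIS_PORT"),
    redis_password=os.getenv("REDIS_PASSWORD"),
    cache_responses=True,
)

# Local testing: in-memory cache, scoped to the current process
router = Router(model_list=model_list, cache_responses=True)
```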

View file

@ -23,9 +23,13 @@ https://api.together.xyz/playground/chat?model=togethercomputer%2Fllama-2-70b-ch
model_name = "together_ai/togethercomputer/llama-2-70b-chat"
response = completion(model=model_name, messages=messages)
print(response)
```
```
{'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': "\n\nI'm not able to provide real-time weather information. However, I can suggest"}}], 'created': 1691629657.9288375, 'model': 'togethercomputer/llama-2-70b-chat', 'usage': {'prompt_tokens': 9, 'completion_tokens': 17, 'total_tokens': 26}}
```
LiteLLM handles the prompt formatting for Together AI's Llama2 models as well, converting your message to the

View file

@ -38,9 +38,6 @@ const config = {
disableInDev: false,
},
],
[ require.resolve('docusaurus-lunr-search'), {
languages: ['en'] // language codes
}],
() => ({
name: 'cripchat',
injectHtmlTags() {
@ -90,6 +87,15 @@ const config = {
({
// Replace with your project's social card
image: 'img/docusaurus-social-card.png',
algolia: {
// The application ID provided by Algolia
appId: 'NU85Y4NU0B',
// Public API key: it is safe to commit it
apiKey: '4e0cf8c3020d0c876ad9174cea5c01fb',
indexName: 'litellm',
},
navbar: {
title: '🚅 LiteLLM',
items: [
@ -138,8 +144,8 @@ const config = {
title: 'Docs',
items: [
{
label: 'Tutorial',
to: '/docs/index',
label: 'Getting Started',
to: 'https://docs.litellm.ai/docs/',
},
],
},

Binary image files not shown (6 images added): 207 KiB, 3.2 MiB, 2.7 MiB, 241 KiB, 1.2 MiB, 27 KiB.

File diff suppressed because it is too large.

View file

@ -23,8 +23,8 @@
"docusaurus": "^1.14.7",
"docusaurus-lunr-search": "^2.4.1",
"prism-react-renderer": "^1.3.5",
"react": "^17.0.2",
"react-dom": "^17.0.2",
"react": "^18.1.0",
"react-dom": "^18.1.0",
"sharp": "^0.32.6",
"uuid": "^9.0.1"
},

View file

@ -43,6 +43,7 @@ const sidebars = {
"proxy/cost_tracking",
"proxy/self_serve",
"proxy/users",
"proxy/team_budgets",
"proxy/customers",
"proxy/billing",
"proxy/user_keys",
@ -54,6 +55,7 @@ const sidebars = {
items: ["proxy/logging", "proxy/streaming_logging"],
},
"proxy/ui",
"proxy/prometheus",
"proxy/email",
"proxy/multiple_admins",
"proxy/team_based_routing",
@ -70,7 +72,6 @@ const sidebars = {
"proxy/pii_masking",
"proxy/prompt_injection",
"proxy/caching",
"proxy/prometheus",
"proxy/call_hooks",
"proxy/rules",
"proxy/cli",
@ -87,6 +88,7 @@ const sidebars = {
},
items: [
"completion/input",
"completion/drop_params",
"completion/prompt_formatting",
"completion/output",
"exception_mapping",
@ -133,10 +135,11 @@ const sidebars = {
"providers/vertex",
"providers/palm",
"providers/gemini",
"providers/mistral",
"providers/anthropic",
"providers/aws_sagemaker",
"providers/bedrock",
"providers/mistral",
"providers/codestral",
"providers/cohere",
"providers/anyscale",
"providers/huggingface",
@ -170,10 +173,8 @@ const sidebars = {
"proxy/custom_pricing",
"routing",
"scheduler",
"rules",
"set_keys",
"budget_manager",
"contributing",
"secret",
"completion/token_usage",
"load_test",
@ -181,11 +182,11 @@ const sidebars = {
type: "category",
label: "Logging & Observability",
items: [
"observability/langfuse_integration",
"observability/logfire_integration",
"debugging/local_debugging",
"observability/raw_request_response",
"observability/callbacks",
"observability/custom_callback",
"observability/langfuse_integration",
"observability/sentry",
"observability/lago",
"observability/openmeter",
@ -223,14 +224,16 @@ const sidebars = {
},
{
type: "category",
label: "LangChain, LlamaIndex Integration",
items: ["langchain/langchain"],
label: "LangChain, LlamaIndex, Instructor Integration",
items: ["langchain/langchain", "tutorials/instructor"],
},
{
type: "category",
label: "Extras",
items: [
"extras/contributing",
"contributing",
"rules",
"proxy_server",
{
type: "category",

File diff suppressed because it is too large.

View file

@ -93,7 +93,7 @@ class _ENTERPRISE_BannedKeywords(CustomLogger):
response.choices[0], litellm.utils.Choices
):
for word in self.banned_keywords_list:
self.test_violation(test_str=response.choices[0].message.content)
self.test_violation(test_str=response.choices[0].message.content or "")
async def async_post_call_streaming_hook(
self,

View file

@ -122,236 +122,6 @@ async def ui_get_spend_by_tags(
return {"spend_per_tag": ui_tags}
async def view_spend_logs_from_clickhouse(
api_key=None, user_id=None, request_id=None, start_date=None, end_date=None
):
verbose_logger.debug("Reading logs from Clickhouse")
import os
# if user has setup clickhouse
# TODO: Move this to be a helper function
# querying clickhouse for this data
import clickhouse_connect
from datetime import datetime
port = os.getenv("CLICKHOUSE_PORT")
if port is not None and isinstance(port, str):
port = int(port)
client = clickhouse_connect.get_client(
host=os.getenv("CLICKHOUSE_HOST"),
port=port,
username=os.getenv("CLICKHOUSE_USERNAME", ""),
password=os.getenv("CLICKHOUSE_PASSWORD", ""),
)
if (
start_date is not None
and isinstance(start_date, str)
and end_date is not None
and isinstance(end_date, str)
):
# Convert the date strings to datetime objects
start_date_obj = datetime.strptime(start_date, "%Y-%m-%d")
end_date_obj = datetime.strptime(end_date, "%Y-%m-%d")
# get top spend per day
response = client.query(
f"""
SELECT
toDate(startTime) AS day,
sum(spend) AS total_spend
FROM
spend_logs
WHERE
toDate(startTime) BETWEEN toDate('2024-02-01') AND toDate('2024-02-29')
GROUP BY
day
ORDER BY
total_spend
"""
)
results = []
result_rows = list(response.result_rows)
for response in result_rows:
current_row = {}
current_row["users"] = {"example": 0.0}
current_row["models"] = {}
current_row["spend"] = float(response[1])
current_row["startTime"] = str(response[0])
# stubbed api_key
current_row[""] = 0.0 # type: ignore
results.append(current_row)
return results
else:
# check if spend logs exist, if it does then return last 10 logs, sorted in descending order of startTime
response = client.query(
"""
SELECT
*
FROM
default.spend_logs
ORDER BY
startTime DESC
LIMIT
10
"""
)
# get size of spend logs
num_rows = client.query("SELECT count(*) FROM default.spend_logs")
num_rows = num_rows.result_rows[0][0]
# safely access num_rows.result_rows[0][0]
if num_rows is None:
num_rows = 0
raw_rows = list(response.result_rows)
response_data = {
"logs": raw_rows,
"log_count": num_rows,
}
return response_data
def _create_clickhouse_material_views(client=None, table_names=[]):
# Create Materialized Views if they don't exist
# Materialized Views send new inserted rows to the aggregate tables
verbose_logger.debug("Clickhouse: Creating Materialized Views")
if "daily_aggregated_spend_per_model_mv" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_model_mv")
client.command(
"""
CREATE MATERIALIZED VIEW daily_aggregated_spend_per_model_mv
TO daily_aggregated_spend_per_model
AS
SELECT
toDate(startTime) as day,
sumState(spend) AS DailySpend,
model as model
FROM spend_logs
GROUP BY
day, model
"""
)
if "daily_aggregated_spend_per_api_key_mv" not in table_names:
verbose_logger.debug(
"Clickhouse: Creating daily_aggregated_spend_per_api_key_mv"
)
client.command(
"""
CREATE MATERIALIZED VIEW daily_aggregated_spend_per_api_key_mv
TO daily_aggregated_spend_per_api_key
AS
SELECT
toDate(startTime) as day,
sumState(spend) AS DailySpend,
api_key as api_key
FROM spend_logs
GROUP BY
day, api_key
"""
)
if "daily_aggregated_spend_per_user_mv" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_user_mv")
client.command(
"""
CREATE MATERIALIZED VIEW daily_aggregated_spend_per_user_mv
TO daily_aggregated_spend_per_user
AS
SELECT
toDate(startTime) as day,
sumState(spend) AS DailySpend,
user as user
FROM spend_logs
GROUP BY
day, user
"""
)
if "daily_aggregated_spend_mv" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_mv")
client.command(
"""
CREATE MATERIALIZED VIEW daily_aggregated_spend_mv
TO daily_aggregated_spend
AS
SELECT
toDate(startTime) as day,
sumState(spend) AS DailySpend
FROM spend_logs
GROUP BY
day
"""
)
def _create_clickhouse_aggregate_tables(client=None, table_names=[]):
# Basic Logging works without this - this is only used for low latency reporting apis
verbose_logger.debug("Clickhouse: Creating Aggregate Tables")
# Create Aggregeate Tables if they don't exist
if "daily_aggregated_spend_per_model" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_model")
client.command(
"""
CREATE TABLE daily_aggregated_spend_per_model
(
`day` Date,
`DailySpend` AggregateFunction(sum, Float64),
`model` String
)
ENGINE = SummingMergeTree()
ORDER BY (day, model);
"""
)
if "daily_aggregated_spend_per_api_key" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_api_key")
client.command(
"""
CREATE TABLE daily_aggregated_spend_per_api_key
(
`day` Date,
`DailySpend` AggregateFunction(sum, Float64),
`api_key` String
)
ENGINE = SummingMergeTree()
ORDER BY (day, api_key);
"""
)
if "daily_aggregated_spend_per_user" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_user")
client.command(
"""
CREATE TABLE daily_aggregated_spend_per_user
(
`day` Date,
`DailySpend` AggregateFunction(sum, Float64),
`user` String
)
ENGINE = SummingMergeTree()
ORDER BY (day, user);
"""
)
if "daily_aggregated_spend" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend")
client.command(
"""
CREATE TABLE daily_aggregated_spend
(
`day` Date,
`DailySpend` AggregateFunction(sum, Float64),
)
ENGINE = SummingMergeTree()
ORDER BY (day);
"""
)
return
def _forecast_daily_cost(data: list):
import requests # type: ignore
from datetime import datetime, timedelta

View file

@ -13,7 +13,10 @@ from litellm._logging import (
verbose_logger,
json_logs,
_turn_on_json,
log_level,
)
from litellm.proxy._types import (
KeyManagementSystem,
KeyManagementSettings,
@ -34,7 +37,7 @@ input_callback: List[Union[str, Callable]] = []
success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
service_callback: List[Union[str, Callable]] = []
_custom_logger_compatible_callbacks_literal = Literal["lago", "openmeter"]
_custom_logger_compatible_callbacks_literal = Literal["lago", "openmeter", "logfire"]
callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = []
_langfuse_default_tags: Optional[
List[
@ -73,7 +76,7 @@ token: Optional[str] = (
)
telemetry = True
max_tokens = 256 # OpenAI Defaults
drop_params = False
drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
modify_params = False
retry = True
### AUTH ###
@ -240,6 +243,7 @@ num_retries: Optional[int] = None # per model endpoint
default_fallbacks: Optional[List] = None
fallbacks: Optional[List] = None
context_window_fallbacks: Optional[List] = None
content_policy_fallbacks: Optional[List] = None
allowed_fails: int = 0
num_retries_per_request: Optional[int] = (
None # for the request overall (incl. fallbacks + model retries)
@ -337,6 +341,7 @@ bedrock_models: List = []
deepinfra_models: List = []
perplexity_models: List = []
watsonx_models: List = []
gemini_models: List = []
for key, value in model_cost.items():
if value.get("litellm_provider") == "openai":
open_ai_chat_completion_models.append(key)
@ -383,13 +388,16 @@ for key, value in model_cost.items():
perplexity_models.append(key)
elif value.get("litellm_provider") == "watsonx":
watsonx_models.append(key)
elif value.get("litellm_provider") == "gemini":
gemini_models.append(key)
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
openai_compatible_endpoints: List = [
"api.perplexity.ai",
"api.endpoints.anyscale.com/v1",
"api.deepinfra.com/v1/openai",
"api.mistral.ai/v1",
"codestral.mistral.ai/v1/chat/completions",
"codestral.mistral.ai/v1/fim/completions",
"api.groq.com/openai/v1",
"api.deepseek.com/v1",
"api.together.xyz/v1",
@ -401,6 +409,7 @@ openai_compatible_providers: List = [
"anyscale",
"mistral",
"groq",
"codestral",
"deepseek",
"deepinfra",
"perplexity",
@ -592,6 +601,7 @@ model_list = (
+ maritalk_models
+ vertex_language_models
+ watsonx_models
+ gemini_models
)
provider_list: List = [
@ -607,6 +617,7 @@ provider_list: List = [
"together_ai",
"openrouter",
"vertex_ai",
"vertex_ai_beta",
"palm",
"gemini",
"ai21",
@ -627,6 +638,8 @@ provider_list: List = [
"anyscale",
"mistral",
"groq",
"codestral",
"text-completion-codestral",
"deepseek",
"maritalk",
"voyage",
@ -664,6 +677,7 @@ models_by_provider: dict = {
"perplexity": perplexity_models,
"maritalk": maritalk_models,
"watsonx": watsonx_models,
"gemini": gemini_models,
}
# mapping for those models which have larger equivalents
@ -716,6 +730,7 @@ openai_image_generation_models = ["dall-e-2", "dall-e-3"]
from .timeout import timeout
from .cost_calculator import completion_cost
from litellm.litellm_core_utils.litellm_logging import Logging
from .utils import (
client,
exception_type,
@ -724,12 +739,11 @@ from .utils import (
token_counter,
create_pretrained_tokenizer,
create_tokenizer,
cost_per_token,
supports_function_calling,
supports_parallel_function_calling,
supports_vision,
supports_system_messages,
get_litellm_params,
Logging,
acreate,
get_model_list,
get_max_tokens,
@ -749,9 +763,10 @@ from .utils import (
get_first_chars_messages,
ModelResponse,
ImageResponse,
ImageObject,
get_provider_fields,
)
from .types.utils import ImageObject
from .llms.huggingface_restapi import HuggingfaceConfig
from .llms.anthropic import AnthropicConfig
from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig
@ -768,6 +783,7 @@ from .llms.gemini import GeminiConfig
from .llms.nlp_cloud import NLPCloudConfig
from .llms.aleph_alpha import AlephAlphaConfig
from .llms.petals import PetalsConfig
from .llms.vertex_httpx import VertexGeminiConfig
from .llms.vertex_ai import VertexAIConfig, VertexAITextEmbeddingConfig
from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
from .llms.sagemaker import SagemakerConfig
@ -792,7 +808,9 @@ from .llms.openai import (
MistralConfig,
MistralEmbeddingConfig,
DeepInfraConfig,
AzureAIStudioConfig,
)
from .llms.text_completion_codestral import MistralTextCompletionConfig
from .llms.azure import (
AzureOpenAIConfig,
AzureOpenAIError,
@ -826,4 +844,4 @@ from .router import Router
from .assistants.main import *
from .batches.main import *
from .scheduler import *
from .cost_calculator import response_cost_calculator
from .cost_calculator import response_cost_calculator, cost_per_token

View file

@ -1,21 +1,40 @@
import logging, os, json
from logging import Formatter
import json
import logging
import os
import traceback
from datetime import datetime
from logging import Formatter
set_verbose = False
if set_verbose is True:
logging.warning(
"`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."
)
json_logs = bool(os.getenv("JSON_LOGS", False))
# Create a handler for the logger (you may need to adapt this based on your needs)
log_level = os.getenv("LITELLM_LOG", "DEBUG")
numeric_level: str = getattr(logging, log_level.upper())
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
handler.setLevel(numeric_level)
class JsonFormatter(Formatter):
def __init__(self):
super(JsonFormatter, self).__init__()
def formatTime(self, record, datefmt=None):
# Use datetime to format the timestamp in ISO 8601 format
dt = datetime.fromtimestamp(record.created)
return dt.isoformat()
def format(self, record):
json_record = {}
json_record["message"] = record.getMessage()
json_record = {
"message": record.getMessage(),
"level": record.levelname,
"timestamp": self.formatTime(record),
}
return json.dumps(json_record)

View file

@ -1192,7 +1192,7 @@ class S3Cache(BaseCache):
return cached_response
except botocore.exceptions.ClientError as e:
if e.response["Error"]["Code"] == "NoSuchKey":
verbose_logger.error(
verbose_logger.debug(
f"S3 Cache: The specified key '{key}' does not exist in the S3 bucket."
)
return None

View file

@ -1,21 +1,292 @@
# What is this?
## File for 'response_cost' calculation in Logging
from typing import Optional, Union, Literal, List
import time
from typing import List, Literal, Optional, Tuple, Union
import litellm
import litellm._logging
from litellm import verbose_logger
from litellm.litellm_core_utils.llm_cost_calc.google import (
cost_per_character as google_cost_per_character,
)
from litellm.litellm_core_utils.llm_cost_calc.google import (
cost_per_token as google_cost_per_token,
)
from litellm.utils import (
ModelResponse,
CallTypes,
CostPerToken,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
ModelResponse,
TextCompletionResponse,
CallTypes,
cost_per_token,
TranscriptionResponse,
print_verbose,
CostPerToken,
token_counter,
)
import litellm
from litellm import verbose_logger
def _cost_per_token_custom_pricing_helper(
prompt_tokens: float = 0,
completion_tokens: float = 0,
response_time_ms=None,
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> Optional[Tuple[float, float]]:
"""Internal helper function for calculating cost, if custom pricing given"""
if custom_cost_per_token is None and custom_cost_per_second is None:
return None
if custom_cost_per_token is not None:
input_cost = custom_cost_per_token["input_cost_per_token"] * prompt_tokens
output_cost = custom_cost_per_token["output_cost_per_token"] * completion_tokens
return input_cost, output_cost
elif custom_cost_per_second is not None:
output_cost = custom_cost_per_second * response_time_ms / 1000 # type: ignore
return 0, output_cost
return None
def cost_per_token(
model: str = "",
prompt_tokens: float = 0,
completion_tokens: float = 0,
response_time_ms=None,
custom_llm_provider: Optional[str] = None,
region_name=None,
### CHARACTER PRICING ###
prompt_characters: float = 0,
completion_characters: float = 0,
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> Tuple[float, float]:
"""
Calculates the cost per token for a given model, prompt tokens, and completion tokens.
Parameters:
model (str): The name of the model to use. Default is ""
prompt_tokens (int): The number of tokens in the prompt.
completion_tokens (int): The number of tokens in the completion.
response_time (float): The amount of time, in milliseconds, it took the call to complete.
prompt_characters (float): The number of characters in the prompt. Used for vertex ai cost calculation.
completion_characters (float): The number of characters in the completion response. Used for vertex ai cost calculation.
custom_llm_provider (str): The llm provider to whom the call was made (see init.py for full list)
custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
Returns:
tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively.
"""
args = locals()
if model is None:
raise Exception("Invalid arg. Model cannot be none.")
## CUSTOM PRICING ##
response_cost = _cost_per_token_custom_pricing_helper(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
response_time_ms=response_time_ms,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
if response_cost is not None:
return response_cost[0], response_cost[1]
# given
prompt_tokens_cost_usd_dollar: float = 0
completion_tokens_cost_usd_dollar: float = 0
model_cost_ref = litellm.model_cost
model_with_provider = model
if custom_llm_provider is not None:
model_with_provider = custom_llm_provider + "/" + model
if region_name is not None:
model_with_provider_and_region = (
f"{custom_llm_provider}/{region_name}/{model}"
)
if (
model_with_provider_and_region in model_cost_ref
): # use region based pricing, if it's available
model_with_provider = model_with_provider_and_region
else:
_, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
model_without_prefix = model
model_parts = model.split("/")
if len(model_parts) > 1:
model_without_prefix = model_parts[1]
else:
model_without_prefix = model
"""
Code block that formats model to lookup in litellm.model_cost
Option1. model = "bedrock/ap-northeast-1/anthropic.claude-instant-v1". This is the most accurate since it is region based. Should always be option 1
Option2. model = "openai/gpt-4" - model = provider/model
Option3. model = "anthropic.claude-3" - model = model
"""
if (
model_with_provider in model_cost_ref
): # Option 2. use model with provider, model = "openai/gpt-4"
model = model_with_provider
elif model in model_cost_ref: # Option 1. use model passed, model="gpt-4"
model = model
elif (
model_without_prefix in model_cost_ref
): # Option 3. if user passed model="bedrock/anthropic.claude-3", use model="anthropic.claude-3"
model = model_without_prefix
# see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
print_verbose(f"Looking up model={model} in model_cost_map")
if custom_llm_provider == "vertex_ai":
return google_cost_per_character(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
elif custom_llm_provider == "gemini":
return google_cost_per_token(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
elif model in model_cost_ref:
print_verbose(f"Success: model={model} in model_cost_map")
print_verbose(
f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}"
)
if (
model_cost_ref[model].get("input_cost_per_token", None) is not None
and model_cost_ref[model].get("output_cost_per_token", None) is not None
):
## COST PER TOKEN ##
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
elif (
model_cost_ref[model].get("output_cost_per_second", None) is not None
and response_time_ms is not None
):
print_verbose(
f"For model={model} - output_cost_per_second: {model_cost_ref[model].get('output_cost_per_second')}; response time: {response_time_ms}"
)
## COST PER SECOND ##
prompt_tokens_cost_usd_dollar = 0
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_second"]
* response_time_ms
/ 1000
)
elif (
model_cost_ref[model].get("input_cost_per_second", None) is not None
and response_time_ms is not None
):
print_verbose(
f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}"
)
## COST PER SECOND ##
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000
)
completion_tokens_cost_usd_dollar = 0.0
print_verbose(
f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-3.5-turbo" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-3.5-turbo"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-4-0613" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:gpt-4-0613:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4-0613"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4-0613"]["output_cost_per_token"] * completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-4o-2024-05-13" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:gpt-4o-2024-05-13:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4o-2024-05-13"]["input_cost_per_token"]
* prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4o-2024-05-13"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:davinci-002" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:davinci-002:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:davinci-002"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:davinci-002"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:babbage-002" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:babbage-002:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:babbage-002"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:babbage-002"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model in litellm.azure_llms:
verbose_logger.debug(f"Cost Tracking: {model} is an Azure LLM")
model = litellm.azure_llms[model]
verbose_logger.debug(
f"applying cost={model_cost_ref[model]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
)
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
verbose_logger.debug(
f"applying cost={model_cost_ref[model]['output_cost_per_token']} for completion_tokens={completion_tokens}"
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model in litellm.azure_embedding_models:
verbose_logger.debug(f"Cost Tracking: {model} is an Azure Embedding Model")
model = litellm.azure_embedding_models[model]
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
else:
# if model is not in model_prices_and_context_window.json. Raise an exception-let users know
error_str = f"Model not in model_prices_and_context_window.json. You passed model={model}. Register pricing for model - https://docs.litellm.ai/docs/proxy/custom_pricing\n"
raise litellm.exceptions.NotFoundError( # type: ignore
message=error_str,
model=model,
llm_provider="",
)
# Extract the number of billion parameters from the model name
@ -147,7 +418,9 @@ def completion_cost(
model = "dall-e-2" # for dall-e-2, azure expects an empty model name
# Handle Inputs to completion_cost
prompt_tokens = 0
prompt_characters = 0
completion_tokens = 0
completion_characters = 0
custom_llm_provider = None
if completion_response is not None:
# get input/output tokens from completion_response
@ -264,6 +537,30 @@ def completion_cost(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if (
custom_llm_provider is not None
and custom_llm_provider == "vertex_ai"
and completion_response is not None
and isinstance(completion_response, ModelResponse)
):
# Calculate the prompt characters + response characters
if len("messages") > 0:
prompt_string = litellm.utils.get_formatted_prompt(
data={"messages": messages}, call_type="completion"
)
else:
prompt_string = ""
prompt_characters = litellm.utils._count_characters(text=prompt_string)
completion_string = litellm.utils.get_response_string(
response_obj=completion_response
)
completion_characters = litellm.utils._count_characters(
text=completion_string
)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
@ -276,6 +573,8 @@ def completion_cost(
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(

View file

@ -26,7 +26,7 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 401
self.message = message
self.message = "litellm.AuthenticationError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -72,7 +72,7 @@ class NotFoundError(openai.NotFoundError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 404
self.message = message
self.message = "litellm.NotFoundError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
@ -117,7 +117,7 @@ class BadRequestError(openai.BadRequestError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 400
self.message = message
self.message = "litellm.BadRequestError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
@ -162,7 +162,7 @@ class UnprocessableEntityError(openai.UnprocessableEntityError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 422
self.message = message
self.message = "litellm.UnprocessableEntityError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
@ -204,7 +204,7 @@ class Timeout(openai.APITimeoutError): # type: ignore
request=request
) # Call the base class constructor with the parameters it needs
self.status_code = 408
self.message = message
self.message = "litellm.Timeout: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
@ -241,7 +241,7 @@ class PermissionDeniedError(openai.PermissionDeniedError): # type:ignore
num_retries: Optional[int] = None,
):
self.status_code = 403
self.message = message
self.message = "litellm.PermissionDeniedError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -280,7 +280,7 @@ class RateLimitError(openai.RateLimitError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 429
self.message = message
self.message = "litellm.RateLimitError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -324,19 +324,21 @@ class ContextWindowExceededError(BadRequestError): # type: ignore
message,
model,
llm_provider,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 400
self.message = message
self.message = "litellm.ContextWindowExceededError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
self.response = response or httpx.Response(status_code=400, request=request)
super().__init__(
message=self.message,
model=self.model, # type: ignore
llm_provider=self.llm_provider, # type: ignore
response=response,
response=self.response,
litellm_debug_info=self.litellm_debug_info,
) # Call the base class constructor with the parameters it needs
@ -368,7 +370,7 @@ class RejectedRequestError(BadRequestError): # type: ignore
litellm_debug_info: Optional[str] = None,
):
self.status_code = 400
self.message = message
self.message = "litellm.RejectedRequestError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
@ -407,19 +409,21 @@ class ContentPolicyViolationError(BadRequestError): # type: ignore
message,
model,
llm_provider,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 400
self.message = message
self.message = "litellm.ContentPolicyViolationError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
self.response = response or httpx.Response(status_code=500, request=request)
super().__init__(
message=self.message,
model=self.model, # type: ignore
llm_provider=self.llm_provider, # type: ignore
response=response,
response=self.response,
litellm_debug_info=self.litellm_debug_info,
) # Call the base class constructor with the parameters it needs
@ -452,7 +456,7 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 503
self.message = message
self.message = "litellm.ServiceUnavailableError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -501,7 +505,7 @@ class InternalServerError(openai.InternalServerError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 500
self.message = message
self.message = "litellm.InternalServerError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -552,7 +556,7 @@ class APIError(openai.APIError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = status_code
self.message = message
self.message = "litellm.APIError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -589,7 +593,7 @@ class APIConnectionError(openai.APIConnectionError): # type: ignore
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
):
self.message = message
self.message = "litellm.APIConnectionError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.status_code = 500
@ -626,7 +630,7 @@ class APIResponseValidationError(openai.APIResponseValidationError): # type: ig
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
):
self.message = message
self.message = "litellm.APIResponseValidationError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
request = httpx.Request(method="POST", url="https://api.openai.com/v1")

View file

@ -226,14 +226,6 @@ def _start_clickhouse():
response = client.query("DESCRIBE default.spend_logs")
verbose_logger.debug(f"spend logs schema ={response.result_rows}")
# RUN Enterprise Clickhouse Setup
# TLDR: For Enterprise - we create views / aggregate tables for low latency reporting APIs
from litellm.proxy.enterprise.utils import _create_clickhouse_aggregate_tables
from litellm.proxy.enterprise.utils import _create_clickhouse_material_views
_create_clickhouse_aggregate_tables(client=client, table_names=table_names)
_create_clickhouse_material_views(client=client, table_names=table_names)
class ClickhouseLogger:
# Class variables or attributes

View file

@ -10,7 +10,7 @@ import traceback
class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callback#callback-class
# Class variables or attributes
def __init__(self):
def __init__(self) -> None:
pass
def log_pre_api_call(self, model, messages, kwargs):

View file

@ -0,0 +1,136 @@
"""
Functions for sending Email Alerts
"""
import os
from typing import Optional, List
from litellm.proxy._types import WebhookEvent
import asyncio
from litellm._logging import verbose_logger, verbose_proxy_logger
# we use this for the email header, please send a test email if you change this. verify it looks good on email
LITELLM_LOGO_URL = "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
LITELLM_SUPPORT_CONTACT = "support@berri.ai"
async def get_all_team_member_emails(team_id: Optional[str] = None) -> list:
verbose_logger.debug(
"Email Alerting: Getting all team members for team_id=%s", team_id
)
if team_id is None:
return []
from litellm.proxy.proxy_server import premium_user, prisma_client
if prisma_client is None:
raise Exception("Not connected to DB!")
team_row = await prisma_client.db.litellm_teamtable.find_unique(
where={
"team_id": team_id,
}
)
if team_row is None:
return []
_team_members = team_row.members_with_roles
verbose_logger.debug(
"Email Alerting: Got team members for team_id=%s Team Members: %s",
team_id,
_team_members,
)
_team_member_user_ids: List[str] = []
for member in _team_members:
if member and isinstance(member, dict) and member.get("user_id") is not None:
_team_member_user_ids.append(member.get("user_id"))
sql_query = """
SELECT user_email
FROM "LiteLLM_UserTable"
WHERE user_id = ANY($1::TEXT[]);
"""
_result = await prisma_client.db.query_raw(sql_query, _team_member_user_ids)
verbose_logger.debug("Email Alerting: Got all Emails for team, emails=%s", _result)
if _result is None:
return []
emails = []
for user in _result:
if user and isinstance(user, dict) and user.get("user_email", None) is not None:
emails.append(user.get("user_email"))
return emails
async def send_team_budget_alert(webhook_event: WebhookEvent) -> bool:
"""
Send an Email Alert to All Team Members when the Team Budget is crossed
Returns -> True if sent, False if not.
"""
from litellm.proxy.utils import send_email
from litellm.proxy.proxy_server import premium_user, prisma_client
_team_id = webhook_event.team_id
team_alias = webhook_event.team_alias
verbose_logger.debug(
"Email Alerting: Sending Team Budget Alert for team=%s", team_alias
)
email_logo_url = os.getenv("SMTP_SENDER_LOGO", os.getenv("EMAIL_LOGO_URL", None))
email_support_contact = os.getenv("EMAIL_SUPPORT_CONTACT", None)
# await self._check_if_using_premium_email_feature(
# premium_user, email_logo_url, email_support_contact
# )
if email_logo_url is None:
email_logo_url = LITELLM_LOGO_URL
if email_support_contact is None:
email_support_contact = LITELLM_SUPPORT_CONTACT
recipient_emails = await get_all_team_member_emails(_team_id)
recipient_emails_str: str = ",".join(recipient_emails)
verbose_logger.debug(
"Email Alerting: Sending team budget alert to %s", recipient_emails_str
)
event_name = webhook_event.event_message
max_budget = webhook_event.max_budget
email_html_content = "Alert from LiteLLM Server"
if recipient_emails_str is None:
verbose_proxy_logger.error(
"Email Alerting: Trying to send email alert to no recipient, got recipient_emails=%s",
recipient_emails_str,
)
email_html_content = f"""
<img src="{email_logo_url}" alt="LiteLLM Logo" width="150" height="50" /> <br/><br/><br/>
Budget Crossed for Team <b> {team_alias} </b> <br/> <br/>
Your team's LLM API usage has crossed its <b> budget of ${max_budget} </b>, current spend is <b>${webhook_event.spend}</b><br /> <br />
API requests will be rejected until either (a) you increase your budget or (b) your budget gets reset <br /> <br />
If you have any questions, please send an email to {email_support_contact} <br /> <br />
Best, <br />
The LiteLLM team <br />
"""
email_event = {
"to": recipient_emails_str,
"subject": f"LiteLLM {event_name} for Team {team_alias}",
"html": email_html_content,
}
await send_email(
receiver_email=email_event["to"],
subject=email_event["subject"],
html=email_event["html"],
)
return False

View file

@ -1,13 +1,19 @@
# What is this?
## On Success events log cost to Lago - https://github.com/BerriAI/litellm/issues/3639
import dotenv, os, json
import json
import os
import traceback
import uuid
from typing import Literal, Optional
import dotenv
import httpx
import litellm
import traceback, httpx
from litellm import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
import uuid
from typing import Optional, Literal
def get_utc_datetime():
@ -143,6 +149,7 @@ class LagoLogger(CustomLogger):
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
verbose_logger.debug("ENTERS LAGO CALLBACK")
_url = os.getenv("LAGO_API_BASE")
assert _url is not None and isinstance(
_url, str

View file

@ -1,21 +1,27 @@
#### What this does ####
# On success, logs events to Langfuse
import os
import copy
import os
import traceback
from packaging.version import Version
from litellm._logging import verbose_logger
import litellm
from litellm._logging import verbose_logger
class LangFuseLogger:
# Class variables or attributes
def __init__(
self, langfuse_public_key=None, langfuse_secret=None, flush_interval=1
self,
langfuse_public_key=None,
langfuse_secret=None,
langfuse_host=None,
flush_interval=1,
):
try:
from langfuse import Langfuse
import langfuse
from langfuse import Langfuse
except Exception as e:
raise Exception(
f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\n{traceback.format_exc()}\033[0m"
@ -23,7 +29,9 @@ class LangFuseLogger:
# Instance variables
self.secret_key = langfuse_secret or os.getenv("LANGFUSE_SECRET_KEY")
self.public_key = langfuse_public_key or os.getenv("LANGFUSE_PUBLIC_KEY")
self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
self.langfuse_host = langfuse_host or os.getenv(
"LANGFUSE_HOST", "https://cloud.langfuse.com"
)
self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
@ -167,7 +175,7 @@ class LangFuseLogger:
or isinstance(response_obj, litellm.EmbeddingResponse)
):
input = prompt
output = response_obj["data"]
output = None
elif response_obj is not None and isinstance(
response_obj, litellm.ModelResponse
):
@ -251,7 +259,7 @@ class LangFuseLogger:
input,
response_obj,
):
from langfuse.model import CreateTrace, CreateGeneration
from langfuse.model import CreateGeneration, CreateTrace
verbose_logger.warning(
"Please upgrade langfuse to v2.0.0 or higher: https://github.com/langfuse/langfuse-python/releases/tag/v2.0.1"
@ -528,31 +536,14 @@ class LangFuseLogger:
"version": clean_metadata.pop("version", None),
}
parent_observation_id = metadata.get("parent_observation_id", None)
if parent_observation_id is not None:
generation_params["parent_observation_id"] = parent_observation_id
if supports_prompt:
user_prompt = clean_metadata.pop("prompt", None)
if user_prompt is None:
pass
elif isinstance(user_prompt, dict):
from langfuse.model import (
TextPromptClient,
ChatPromptClient,
Prompt_Text,
Prompt_Chat,
)
if user_prompt.get("type", "") == "chat":
_prompt_chat = Prompt_Chat(**user_prompt)
generation_params["prompt"] = ChatPromptClient(
prompt=_prompt_chat
)
elif user_prompt.get("type", "") == "text":
_prompt_text = Prompt_Text(**user_prompt)
generation_params["prompt"] = TextPromptClient(
prompt=_prompt_text
)
else:
generation_params["prompt"] = user_prompt
generation_params = _add_prompt_to_generation_params(
generation_params=generation_params, clean_metadata=clean_metadata
)
if output is not None and isinstance(output, str) and level == "ERROR":
generation_params["status_message"] = output
@ -565,5 +556,58 @@ class LangFuseLogger:
return generation_client.trace_id, generation_id
except Exception as e:
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
verbose_logger.error(f"Langfuse Layer Error - {traceback.format_exc()}")
return None, None
def _add_prompt_to_generation_params(
generation_params: dict, clean_metadata: dict
) -> dict:
from langfuse.model import (
ChatPromptClient,
Prompt_Chat,
Prompt_Text,
TextPromptClient,
)
user_prompt = clean_metadata.pop("prompt", None)
if user_prompt is None:
pass
elif isinstance(user_prompt, dict):
if user_prompt.get("type", "") == "chat":
_prompt_chat = Prompt_Chat(**user_prompt)
generation_params["prompt"] = ChatPromptClient(prompt=_prompt_chat)
elif user_prompt.get("type", "") == "text":
_prompt_text = Prompt_Text(**user_prompt)
generation_params["prompt"] = TextPromptClient(prompt=_prompt_text)
elif "version" in user_prompt and "prompt" in user_prompt:
# prompts
if isinstance(user_prompt["prompt"], str):
_prompt_obj = Prompt_Text(
name=user_prompt["name"],
prompt=user_prompt["prompt"],
version=user_prompt["version"],
config=user_prompt.get("config", None),
)
generation_params["prompt"] = TextPromptClient(prompt=_prompt_obj)
elif isinstance(user_prompt["prompt"], list):
_prompt_obj = Prompt_Chat(
name=user_prompt["name"],
prompt=user_prompt["prompt"],
version=user_prompt["version"],
config=user_prompt.get("config", None),
)
generation_params["prompt"] = ChatPromptClient(prompt=_prompt_obj)
else:
verbose_logger.error(
"[Non-blocking] Langfuse Logger: Invalid prompt format"
)
else:
verbose_logger.error(
"[Non-blocking] Langfuse Logger: Invalid prompt format. No prompt logged to Langfuse"
)
else:
generation_params["prompt"] = user_prompt
return generation_params

View file

@ -105,7 +105,6 @@ class LunaryLogger:
end_time=datetime.now(timezone.utc),
error=None,
):
# Method definition
try:
print_verbose(f"Lunary Logging - Logging request for model {model}")
@ -114,10 +113,9 @@ class LunaryLogger:
metadata = litellm_params.get("metadata", {}) or {}
if optional_params:
# merge into extra
extra = {**extra, **optional_params}
tags = litellm_params.pop("tags", None) or []
tags = metadata.get("tags", None)
if extra:
extra.pop("extra_body", None)

View file

@ -1,20 +1,21 @@
import os
from dataclasses import dataclass
from datetime import datetime
import litellm
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_logger
from litellm.types.services import ServiceLoggerPayload
from functools import wraps
from typing import Union, Optional, TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, Optional, Union
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.services import ServiceLoggerPayload
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
from litellm.proxy.proxy_server import UserAPIKeyAuth as _UserAPIKeyAuth
from litellm.proxy._types import (
ManagementEndpointLoggingPayload as _ManagementEndpointLoggingPayload,
)
from litellm.proxy.proxy_server import UserAPIKeyAuth as _UserAPIKeyAuth
Span = _Span
UserAPIKeyAuth = _UserAPIKeyAuth
@ -107,8 +108,9 @@ class OpenTelemetry(CustomLogger):
start_time: Optional[datetime] = None,
end_time: Optional[datetime] = None,
):
from opentelemetry import trace
from datetime import datetime
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = start_time
@ -145,8 +147,9 @@ class OpenTelemetry(CustomLogger):
start_time: Optional[datetime] = None,
end_time: Optional[datetime] = None,
):
from opentelemetry import trace
from datetime import datetime
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = start_time
@ -179,8 +182,8 @@ class OpenTelemetry(CustomLogger):
async def async_post_call_failure_hook(
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
):
from opentelemetry.trace import Status, StatusCode
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
parent_otel_span = user_api_key_dict.parent_otel_span
if parent_otel_span is not None:
@ -202,8 +205,8 @@ class OpenTelemetry(CustomLogger):
parent_otel_span.end(end_time=self._to_ns(datetime.now()))
def _handle_sucess(self, kwargs, response_obj, start_time, end_time):
from opentelemetry.trace import Status, StatusCode
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
verbose_logger.debug(
"OpenTelemetry Logger: Logging kwargs: %s, OTEL config settings=%s",
@ -253,9 +256,10 @@ class OpenTelemetry(CustomLogger):
span.end(end_time=self._to_ns(end_time))
def set_tools_attributes(self, span: Span, tools):
from litellm.proxy._types import SpanAttributes
import json
from litellm.proxy._types import SpanAttributes
if not tools:
return
@ -320,7 +324,7 @@ class OpenTelemetry(CustomLogger):
)
span.set_attribute(
SpanAttributes.LLM_IS_STREAMING, optional_params.get("stream", False)
SpanAttributes.LLM_IS_STREAMING, str(optional_params.get("stream", False))
)
if optional_params.get("tools"):
@ -439,7 +443,7 @@ class OpenTelemetry(CustomLogger):
#############################################
########## LLM Response Attributes ##########
#############################################
if _raw_response:
if _raw_response and isinstance(_raw_response, str):
# cast sr -> dict
import json
@ -478,10 +482,10 @@ class OpenTelemetry(CustomLogger):
return _parent_context
def _get_span_context(self, kwargs):
from opentelemetry import trace
from opentelemetry.trace.propagation.tracecontext import (
TraceContextTextMapPropagator,
)
from opentelemetry import trace
litellm_params = kwargs.get("litellm_params", {}) or {}
proxy_server_request = litellm_params.get("proxy_server_request", {}) or {}
@ -505,17 +509,17 @@ class OpenTelemetry(CustomLogger):
return TraceContextTextMapPropagator().extract(carrier=carrier), None
def _get_span_processor(self):
from opentelemetry.sdk.trace.export import (
SpanExporter,
SimpleSpanProcessor,
BatchSpanProcessor,
ConsoleSpanExporter,
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
OTLPSpanExporter as OTLPSpanExporterGRPC,
)
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
OTLPSpanExporter as OTLPSpanExporterHTTP,
)
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
OTLPSpanExporter as OTLPSpanExporterGRPC,
from opentelemetry.sdk.trace.export import (
BatchSpanProcessor,
ConsoleSpanExporter,
SimpleSpanProcessor,
SpanExporter,
)
verbose_logger.debug(
@ -574,8 +578,9 @@ class OpenTelemetry(CustomLogger):
logging_payload: ManagementEndpointLoggingPayload,
parent_otel_span: Optional[Span] = None,
):
from opentelemetry import trace
from datetime import datetime
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = logging_payload.start_time
@ -619,8 +624,9 @@ class OpenTelemetry(CustomLogger):
logging_payload: ManagementEndpointLoggingPayload,
parent_otel_span: Optional[Span] = None,
):
from opentelemetry import trace
from datetime import datetime
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = logging_payload.start_time

View file

@ -8,6 +8,7 @@ import traceback
import datetime, subprocess, sys
import litellm, uuid
from litellm._logging import print_verbose, verbose_logger
from typing import Optional, Union
class PrometheusLogger:
@ -17,33 +18,76 @@ class PrometheusLogger:
**kwargs,
):
try:
from prometheus_client import Counter
from prometheus_client import Counter, Gauge
self.litellm_llm_api_failed_requests_metric = Counter(
name="litellm_llm_api_failed_requests_metric",
documentation="Total number of failed LLM API calls via litellm",
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
labelnames=[
"end_user",
"hashed_api_key",
"model",
"team",
"team_alias",
"user",
],
)
self.litellm_requests_metric = Counter(
name="litellm_requests_metric",
documentation="Total number of LLM calls to litellm",
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
labelnames=[
"end_user",
"hashed_api_key",
"model",
"team",
"team_alias",
"user",
],
)
# Counter for spend
self.litellm_spend_metric = Counter(
"litellm_spend_metric",
"Total spend on LLM requests",
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
labelnames=[
"end_user",
"hashed_api_key",
"model",
"team",
"team_alias",
"user",
],
)
# Counter for total_output_tokens
self.litellm_tokens_metric = Counter(
"litellm_total_tokens",
"Total number of input + output tokens from LLM requests",
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
labelnames=[
"end_user",
"hashed_api_key",
"model",
"team",
"team_alias",
"user",
],
)
# Remaining Budget for Team
self.litellm_remaining_team_budget_metric = Gauge(
"litellm_remaining_team_budget_metric",
"Remaining budget for team",
labelnames=["team_id", "team_alias"],
)
# Remaining Budget for API Key
self.litellm_remaining_api_key_budget_metric = Gauge(
"litellm_remaining_api_key_budget_metric",
"Remaining budget for api key",
labelnames=["hashed_api_key", "api_key_alias"],
)
except Exception as e:
print_verbose(f"Got exception on init prometheus client {str(e)}")
raise e
@ -51,7 +95,9 @@ class PrometheusLogger:
async def _async_log_event(
self, kwargs, response_obj, start_time, end_time, print_verbose, user_id
):
self.log_event(kwargs, response_obj, start_time, end_time, print_verbose)
self.log_event(
kwargs, response_obj, start_time, end_time, user_id, print_verbose
)
def log_event(
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
@ -72,9 +118,36 @@ class PrometheusLogger:
"user_api_key_user_id", None
)
user_api_key = litellm_params.get("metadata", {}).get("user_api_key", None)
user_api_key_alias = litellm_params.get("metadata", {}).get(
"user_api_key_alias", None
)
user_api_team = litellm_params.get("metadata", {}).get(
"user_api_key_team_id", None
)
user_api_team_alias = litellm_params.get("metadata", {}).get(
"user_api_key_team_alias", None
)
_team_spend = litellm_params.get("metadata", {}).get(
"user_api_key_team_spend", None
)
_team_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_team_max_budget", None
)
_remaining_team_budget = safe_get_remaining_budget(
max_budget=_team_max_budget, spend=_team_spend
)
_api_key_spend = litellm_params.get("metadata", {}).get(
"user_api_key_spend", None
)
_api_key_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_max_budget", None
)
_remaining_api_key_budget = safe_get_remaining_budget(
max_budget=_api_key_max_budget, spend=_api_key_spend
)
if response_obj is not None:
tokens_used = response_obj.get("usage", {}).get("total_tokens", 0)
else:
@ -94,19 +167,47 @@ class PrometheusLogger:
user_api_key = hash_token(user_api_key)
self.litellm_requests_metric.labels(
end_user_id, user_api_key, model, user_api_team, user_id
end_user_id,
user_api_key,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc()
self.litellm_spend_metric.labels(
end_user_id, user_api_key, model, user_api_team, user_id
end_user_id,
user_api_key,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc(response_cost)
self.litellm_tokens_metric.labels(
end_user_id, user_api_key, model, user_api_team, user_id
end_user_id,
user_api_key,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc(tokens_used)
self.litellm_remaining_team_budget_metric.labels(
user_api_team, user_api_team_alias
).set(_remaining_team_budget)
self.litellm_remaining_api_key_budget_metric.labels(
user_api_key, user_api_key_alias
).set(_remaining_api_key_budget)
### FAILURE INCREMENT ###
if "exception" in kwargs:
self.litellm_llm_api_failed_requests_metric.labels(
end_user_id, user_api_key, model, user_api_team, user_id
end_user_id,
user_api_key,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc()
except Exception as e:
verbose_logger.error(
@ -114,3 +215,15 @@ class PrometheusLogger:
)
verbose_logger.debug(traceback.format_exc())
pass
def safe_get_remaining_budget(
max_budget: Optional[float], spend: Optional[float]
) -> float:
if max_budget is None:
return float("inf")
if spend is None:
return max_budget
return max_budget - spend
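
For illustration, a minimal standalone sketch (not the proxy's actual wiring) of how the helper above can feed the new team-budget gauge; the metric name and label names are taken from the diff, while the team id/alias and dollar amounts are made up:

from typing import Optional

from prometheus_client import Gauge


def safe_get_remaining_budget(
    max_budget: Optional[float], spend: Optional[float]
) -> float:
    # mirrors the helper above: no budget -> unlimited, no spend -> full budget
    if max_budget is None:
        return float("inf")
    if spend is None:
        return max_budget
    return max_budget - spend


litellm_remaining_team_budget_metric = Gauge(
    "litellm_remaining_team_budget_metric",
    "Remaining budget for team",
    labelnames=["team_id", "team_alias"],
)

# e.g. a team with a $100 max budget that has spent $42.50 so far (illustrative values)
remaining = safe_get_remaining_budget(max_budget=100.0, spend=42.50)
litellm_remaining_team_budget_metric.labels("team-123", "prod-team").set(remaining)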

View file

@ -330,6 +330,7 @@ class SlackAlerting(CustomLogger):
messages = "Message not logged. litellm.redact_messages_in_exceptions=True"
request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
alerting_metadata: dict = {}
if time_difference_float > self.alerting_threshold:
# add deployment latencies to alert
if (
@ -337,7 +338,7 @@ class SlackAlerting(CustomLogger):
and "litellm_params" in kwargs
and "metadata" in kwargs["litellm_params"]
):
_metadata = kwargs["litellm_params"]["metadata"]
_metadata: dict = kwargs["litellm_params"]["metadata"]
request_info = litellm.utils._add_key_name_and_team_to_alert(
request_info=request_info, metadata=_metadata
)
@ -349,10 +350,14 @@ class SlackAlerting(CustomLogger):
request_info += (
f"\nAvailable Deployment Latencies\n{_deployment_latency_map}"
)
if "alerting_metadata" in _metadata:
alerting_metadata = _metadata["alerting_metadata"]
await self.send_alert(
message=slow_message + request_info,
level="Low",
alert_type="llm_too_slow",
alerting_metadata=alerting_metadata,
)
async def async_update_daily_reports(
@ -540,7 +545,12 @@ class SlackAlerting(CustomLogger):
message += f"\n\nNext Run is at: `{time.time() + self.alerting_args.daily_report_frequency}`s"
# send alert
await self.send_alert(message=message, level="Low", alert_type="daily_reports")
await self.send_alert(
message=message,
level="Low",
alert_type="daily_reports",
alerting_metadata={},
)
return True
@ -582,6 +592,7 @@ class SlackAlerting(CustomLogger):
await asyncio.sleep(
self.alerting_threshold
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
alerting_metadata: dict = {}
if (
request_data is not None
and request_data.get("litellm_status", "") != "success"
@ -606,7 +617,7 @@ class SlackAlerting(CustomLogger):
):
# In hanging requests, sometimes the request has not made it to the point where the deployment is passed to `request_data`
# in that case we fall back to the api base set in the request metadata
_metadata = request_data["metadata"]
_metadata: dict = request_data["metadata"]
_api_base = _metadata.get("api_base", "")
request_info = litellm.utils._add_key_name_and_team_to_alert(
@ -615,6 +626,9 @@ class SlackAlerting(CustomLogger):
if _api_base is None:
_api_base = ""
if "alerting_metadata" in _metadata:
alerting_metadata = _metadata["alerting_metadata"]
request_info += f"\nAPI Base: `{_api_base}`"
# only alert hanging responses if they have not been marked as success
alerting_message = (
@ -640,6 +654,7 @@ class SlackAlerting(CustomLogger):
message=alerting_message + request_info,
level="Medium",
alert_type="llm_requests_hanging",
alerting_metadata=alerting_metadata,
)
async def failed_tracking_alert(self, error_message: str):
@ -650,7 +665,10 @@ class SlackAlerting(CustomLogger):
result = await _cache.async_get_cache(key=_cache_key)
if result is None:
await self.send_alert(
message=message, level="High", alert_type="budget_alerts"
message=message,
level="High",
alert_type="budget_alerts",
alerting_metadata={},
)
await _cache.async_set_cache(
key=_cache_key,
@ -680,7 +698,7 @@ class SlackAlerting(CustomLogger):
return
if "budget_alerts" not in self.alert_types:
return
_id: str = "default_id" # used for caching
_id: Optional[str] = "default_id" # used for caching
user_info_json = user_info.model_dump(exclude_none=True)
for k, v in user_info_json.items():
user_info_str = "\n{}: {}\n".format(k, v)
@ -751,6 +769,7 @@ class SlackAlerting(CustomLogger):
level="High",
alert_type="budget_alerts",
user_info=webhook_event,
alerting_metadata={},
)
await _cache.async_set_cache(
key=_cache_key,
@ -769,7 +788,13 @@ class SlackAlerting(CustomLogger):
response_cost: Optional[float],
max_budget: Optional[float],
):
if end_user_id is not None and token is not None and response_cost is not None:
if (
self.alerting is not None
and "webhook" in self.alerting
and end_user_id is not None
and token is not None
and response_cost is not None
):
# log customer spend
event = WebhookEvent(
spend=response_cost,
@ -941,7 +966,10 @@ class SlackAlerting(CustomLogger):
)
# send minor alert
await self.send_alert(
message=msg, level="Medium", alert_type="outage_alerts"
message=msg,
level="Medium",
alert_type="outage_alerts",
alerting_metadata={},
)
# set to true
outage_value["minor_alert_sent"] = True
@ -963,7 +991,12 @@ class SlackAlerting(CustomLogger):
)
# send minor alert
await self.send_alert(message=msg, level="High", alert_type="outage_alerts")
await self.send_alert(
message=msg,
level="High",
alert_type="outage_alerts",
alerting_metadata={},
)
# set to true
outage_value["major_alert_sent"] = True
@ -1062,7 +1095,10 @@ class SlackAlerting(CustomLogger):
)
# send minor alert
await self.send_alert(
message=msg, level="Medium", alert_type="outage_alerts"
message=msg,
level="Medium",
alert_type="outage_alerts",
alerting_metadata={},
)
# set to true
outage_value["minor_alert_sent"] = True
@ -1081,7 +1117,10 @@ class SlackAlerting(CustomLogger):
)
# send minor alert
await self.send_alert(
message=msg, level="High", alert_type="outage_alerts"
message=msg,
level="High",
alert_type="outage_alerts",
alerting_metadata={},
)
# set to true
outage_value["major_alert_sent"] = True
@ -1143,7 +1182,10 @@ Model Info:
"""
alert_val = self.send_alert(
message=message, level="Low", alert_type="new_model_added"
message=message,
level="Low",
alert_type="new_model_added",
alerting_metadata={},
)
if alert_val is not None and asyncio.iscoroutine(alert_val):
@ -1159,6 +1201,9 @@ Model Info:
Currently only implemented for budget alerts
Returns -> True if sent, False if not.
Raises Exception
- if WEBHOOK_URL is not set
"""
webhook_url = os.getenv("WEBHOOK_URL", None)
@ -1297,7 +1342,9 @@ Model Info:
verbose_proxy_logger.error("Error sending email alert %s", str(e))
return False
async def send_email_alert_using_smtp(self, webhook_event: WebhookEvent) -> bool:
async def send_email_alert_using_smtp(
self, webhook_event: WebhookEvent, alert_type: str
) -> bool:
"""
Sends structured Email alert to an SMTP server
@ -1306,7 +1353,6 @@ Model Info:
Returns -> True if sent, False if not.
"""
from litellm.proxy.utils import send_email
from litellm.proxy.proxy_server import premium_user, prisma_client
email_logo_url = os.getenv(
@ -1360,6 +1406,10 @@ Model Info:
subject=email_event["subject"],
html=email_event["html"],
)
if webhook_event.event_group == "team":
from litellm.integrations.email_alerting import send_team_budget_alert
await send_team_budget_alert(webhook_event=webhook_event)
return False
@ -1368,6 +1418,7 @@ Model Info:
message: str,
level: Literal["Low", "Medium", "High"],
alert_type: Literal[AlertType],
alerting_metadata: dict,
user_info: Optional[WebhookEvent] = None,
**kwargs,
):
@ -1401,7 +1452,9 @@ Model Info:
and user_info is not None
):
# only send budget alerts over Email
await self.send_email_alert_using_smtp(webhook_event=user_info)
await self.send_email_alert_using_smtp(
webhook_event=user_info, alert_type=alert_type
)
if "slack" not in self.alerting:
return
@ -1425,6 +1478,9 @@ Model Info:
if kwargs:
for key, value in kwargs.items():
formatted_message += f"\n\n{key}: `{value}`\n\n"
if alerting_metadata:
for key, value in alerting_metadata.items():
formatted_message += f"\n\n*Alerting Metadata*: \n{key}: `{value}`\n\n"
if _proxy_base_url is not None:
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
@ -1440,7 +1496,7 @@ Model Info:
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
if slack_webhook_url is None:
raise Exception("Missing SLACK_WEBHOOK_URL from environment")
raise ValueError("Missing SLACK_WEBHOOK_URL from environment")
payload = {"text": formatted_message}
headers = {"Content-type": "application/json"}
@ -1453,7 +1509,7 @@ Model Info:
pass
else:
verbose_proxy_logger.debug(
"Error sending slack alert. Error=", response.text
"Error sending slack alert. Error={}".format(response.text)
)
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
@ -1622,6 +1678,7 @@ Model Info:
message=_weekly_spend_message,
level="Low",
alert_type="spend_reports",
alerting_metadata={},
)
except Exception as e:
verbose_proxy_logger.error("Error sending weekly spend report", e)
@ -1673,6 +1730,7 @@ Model Info:
message=_spend_message,
level="Low",
alert_type="spend_reports",
alerting_metadata={},
)
except Exception as e:
verbose_proxy_logger.error("Error sending weekly spend report", e)
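
A minimal sketch of the updated send_alert signature, assuming slack_alerting is an already-configured SlackAlerting instance and SLACK_WEBHOOK_URL is set; the metadata keys and message text are illustrative. The extra key/value pairs are appended to the Slack message under "*Alerting Metadata*".

async def notify_slow_response(slack_alerting):
    # alerting_metadata is the new required argument on send_alert
    await slack_alerting.send_alert(
        message="`Responses are slow - 12.3s response time > Alerting threshold: 5s`",
        level="Low",
        alert_type="llm_too_slow",
        alerting_metadata={"environment": "staging", "team": "ml-platform"},
    )

# run with: asyncio.run(notify_slow_response(slack_alerting))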

View file

@ -0,0 +1,41 @@
# What is this?
## Helper utilities for the model response objects
def map_finish_reason(
finish_reason: str,
): # openai supports 5 finish reasons - 'stop', 'length', 'function_call', 'content_filter', 'null'
# anthropic mapping
if finish_reason == "stop_sequence":
return "stop"
# cohere mapping - https://docs.cohere.com/reference/generate
elif finish_reason == "COMPLETE":
return "stop"
elif finish_reason == "MAX_TOKENS": # cohere + vertex ai
return "length"
elif finish_reason == "ERROR_TOXIC":
return "content_filter"
elif (
finish_reason == "ERROR"
): # openai currently doesn't support an 'error' finish reason
return "stop"
# huggingface mapping https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference/generate_stream
elif finish_reason == "eos_token" or finish_reason == "stop_sequence":
return "stop"
elif (
finish_reason == "FINISH_REASON_UNSPECIFIED" or finish_reason == "STOP"
): # vertex ai - got from running `print(dir(response_obj.candidates[0].finish_reason))`: ['FINISH_REASON_UNSPECIFIED', 'MAX_TOKENS', 'OTHER', 'RECITATION', 'SAFETY', 'STOP',]
return "stop"
elif finish_reason == "SAFETY": # vertex ai
return "content_filter"
elif finish_reason == "STOP": # vertex ai
return "stop"
elif finish_reason == "end_turn" or finish_reason == "stop_sequence": # anthropic
return "stop"
elif finish_reason == "max_tokens": # anthropic
return "length"
elif finish_reason == "tool_use": # anthropic
return "tool_calls"
elif finish_reason == "content_filtered":
return "content_filter"
return finish_reason
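
As a quick sanity check, a few illustrative mappings produced by the helper above (the import path matches the one used elsewhere in this commit):

from litellm.litellm_core_utils.core_helpers import map_finish_reason

assert map_finish_reason("COMPLETE") == "stop"            # cohere
assert map_finish_reason("MAX_TOKENS") == "length"        # cohere / vertex ai
assert map_finish_reason("tool_use") == "tool_calls"      # anthropic
assert map_finish_reason("SAFETY") == "content_filter"    # vertex ai
assert map_finish_reason("unknown_reason") == "unknown_reason"  # unmapped values pass through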

File diff suppressed because it is too large

View file

@ -0,0 +1,210 @@
# What is this?
## Cost calculation for Google AI Studio / Vertex AI models
import traceback
from typing import List, Literal, Optional, Tuple
import litellm
from litellm import verbose_logger
"""
Gemini pricing covers:
- token
- image
- audio
- video
"""
"""
Vertex AI -> character based pricing
Google AI Studio -> token based pricing
"""
models_without_dynamic_pricing = ["gemini-1.0-pro", "gemini-pro"]
def _is_above_128k(tokens: float) -> bool:
if tokens > 128000:
return True
return False
def cost_per_character(
model: str,
custom_llm_provider: str,
prompt_tokens: float,
completion_tokens: float,
prompt_characters: float,
completion_characters: float,
) -> Tuple[float, float]:
"""
Calculates the prompt and completion cost (in USD) for a given Vertex AI model from the character and token counts.
Input:
- model: str, the model name without provider prefix
- custom_llm_provider: str, "vertex_ai-*"
- prompt_characters: float, the number of input characters
- completion_characters: float, the number of output characters
Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
Raises:
Exception if model requires >128k pricing, but model cost not mapped
"""
## GET MODEL INFO
model_info = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
## CALCULATE INPUT COST
try:
if (
_is_above_128k(tokens=prompt_characters * 4) # 1 token = 4 char
and model not in models_without_dynamic_pricing
):
## check if character pricing, else default to token pricing
assert (
"input_cost_per_character_above_128k_tokens" in model_info
and model_info["input_cost_per_character_above_128k_tokens"] is not None
), "model info for model={} does not have 'input_cost_per_character_above_128k_tokens'-pricing for > 128k tokens\nmodel_info={}".format(
model, model_info
)
prompt_cost = (
prompt_characters
* model_info["input_cost_per_character_above_128k_tokens"]
)
else:
assert (
"input_cost_per_character" in model_info
and model_info["input_cost_per_character"] is not None
), "model info for model={} does not have 'input_cost_per_character'-pricing\nmodel_info={}".format(
model, model_info
)
prompt_cost = prompt_characters * model_info["input_cost_per_character"]
except Exception as e:
verbose_logger.error(
"litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Exception occured - {}\n{}\n\
Defaulting to (cost_per_token * 4) calculation for prompt_cost".format(
str(e), traceback.format_exc()
)
)
initial_prompt_cost, _ = cost_per_token(
model=model,
custom_llm_provider=custom_llm_provider,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
prompt_cost = initial_prompt_cost * 4
## CALCULATE OUTPUT COST
try:
if (
_is_above_128k(tokens=completion_characters * 4) # 1 token = 4 char
and model not in models_without_dynamic_pricing
):
assert (
"output_cost_per_character_above_128k_tokens" in model_info
and model_info["output_cost_per_character_above_128k_tokens"]
is not None
), "model info for model={} does not have 'output_cost_per_character_above_128k_tokens' pricing\nmodel_info={}".format(
model, model_info
)
completion_cost = (
completion_tokens
* model_info["output_cost_per_character_above_128k_tokens"]
)
else:
assert (
"output_cost_per_character" in model_info
and model_info["output_cost_per_character"] is not None
), "model info for model={} does not have 'output_cost_per_character'-pricing\nmodel_info={}".format(
model, model_info
)
completion_cost = (
completion_tokens * model_info["output_cost_per_character"]
)
except Exception as e:
verbose_logger.error(
"litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Exception occured - {}\n{}\n\
Defaulting to (cost_per_token * 4) calculation for completion_cost".format(
str(e), traceback.format_exc()
)
)
_, initial_completion_cost = cost_per_token(
model=model,
custom_llm_provider=custom_llm_provider,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
completion_cost = initial_completion_cost * 4
return prompt_cost, completion_cost
def cost_per_token(
model: str,
custom_llm_provider: str,
prompt_tokens: float,
completion_tokens: float,
) -> Tuple[float, float]:
"""
Calculates the cost per token for a given model, prompt tokens, and completion tokens.
Input:
- model: str, the model name without provider prefix
- custom_llm_provider: str, either "vertex_ai-*" or "gemini"
- prompt_tokens: float, the number of input tokens
- completion_tokens: float, the number of output tokens
Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
Raises:
Exception if model requires >128k pricing, but model cost not mapped
"""
## GET MODEL INFO
model_info = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
## CALCULATE INPUT COST
if (
_is_above_128k(tokens=prompt_tokens)
and model not in models_without_dynamic_pricing
):
assert (
"input_cost_per_token_above_128k_tokens" in model_info
and model_info["input_cost_per_token_above_128k_tokens"] is not None
), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
model, model_info
)
prompt_cost = (
prompt_tokens * model_info["input_cost_per_token_above_128k_tokens"]
)
else:
prompt_cost = prompt_tokens * model_info["input_cost_per_token"]
## CALCULATE OUTPUT COST
if (
_is_above_128k(tokens=completion_tokens)
and model not in models_without_dynamic_pricing
):
assert (
"output_cost_per_token_above_128k_tokens" in model_info
and model_info["output_cost_per_token_above_128k_tokens"] is not None
), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
model, model_info
)
completion_cost = (
completion_tokens * model_info["output_cost_per_token_above_128k_tokens"]
)
else:
completion_cost = completion_tokens * model_info["output_cost_per_token"]
return prompt_cost, completion_cost
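
To make the >128k branch concrete, a small worked example with hypothetical per-token rates (illustrative numbers only, not actual Gemini pricing):

# hypothetical rates: $0.50 / 1M tokens up to 128k, $1.00 / 1M tokens above it
input_cost_per_token = 0.50 / 1_000_000
input_cost_per_token_above_128k = 1.00 / 1_000_000

prompt_tokens = 200_000  # above the 128k threshold
if prompt_tokens > 128_000:
    prompt_cost = prompt_tokens * input_cost_per_token_above_128k
else:
    prompt_cost = prompt_tokens * input_cost_per_token

print(f"prompt cost: ${prompt_cost:.4f}")  # 200,000 * $1.00/1M = $0.2000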

View file

@ -0,0 +1,28 @@
from typing import Dict, Optional
def _ensure_extra_body_is_safe(extra_body: Optional[Dict]) -> Optional[Dict]:
"""
Ensure that the extra_body sent in the request is safe; otherwise users will see this error:
"Object of type TextPromptClient is not JSON serializable"
Relevant Issue: https://github.com/BerriAI/litellm/issues/4140
"""
if extra_body is None:
return None
if not isinstance(extra_body, dict):
return extra_body
if "metadata" in extra_body and isinstance(extra_body["metadata"], dict):
if "prompt" in extra_body["metadata"]:
_prompt = extra_body["metadata"].get("prompt")
# users can send Langfuse TextPromptClient objects, so we need to convert them to dicts
# Langfuse TextPromptClients have .__dict__ attribute
if _prompt is not None and hasattr(_prompt, "__dict__"):
extra_body["metadata"]["prompt"] = _prompt.__dict__
return extra_body
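
A minimal sketch of the behaviour, assuming the helper above is in scope; FakeTextPromptClient is a made-up stand-in for a Langfuse TextPromptClient (any object with a __dict__ works):

import json


class FakeTextPromptClient:  # hypothetical stand-in for Langfuse's TextPromptClient
    def __init__(self):
        self.name = "my-prompt"
        self.version = 3


extra_body = {"metadata": {"prompt": FakeTextPromptClient()}}
safe_body = _ensure_extra_body_is_safe(extra_body=extra_body)

# the prompt object is now a plain dict, so the request body is JSON serializable
print(json.dumps(safe_body))  # {"metadata": {"prompt": {"name": "my-prompt", "version": 3}}}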

View file

@ -0,0 +1,71 @@
# +-----------------------------------------------+
# | |
# | Give Feedback / Get Help |
# | https://github.com/BerriAI/litellm/issues/new |
# | |
# +-----------------------------------------------+
#
# Thank you users! We ❤️ you! - Krrish & Ishaan
import copy
from typing import TYPE_CHECKING, Any
import litellm
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import (
Logging as _LiteLLMLoggingObject,
)
LiteLLMLoggingObject = _LiteLLMLoggingObject
else:
LiteLLMLoggingObject = Any
def redact_message_input_output_from_logging(
litellm_logging_obj: LiteLLMLoggingObject, result
):
"""
Removes messages, prompts, input, and response from logging. This modifies the data in-place.
Only redacts when litellm.turn_off_message_logging == True.
"""
# check if user opted out of logging message/response to callbacks
if litellm.turn_off_message_logging is not True:
return result
# remove messages, prompts, input, response from logging
litellm_logging_obj.model_call_details["messages"] = [
{"role": "user", "content": "redacted-by-litellm"}
]
litellm_logging_obj.model_call_details["prompt"] = ""
litellm_logging_obj.model_call_details["input"] = ""
# response cleaning
# ChatCompletion Responses
if (
litellm_logging_obj.stream is True
and "complete_streaming_response" in litellm_logging_obj.model_call_details
):
_streaming_response = litellm_logging_obj.model_call_details[
"complete_streaming_response"
]
for choice in _streaming_response.choices:
if isinstance(choice, litellm.Choices):
choice.message.content = "redacted-by-litellm"
elif isinstance(choice, litellm.utils.StreamingChoices):
choice.delta.content = "redacted-by-litellm"
else:
if result is not None:
if isinstance(result, litellm.ModelResponse):
# only deep copy litellm.ModelResponse
_result = copy.deepcopy(result)
if hasattr(_result, "choices") and _result.choices is not None:
for choice in _result.choices:
if isinstance(choice, litellm.Choices):
choice.message.content = "redacted-by-litellm"
elif isinstance(choice, litellm.utils.StreamingChoices):
choice.delta.content = "redacted-by-litellm"
return _result
# by default return result
return result
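
The helper only acts when message logging is switched off; a minimal sketch of the flag it checks:

import litellm

# opt out of logging message/response content to callbacks; with this set,
# the helper above swaps real content for the "redacted-by-litellm" placeholder
litellm.turn_off_message_logging = True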

View file

@ -5,10 +5,16 @@ import requests, copy # type: ignore
import time
from functools import partial
from typing import Callable, Optional, List, Union
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
import litellm.litellm_core_utils
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
_get_async_httpx_client,
_get_httpx_client,
)
from .base import BaseLLM
import httpx # type: ignore
from litellm.types.llms.anthropic import AnthropicMessagesToolChoice
@ -171,7 +177,7 @@ async def make_call(
logging_obj,
):
if client is None:
client = AsyncHTTPHandler() # Create a new client if none provided
client = _get_async_httpx_client() # Create a new client if none provided
response = await client.post(api_base, headers=headers, data=data, stream=True)
@ -201,7 +207,7 @@ class AnthropicChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
@ -316,7 +322,7 @@ class AnthropicChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
@ -463,9 +469,7 @@ class AnthropicChatCompletion(BaseLLM):
logger_fn=None,
headers={},
) -> Union[ModelResponse, CustomStreamWrapper]:
async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
async_handler = _get_async_httpx_client()
response = await async_handler.post(api_base, headers=headers, json=data)
if stream and _is_function_call:
return self.process_streaming_response(

View file

@ -1,42 +1,56 @@
from typing import Optional, Union, Any, Literal, Coroutine, Iterable
from typing_extensions import overload
import types, requests
from .base import BaseLLM
from litellm.utils import (
ModelResponse,
Choices,
Message,
CustomStreamWrapper,
convert_to_model_response_object,
TranscriptionResponse,
get_secret,
UnsupportedParamsError,
)
from typing import Callable, Optional, BinaryIO, List
from litellm import OpenAIConfig
import litellm, json
import httpx # type: ignore
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
from openai import AzureOpenAI, AsyncAzureOpenAI
import uuid
import asyncio
import json
import os
import types
import uuid
from typing import (
Any,
BinaryIO,
Callable,
Coroutine,
Iterable,
List,
Literal,
Optional,
Union,
)
import httpx # type: ignore
import requests
from openai import AsyncAzureOpenAI, AzureOpenAI
from typing_extensions import overload
import litellm
from litellm import OpenAIConfig
from litellm.caching import DualCache
from litellm.utils import (
Choices,
CustomStreamWrapper,
Message,
ModelResponse,
TranscriptionResponse,
UnsupportedParamsError,
convert_to_model_response_object,
get_secret,
)
from ..types.llms.openai import (
AsyncCursorPage,
AssistantToolParam,
SyncCursorPage,
Assistant,
MessageData,
OpenAIMessage,
OpenAICreateThreadParamsMessage,
Thread,
AssistantToolParam,
Run,
AssistantEventHandler,
AssistantStreamManager,
AssistantToolParam,
AsyncAssistantEventHandler,
AsyncAssistantStreamManager,
AssistantStreamManager,
AsyncCursorPage,
MessageData,
OpenAICreateThreadParamsMessage,
OpenAIMessage,
Run,
SyncCursorPage,
Thread,
)
from litellm.caching import DualCache
from .base import BaseLLM
from .custom_httpx.azure_dall_e_2 import AsyncCustomHTTPTransport, CustomHTTPTransport
azure_ad_cache = DualCache()
@ -313,7 +327,9 @@ def select_azure_base_url_or_endpoint(azure_client_params: dict):
def get_azure_ad_token_from_oidc(azure_ad_token: str):
azure_client_id = os.getenv("AZURE_CLIENT_ID", None)
azure_tenant_id = os.getenv("AZURE_TENANT_ID", None)
azure_authority_host = os.getenv("AZURE_AUTHORITY_HOST", "https://login.microsoftonline.com")
azure_authority_host = os.getenv(
"AZURE_AUTHORITY_HOST", "https://login.microsoftonline.com"
)
if azure_client_id is None or azure_tenant_id is None:
raise AzureOpenAIError(
@ -329,12 +345,14 @@ def get_azure_ad_token_from_oidc(azure_ad_token: str):
message="OIDC token could not be retrieved from secret manager.",
)
azure_ad_token_cache_key = json.dumps({
"azure_client_id": azure_client_id,
"azure_tenant_id": azure_tenant_id,
"azure_authority_host": azure_authority_host,
"oidc_token": oidc_token,
})
azure_ad_token_cache_key = json.dumps(
{
"azure_client_id": azure_client_id,
"azure_tenant_id": azure_tenant_id,
"azure_authority_host": azure_authority_host,
"oidc_token": oidc_token,
}
)
azure_ad_token_access_token = azure_ad_cache.get_cache(azure_ad_token_cache_key)
if azure_ad_token_access_token is not None:
@ -371,7 +389,11 @@ def get_azure_ad_token_from_oidc(azure_ad_token: str):
status_code=422, message="Azure AD Token expires_in not returned"
)
azure_ad_cache.set_cache(key=azure_ad_token_cache_key, value=azure_ad_token_access_token, ttl=azure_ad_token_expires_in)
azure_ad_cache.set_cache(
key=azure_ad_token_cache_key,
value=azure_ad_token_access_token,
ttl=azure_ad_token_expires_in,
)
return azure_ad_token_access_token
@ -645,6 +667,8 @@ class AzureChatCompletion(BaseLLM):
except AzureOpenAIError as e:
exception_mapping_worked = True
raise e
except asyncio.CancelledError as e:
raise AzureOpenAIError(status_code=500, message=str(e))
except Exception as e:
if hasattr(e, "status_code"):
raise e

View file

@ -2,7 +2,7 @@
import litellm
import httpx, requests
from typing import Optional, Union
from litellm.utils import Logging
from litellm.litellm_core_utils.litellm_logging import Logging
class BaseLLM:
@ -27,6 +27,25 @@ class BaseLLM:
"""
return model_response
def process_text_completion_response(
self,
model: str,
response: Union[requests.Response, httpx.Response],
model_response: litellm.utils.TextCompletionResponse,
stream: bool,
logging_obj: Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
messages: list,
print_verbose,
encoding,
) -> Union[litellm.utils.TextCompletionResponse, litellm.utils.CustomStreamWrapper]:
"""
Helper function to process the response across sync + async completion calls
"""
return model_response
def create_client_session(self):
if litellm.client_session:
_client_session = litellm.client_session

View file

@ -1,25 +1,27 @@
import json, copy, types
import copy
import json
import os
import time
import types
import uuid
from enum import Enum
import time, uuid
from typing import Callable, Optional, Any, Union, List
from typing import Any, Callable, List, Optional, Union
import httpx
import litellm
from litellm.utils import (
ModelResponse,
get_secret,
Usage,
ImageResponse,
map_finish_reason,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.types.utils import ImageResponse, ModelResponse, Usage
from litellm.utils import get_secret
from .prompt_templates.factory import (
prompt_factory,
custom_prompt,
construct_tool_use_system_prompt,
contains_tag,
custom_prompt,
extract_between_tags,
parse_xml_params,
contains_tag,
prompt_factory,
)
import httpx
class BedrockError(Exception):
@ -633,7 +635,11 @@ def init_bedrock_client(
config = boto3.session.Config()
### CHECK STS ###
if aws_web_identity_token is not None and aws_role_name is not None and aws_session_name is not None:
if (
aws_web_identity_token is not None
and aws_role_name is not None
and aws_session_name is not None
):
oidc_token = get_secret(aws_web_identity_token)
if oidc_token is None:
@ -642,9 +648,7 @@ def init_bedrock_client(
status_code=401,
)
sts_client = boto3.client(
"sts"
)
sts_client = boto3.client("sts")
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sts/client/assume_role_with_web_identity.html
@ -726,38 +730,31 @@ def init_bedrock_client(
def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
# handle anthropic prompts and amazon titan prompts
if provider == "anthropic" or provider == "amazon":
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
chat_template_provider = ["anthropic", "amazon", "mistral", "meta"]
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
if provider in chat_template_provider:
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
elif provider == "mistral":
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
elif provider == "meta":
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
else:
prompt = ""
for message in messages:
if "role" in message:
if message["role"] == "user":
prompt += f"{message['content']}"
else:
prompt = ""
for message in messages:
if "role" in message:
if message["role"] == "user":
prompt += f"{message['content']}"
else:
prompt += f"{message['content']}"
else:
prompt += f"{message['content']}"
else:
prompt += f"{message['content']}"
return prompt

View file

@ -22,13 +22,12 @@ from typing import (
from litellm.utils import (
ModelResponse,
Usage,
map_finish_reason,
CustomStreamWrapper,
Message,
Choices,
get_secret,
Logging,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.litellm_core_utils.litellm_logging import Logging
from litellm.types.utils import Message, Choices
import litellm, uuid
from .prompt_templates.factory import (
prompt_factory,
@ -41,7 +40,12 @@ from .prompt_templates.factory import (
_bedrock_converse_messages_pt,
_bedrock_tools_pt,
)
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
_get_async_httpx_client,
_get_httpx_client,
)
from .base import BaseLLM
import httpx # type: ignore
from .bedrock import BedrockError, convert_messages_to_prompt, ModelResponseIterator
@ -57,6 +61,7 @@ from litellm.caching import DualCache
iam_cache = DualCache()
class AmazonCohereChatConfig:
"""
Reference - https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-cohere-command-r-plus.html
@ -167,7 +172,7 @@ async def make_call(
logging_obj,
):
if client is None:
client = AsyncHTTPHandler() # Create a new client if none provided
client = _get_async_httpx_client() # Create a new client if none provided
response = await client.post(api_base, headers=headers, data=data, stream=True)
@ -198,7 +203,7 @@ def make_sync_call(
logging_obj,
):
if client is None:
client = HTTPHandler() # Create a new client if none provided
client = _get_httpx_client() # Create a new client if none provided
response = client.post(api_base, headers=headers, data=data, stream=True)
@ -327,13 +332,19 @@ class BedrockLLM(BaseLLM):
) = params_to_check
### CHECK STS ###
if aws_web_identity_token is not None and aws_role_name is not None and aws_session_name is not None:
iam_creds_cache_key = json.dumps({
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
})
if (
aws_web_identity_token is not None
and aws_role_name is not None
and aws_session_name is not None
):
iam_creds_cache_key = json.dumps(
{
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
}
)
iam_creds_dict = iam_cache.get_cache(iam_creds_cache_key)
if iam_creds_dict is None:
@ -348,7 +359,7 @@ class BedrockLLM(BaseLLM):
sts_client = boto3.client(
"sts",
region_name=aws_region_name,
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com"
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com",
)
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
@ -362,12 +373,18 @@ class BedrockLLM(BaseLLM):
iam_creds_dict = {
"aws_access_key_id": sts_response["Credentials"]["AccessKeyId"],
"aws_secret_access_key": sts_response["Credentials"]["SecretAccessKey"],
"aws_secret_access_key": sts_response["Credentials"][
"SecretAccessKey"
],
"aws_session_token": sts_response["Credentials"]["SessionToken"],
"region_name": aws_region_name,
}
iam_cache.set_cache(key=iam_creds_cache_key, value=json.dumps(iam_creds_dict), ttl=3600 - 60)
iam_cache.set_cache(
key=iam_creds_cache_key,
value=json.dumps(iam_creds_dict),
ttl=3600 - 60,
)
session = boto3.Session(**iam_creds_dict)
@ -976,7 +993,7 @@ class BedrockLLM(BaseLLM):
if isinstance(timeout, float) or isinstance(timeout, int):
timeout = httpx.Timeout(timeout)
_params["timeout"] = timeout
self.client = HTTPHandler(**_params) # type: ignore
self.client = _get_httpx_client(_params) # type: ignore
else:
self.client = client
if (stream is not None and stream == True) and provider != "ai21":
@ -1058,7 +1075,7 @@ class BedrockLLM(BaseLLM):
if isinstance(timeout, float) or isinstance(timeout, int):
timeout = httpx.Timeout(timeout)
_params["timeout"] = timeout
client = AsyncHTTPHandler(**_params) # type: ignore
client = _get_async_httpx_client(_params) # type: ignore
else:
client = client # type: ignore
@ -1433,13 +1450,19 @@ class BedrockConverseLLM(BaseLLM):
) = params_to_check
### CHECK STS ###
if aws_web_identity_token is not None and aws_role_name is not None and aws_session_name is not None:
iam_creds_cache_key = json.dumps({
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
})
if (
aws_web_identity_token is not None
and aws_role_name is not None
and aws_session_name is not None
):
iam_creds_cache_key = json.dumps(
{
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
}
)
iam_creds_dict = iam_cache.get_cache(iam_creds_cache_key)
if iam_creds_dict is None:
@ -1454,7 +1477,7 @@ class BedrockConverseLLM(BaseLLM):
sts_client = boto3.client(
"sts",
region_name=aws_region_name,
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com"
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com",
)
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
@ -1468,12 +1491,18 @@ class BedrockConverseLLM(BaseLLM):
iam_creds_dict = {
"aws_access_key_id": sts_response["Credentials"]["AccessKeyId"],
"aws_secret_access_key": sts_response["Credentials"]["SecretAccessKey"],
"aws_secret_access_key": sts_response["Credentials"][
"SecretAccessKey"
],
"aws_session_token": sts_response["Credentials"]["SessionToken"],
"region_name": aws_region_name,
}
iam_cache.set_cache(key=iam_creds_cache_key, value=json.dumps(iam_creds_dict), ttl=3600 - 60)
iam_cache.set_cache(
key=iam_creds_cache_key,
value=json.dumps(iam_creds_dict),
ttl=3600 - 60,
)
session = boto3.Session(**iam_creds_dict)
@ -1575,7 +1604,7 @@ class BedrockConverseLLM(BaseLLM):
if isinstance(timeout, float) or isinstance(timeout, int):
timeout = httpx.Timeout(timeout)
_params["timeout"] = timeout
client = AsyncHTTPHandler(**_params) # type: ignore
client = _get_async_httpx_client(_params) # type: ignore
else:
client = client # type: ignore
@ -1847,7 +1876,7 @@ class BedrockConverseLLM(BaseLLM):
if isinstance(timeout, float) or isinstance(timeout, int):
timeout = httpx.Timeout(timeout)
_params["timeout"] = timeout
client = HTTPHandler(**_params) # type: ignore
client = _get_httpx_client(_params) # type: ignore
else:
client = client
try:

View file

@ -12,6 +12,15 @@ class AsyncHTTPHandler:
timeout: Optional[Union[float, httpx.Timeout]] = None,
concurrent_limit=1000,
):
self.timeout = timeout
self.client = self.create_client(
timeout=timeout, concurrent_limit=concurrent_limit
)
def create_client(
self, timeout: Optional[Union[float, httpx.Timeout]], concurrent_limit: int
) -> httpx.AsyncClient:
async_proxy_mounts = None
# Check if the HTTP_PROXY and HTTPS_PROXY environment variables are set and use them accordingly.
http_proxy = os.getenv("HTTP_PROXY", None)
@ -39,7 +48,8 @@ class AsyncHTTPHandler:
if timeout is None:
timeout = _DEFAULT_TIMEOUT
# Create a client with a connection pool
self.client = httpx.AsyncClient(
return httpx.AsyncClient(
timeout=timeout,
limits=httpx.Limits(
max_connections=concurrent_limit,
@ -83,11 +93,48 @@ class AsyncHTTPHandler:
response = await self.client.send(req, stream=stream)
response.raise_for_status()
return response
except httpx.RemoteProtocolError:
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=self.timeout, concurrent_limit=1)
try:
return await self.single_connection_post_request(
url=url,
client=new_client,
data=data,
json=json,
params=params,
headers=headers,
stream=stream,
)
finally:
await new_client.aclose()
except httpx.HTTPStatusError as e:
raise e
except Exception as e:
raise e
async def single_connection_post_request(
self,
url: str,
client: httpx.AsyncClient,
data: Optional[Union[dict, str]] = None, # type: ignore
json: Optional[dict] = None,
params: Optional[dict] = None,
headers: Optional[dict] = None,
stream: bool = False,
):
"""
Makes a POST request using a single-connection client.
Used to retry requests after connection-level errors.
"""
req = client.build_request(
"POST", url, data=data, json=json, params=params, headers=headers # type: ignore
)
response = await client.send(req, stream=stream)
response.raise_for_status()
return response
def __del__(self) -> None:
try:
asyncio.get_running_loop().create_task(self.close())
@ -172,3 +219,60 @@ class HTTPHandler:
self.close()
except Exception:
pass
def _get_async_httpx_client(params: Optional[dict] = None) -> AsyncHTTPHandler:
"""
Retrieves the async HTTP client from the cache
If not present, creates a new client
Caches the new client and returns it.
"""
_params_key_name = ""
if params is not None:
for key, value in params.items():
try:
_params_key_name += f"{key}_{value}"
except Exception:
pass
_cache_key_name = "async_httpx_client" + _params_key_name
if _cache_key_name in litellm.in_memory_llm_clients_cache:
return litellm.in_memory_llm_clients_cache[_cache_key_name]
if params is not None:
_new_client = AsyncHTTPHandler(**params)
else:
_new_client = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
litellm.in_memory_llm_clients_cache[_cache_key_name] = _new_client
return _new_client
def _get_httpx_client(params: Optional[dict] = None) -> HTTPHandler:
"""
Retrieves the HTTP client from the cache
If not present, creates a new client
Caches the new client and returns it.
"""
_params_key_name = ""
if params is not None:
for key, value in params.items():
try:
_params_key_name += f"{key}_{value}"
except Exception:
pass
_cache_key_name = "httpx_client" + _params_key_name
if _cache_key_name in litellm.in_memory_llm_clients_cache:
return litellm.in_memory_llm_clients_cache[_cache_key_name]
if params is not None:
_new_client = HTTPHandler(**params)
else:
_new_client = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
litellm.in_memory_llm_clients_cache[_cache_key_name] = _new_client
return _new_client
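
A minimal sketch of how the cached-client helpers behave (the import path matches the call sites in this commit); the 30s timeout is just an example value:

import httpx

from litellm.llms.custom_httpx.http_handler import _get_httpx_client

# same params -> same cached client instance (the cache key is derived from the params)
client_a = _get_httpx_client()
client_b = _get_httpx_client()
assert client_a is client_b

# a different timeout produces a different cache key, hence a new client
client_c = _get_httpx_client({"timeout": httpx.Timeout(timeout=30.0, connect=5.0)})
assert client_c is not client_a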

View file

@ -10,10 +10,10 @@ from typing import Callable, Optional, List, Union, Tuple, Literal
from litellm.utils import (
ModelResponse,
Usage,
map_finish_reason,
CustomStreamWrapper,
EmbeddingResponse,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
@ -289,7 +289,7 @@ class DatabricksChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],

View file

@ -1,14 +1,22 @@
import types
import traceback
####################################
######### DEPRECATED FILE ##########
####################################
# logic moved to `vertex_httpx.py` #
import copy
import time
import traceback
import types
from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm
import httpx
from .prompt_templates.factory import prompt_factory, custom_prompt, get_system_prompt
from packaging.version import Version
import litellm
from litellm import verbose_logger
from litellm.utils import Choices, Message, ModelResponse, Usage
from .prompt_templates.factory import custom_prompt, get_system_prompt, prompt_factory
class GeminiError(Exception):
@ -186,8 +194,8 @@ def completion(
if _system_instruction and len(system_prompt) > 0:
_params["system_instruction"] = system_prompt
_model = genai.GenerativeModel(**_params)
if stream == True:
if acompletion == True:
if stream is True:
if acompletion is True:
async def async_streaming():
try:

View file

@ -1,33 +1,41 @@
import hashlib
import json
import time
import traceback
import types
from typing import (
Optional,
Union,
Any,
BinaryIO,
Literal,
Callable,
Coroutine,
Iterable,
Literal,
Optional,
Union,
)
import hashlib
from typing_extensions import override, overload
from pydantic import BaseModel
import types, time, json, traceback
import httpx
from .base import BaseLLM
from litellm.utils import (
ModelResponse,
Choices,
Message,
CustomStreamWrapper,
convert_to_model_response_object,
Usage,
TranscriptionResponse,
TextCompletionResponse,
)
from typing import Callable, Optional, Coroutine
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from openai import OpenAI, AsyncOpenAI
from ..types.llms.openai import *
import openai
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel
from typing_extensions import overload, override
import litellm
from litellm.types.utils import ProviderField
from litellm.utils import (
Choices,
CustomStreamWrapper,
Message,
ModelResponse,
TextCompletionResponse,
TranscriptionResponse,
Usage,
convert_to_model_response_object,
)
from ..types.llms.openai import *
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
class OpenAIError(Exception):
@ -207,6 +215,25 @@ class MistralEmbeddingConfig:
return optional_params
class AzureAIStudioConfig:
def get_required_params(self) -> List[ProviderField]:
"""For a given provider, return it's required fields with a description"""
return [
ProviderField(
field_name="api_key",
field_type="string",
field_description="Your Azure AI Studio API Key.",
field_value="zEJ...",
),
ProviderField(
field_name="api_base",
field_type="string",
field_description="Your Azure AI Studio API Base.",
field_value="https://Mistral-serverless.",
),
]
class DeepInfraConfig:
"""
Reference: https://deepinfra.com/docs/advanced/openai_api
@ -286,8 +313,12 @@ class DeepInfraConfig:
]
def map_openai_params(
self, non_default_params: dict, optional_params: dict, model: str
):
self,
non_default_params: dict,
optional_params: dict,
model: str,
drop_params: bool,
) -> dict:
supported_openai_params = self.get_supported_openai_params()
for param, value in non_default_params.items():
if (
@ -296,8 +327,23 @@ class DeepInfraConfig:
and model == "mistralai/Mistral-7B-Instruct-v0.1"
): # this model does no support temperature == 0
value = 0.0001 # close to 0
if param == "tool_choice":
if (
value != "auto" and value != "none"
): # https://deepinfra.com/docs/advanced/function_calling
## UNSUPPORTED TOOL CHOICE VALUE
if litellm.drop_params is True or drop_params is True:
value = None
else:
raise litellm.utils.UnsupportedParamsError(
message="Deepinfra doesn't support tool_choice={}. To drop unsupported openai params from the call, set `litellm.drop_params = True`".format(
value
),
status_code=400,
)
if param in supported_openai_params:
optional_params[param] = value
if value is not None:
optional_params[param] = value
return optional_params
@ -1530,6 +1576,7 @@ class OpenAITextCompletion(BaseLLM):
response = openai_client.completions.create(**data) # type: ignore
response_json = response.model_dump()
## LOGGING
logging_obj.post_call(
input=prompt,

View file

@ -12,11 +12,11 @@ from typing import Callable, Optional, List, Literal, Union
from litellm.utils import (
ModelResponse,
Usage,
map_finish_reason,
CustomStreamWrapper,
Message,
Choices,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
@ -198,7 +198,7 @@ class PredibaseChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],

View file

@ -1,24 +1,30 @@
import json
import re
import traceback
import uuid
import xml.etree.ElementTree as ET
from enum import Enum
import requests, traceback
import json, re, xml.etree.ElementTree as ET
from jinja2 import Template, exceptions, meta, BaseLoader
from jinja2.sandbox import ImmutableSandboxedEnvironment
from typing import Any, List, Mapping, MutableMapping, Optional, Sequence, Tuple
import requests
from jinja2 import BaseLoader, Template, exceptions, meta
from jinja2.sandbox import ImmutableSandboxedEnvironment
import litellm
import litellm.types
from litellm.types.completion import (
ChatCompletionUserMessageParam,
ChatCompletionSystemMessageParam,
ChatCompletionMessageParam,
ChatCompletionFunctionMessageParam,
ChatCompletionMessageToolCallParam,
ChatCompletionToolMessageParam,
)
import litellm.types.llms
from litellm.types.llms.anthropic import *
import uuid
from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
import litellm.types.llms.vertex_ai
from litellm.types.completion import (
ChatCompletionFunctionMessageParam,
ChatCompletionMessageParam,
ChatCompletionMessageToolCallParam,
ChatCompletionSystemMessageParam,
ChatCompletionToolMessageParam,
ChatCompletionUserMessageParam,
)
from litellm.types.llms.anthropic import *
from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
from litellm.types.utils import GenericImageParsingChunk
def default_pt(messages):
@ -622,9 +628,10 @@ def construct_tool_use_system_prompt(
def convert_url_to_base64(url):
import requests
import base64
import requests
for _ in range(3):
try:
response = requests.get(url)
@ -654,7 +661,7 @@ def convert_url_to_base64(url):
raise Exception(f"Error: Unable to fetch image from URL. url={url}")
def convert_to_anthropic_image_obj(openai_image_url: str):
def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsingChunk:
"""
Input:
"image_url": "data:image/jpeg;base64,{base64_image}",
@ -675,11 +682,11 @@ def convert_to_anthropic_image_obj(openai_image_url: str):
# Infer image format from the URL
image_format = openai_image_url.split("data:image/")[1].split(";base64,")[0]
return {
"type": "base64",
"media_type": f"image/{image_format}",
"data": base64_data,
}
return GenericImageParsingChunk(
type="base64",
media_type=f"image/{image_format}",
data=base64_data,
)
except Exception as e:
if "Error: Unable to fetch image from URL" in str(e):
raise e
@ -1606,19 +1613,23 @@ def azure_text_pt(messages: list):
###### AMAZON BEDROCK #######
from litellm.types.llms.bedrock import ContentBlock as BedrockContentBlock
from litellm.types.llms.bedrock import ImageBlock as BedrockImageBlock
from litellm.types.llms.bedrock import ImageSourceBlock as BedrockImageSourceBlock
from litellm.types.llms.bedrock import ToolBlock as BedrockToolBlock
from litellm.types.llms.bedrock import (
ToolResultContentBlock as BedrockToolResultContentBlock,
ToolResultBlock as BedrockToolResultBlock,
ToolConfigBlock as BedrockToolConfigBlock,
ToolUseBlock as BedrockToolUseBlock,
ImageSourceBlock as BedrockImageSourceBlock,
ImageBlock as BedrockImageBlock,
ContentBlock as BedrockContentBlock,
ToolInputSchemaBlock as BedrockToolInputSchemaBlock,
ToolSpecBlock as BedrockToolSpecBlock,
ToolBlock as BedrockToolBlock,
ToolChoiceValuesBlock as BedrockToolChoiceValuesBlock,
)
from litellm.types.llms.bedrock import ToolConfigBlock as BedrockToolConfigBlock
from litellm.types.llms.bedrock import (
ToolInputSchemaBlock as BedrockToolInputSchemaBlock,
)
from litellm.types.llms.bedrock import ToolResultBlock as BedrockToolResultBlock
from litellm.types.llms.bedrock import (
ToolResultContentBlock as BedrockToolResultContentBlock,
)
from litellm.types.llms.bedrock import ToolSpecBlock as BedrockToolSpecBlock
from litellm.types.llms.bedrock import ToolUseBlock as BedrockToolUseBlock
def get_image_details(image_url) -> Tuple[str, str]:
@ -1655,7 +1666,8 @@ def get_image_details(image_url) -> Tuple[str, str]:
def _process_bedrock_converse_image_block(image_url: str) -> BedrockImageBlock:
if "base64" in image_url:
# Case 1: Images with base64 encoding
import base64, re
import base64
import re
# base 64 is passed as data:image/jpeg;base64,<base-64-encoded-image>
image_metadata, img_without_base_64 = image_url.split(",")

View file

@ -0,0 +1,532 @@
# What is this?
## Controller file for TextCompletionCodestral Integration - https://codestral.com/
from functools import partial
import os, types
import traceback
import json
from enum import Enum
import requests, copy # type: ignore
import time
from typing import Callable, Optional, List, Literal, Union
from litellm.utils import (
TextCompletionResponse,
Usage,
CustomStreamWrapper,
Message,
Choices,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.types.llms.databricks import GenericStreamingChunk
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from .base import BaseLLM
import httpx # type: ignore
class TextCompletionCodestralError(Exception):
def __init__(
self,
status_code,
message,
request: Optional[httpx.Request] = None,
response: Optional[httpx.Response] = None,
):
self.status_code = status_code
self.message = message
if request is not None:
self.request = request
else:
self.request = httpx.Request(
method="POST",
url="https://docs.codestral.com/user-guide/inference/rest_api",
)
if response is not None:
self.response = response
else:
self.response = httpx.Response(
status_code=status_code, request=self.request
)
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
async def make_call(
client: AsyncHTTPHandler,
api_base: str,
headers: dict,
data: str,
model: str,
messages: list,
logging_obj,
):
response = await client.post(api_base, headers=headers, data=data, stream=True)
if response.status_code != 200:
raise TextCompletionCodestralError(
status_code=response.status_code, message=response.text
)
completion_stream = response.aiter_lines()
# LOGGING
logging_obj.post_call(
input=messages,
api_key="",
original_response=completion_stream, # Pass the completion stream for logging
additional_args={"complete_input_dict": data},
)
return completion_stream
class MistralTextCompletionConfig:
"""
Reference: https://docs.mistral.ai/api/#operation/createFIMCompletion
"""
suffix: Optional[str] = None
temperature: Optional[int] = None
top_p: Optional[float] = None
max_tokens: Optional[int] = None
min_tokens: Optional[int] = None
stream: Optional[bool] = None
random_seed: Optional[int] = None
stop: Optional[str] = None
def __init__(
self,
suffix: Optional[str] = None,
temperature: Optional[int] = None,
top_p: Optional[float] = None,
max_tokens: Optional[int] = None,
min_tokens: Optional[int] = None,
stream: Optional[bool] = None,
random_seed: Optional[int] = None,
stop: Optional[str] = None,
) -> None:
locals_ = locals().copy()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
def get_supported_openai_params(self):
return [
"suffix",
"temperature",
"top_p",
"max_tokens",
"stream",
"seed",
"stop",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "suffix":
optional_params["suffix"] = value
if param == "temperature":
optional_params["temperature"] = value
if param == "top_p":
optional_params["top_p"] = value
if param == "max_tokens":
optional_params["max_tokens"] = value
if param == "stream" and value == True:
optional_params["stream"] = value
if param == "stop":
optional_params["stop"] = value
if param == "seed":
optional_params["random_seed"] = value
if param == "min_tokens":
optional_params["min_tokens"] = value
return optional_params
def _chunk_parser(self, chunk_data: str) -> GenericStreamingChunk:
text = ""
is_finished = False
finish_reason = None
logprobs = None
chunk_data = chunk_data.replace("data:", "")
chunk_data = chunk_data.strip()
if len(chunk_data) == 0 or chunk_data == "[DONE]":
return {
"text": "",
"is_finished": is_finished,
"finish_reason": finish_reason,
}
chunk_data_dict = json.loads(chunk_data)
original_chunk = litellm.ModelResponse(**chunk_data_dict, stream=True)
_choices = chunk_data_dict.get("choices", []) or []
_choice = _choices[0]
text = _choice.get("delta", {}).get("content", "")
if _choice.get("finish_reason") is not None:
is_finished = True
finish_reason = _choice.get("finish_reason")
logprobs = _choice.get("logprobs")
return GenericStreamingChunk(
text=text,
original_chunk=original_chunk,
is_finished=is_finished,
finish_reason=finish_reason,
logprobs=logprobs,
)
class CodestralTextCompletion(BaseLLM):
def __init__(self) -> None:
super().__init__()
def _validate_environment(
self,
api_key: Optional[str],
user_headers: dict,
) -> dict:
if api_key is None:
raise ValueError(
"Missing CODESTRAL_API_Key - Please add CODESTRAL_API_Key to your environment variables"
)
headers = {
"content-type": "application/json",
"Authorization": "Bearer {}".format(api_key),
}
if user_headers is not None and isinstance(user_headers, dict):
headers = {**headers, **user_headers}
return headers
def output_parser(self, generated_text: str):
"""
Parse the output text to remove any special characters. In our current approach we just check for ChatML tokens.
Initial issue that prompted this - https://github.com/BerriAI/litellm/issues/763
"""
chat_template_tokens = [
"<|assistant|>",
"<|system|>",
"<|user|>",
"<s>",
"</s>",
]
for token in chat_template_tokens:
if generated_text.strip().startswith(token):
generated_text = generated_text.replace(token, "", 1)
if generated_text.endswith(token):
generated_text = generated_text[::-1].replace(token[::-1], "", 1)[::-1]
return generated_text
def process_text_completion_response(
self,
model: str,
response: Union[requests.Response, httpx.Response],
model_response: TextCompletionResponse,
stream: bool,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
messages: list,
print_verbose,
encoding,
) -> TextCompletionResponse:
## LOGGING
logging_obj.post_call(
input=messages,
api_key=api_key,
original_response=response.text,
additional_args={"complete_input_dict": data},
)
print_verbose(f"codestral api: raw model_response: {response.text}")
## RESPONSE OBJECT
if response.status_code != 200:
raise TextCompletionCodestralError(
message=str(response.text),
status_code=response.status_code,
)
try:
completion_response = response.json()
except:
raise TextCompletionCodestralError(message=response.text, status_code=422)
_original_choices = completion_response.get("choices", [])
_choices: List[litellm.utils.TextChoices] = []
for choice in _original_choices:
# This is what 1 choice looks like from codestral API
# {
# "index": 0,
# "message": {
# "role": "assistant",
# "content": "\n assert is_odd(1)\n assert",
# "tool_calls": null
# },
# "finish_reason": "length",
# "logprobs": null
# }
_finish_reason = None
_index = 0
_text = None
_logprobs = None
_choice_message = choice.get("message", {})
_choice = litellm.utils.TextChoices(
finish_reason=choice.get("finish_reason"),
index=choice.get("index"),
text=_choice_message.get("content"),
logprobs=choice.get("logprobs"),
)
_choices.append(_choice)
_response = litellm.TextCompletionResponse(
id=completion_response.get("id"),
choices=_choices,
created=completion_response.get("created"),
model=completion_response.get("model"),
usage=completion_response.get("usage"),
stream=False,
object=completion_response.get("object"),
)
return _response
def completion(
self,
model: str,
messages: list,
api_base: str,
custom_prompt_dict: dict,
model_response: TextCompletionResponse,
print_verbose: Callable,
encoding,
api_key: str,
logging_obj,
optional_params: dict,
timeout: Union[float, httpx.Timeout],
acompletion=None,
litellm_params=None,
logger_fn=None,
headers: dict = {},
) -> Union[TextCompletionResponse, CustomStreamWrapper]:
headers = self._validate_environment(api_key, headers)
completion_url = api_base or "https://codestral.mistral.ai/v1/fim/completions"
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
prompt = prompt_factory(model=model, messages=messages)
## Load Config
config = litellm.MistralTextCompletionConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
): # completion(top_k=3) > codestral_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
stream = optional_params.pop("stream", False)
data = {
"prompt": prompt,
**optional_params,
}
input_text = prompt
## LOGGING
logging_obj.pre_call(
input=input_text,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": completion_url,
"acompletion": acompletion,
},
)
## COMPLETION CALL
if acompletion is True:
### ASYNC STREAMING
if stream is True:
return self.async_streaming(
model=model,
messages=messages,
data=data,
api_base=completion_url,
model_response=model_response,
print_verbose=print_verbose,
encoding=encoding,
api_key=api_key,
logging_obj=logging_obj,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
else:
### ASYNC COMPLETION
return self.async_completion(
model=model,
messages=messages,
data=data,
api_base=completion_url,
model_response=model_response,
print_verbose=print_verbose,
encoding=encoding,
api_key=api_key,
logging_obj=logging_obj,
optional_params=optional_params,
stream=False,
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
### SYNC STREAMING
if stream is True:
response = requests.post(
completion_url,
headers=headers,
data=json.dumps(data),
stream=stream,
)
_response = CustomStreamWrapper(
response.iter_lines(),
model,
custom_llm_provider="codestral",
logging_obj=logging_obj,
)
return _response
### SYNC COMPLETION
else:
response = requests.post(
url=completion_url,
headers=headers,
data=json.dumps(data),
)
return self.process_text_completion_response(
model=model,
response=response,
model_response=model_response,
stream=optional_params.get("stream", False),
logging_obj=logging_obj, # type: ignore
optional_params=optional_params,
api_key=api_key,
data=data,
messages=messages,
print_verbose=print_verbose,
encoding=encoding,
)
async def async_completion(
self,
model: str,
messages: list,
api_base: str,
model_response: TextCompletionResponse,
print_verbose: Callable,
encoding,
api_key,
logging_obj,
stream,
data: dict,
optional_params: dict,
timeout: Union[float, httpx.Timeout],
litellm_params=None,
logger_fn=None,
headers={},
) -> TextCompletionResponse:
async_handler = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=timeout))
try:
response = await async_handler.post(
api_base, headers=headers, data=json.dumps(data)
)
except httpx.HTTPStatusError as e:
raise TextCompletionCodestralError(
status_code=e.response.status_code,
message="HTTPStatusError - {}".format(e.response.text),
)
except Exception as e:
raise TextCompletionCodestralError(
status_code=500, message="{}\n{}".format(str(e), traceback.format_exc())
)
return self.process_text_completion_response(
model=model,
response=response,
model_response=model_response,
stream=stream,
logging_obj=logging_obj,
api_key=api_key,
data=data,
messages=messages,
print_verbose=print_verbose,
optional_params=optional_params,
encoding=encoding,
)
async def async_streaming(
self,
model: str,
messages: list,
api_base: str,
model_response: TextCompletionResponse,
print_verbose: Callable,
encoding,
api_key,
logging_obj,
data: dict,
timeout: Union[float, httpx.Timeout],
optional_params=None,
litellm_params=None,
logger_fn=None,
headers={},
) -> CustomStreamWrapper:
data["stream"] = True
streamwrapper = CustomStreamWrapper(
completion_stream=None,
make_call=partial(
make_call,
api_base=api_base,
headers=headers,
data=json.dumps(data),
model=model,
messages=messages,
logging_obj=logging_obj,
),
model=model,
custom_llm_provider="text-completion-codestral",
logging_obj=logging_obj,
)
return streamwrapper
def embedding(self, *args, **kwargs):
pass
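For context, a minimal usage sketch of this new integration, assuming the "text-completion-codestral/" prefix routes to this handler (as wired up in main.py later in this diff) and using an illustrative model id; the env var name matches _validate_environment above:
import os
import litellm

os.environ["CODESTRAL_API_KEY"] = "..."  # checked by _validate_environment above

# FIM-style completion against the default endpoint
# https://codestral.mistral.ai/v1/fim/completions
response = litellm.text_completion(
    model="text-completion-codestral/codestral-latest",  # model id is illustrative
    prompt="def is_odd(n):\n",
    suffix="\n    return result",  # optional FIM suffix, mapped by MistralTextCompletionConfig
    max_tokens=64,
)
print(response.choices[0].text)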

View file

@ -4,7 +4,6 @@ from enum import Enum
import requests, copy # type: ignore
import time
from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler

View file

@ -1,17 +1,22 @@
import os, types
import inspect
import json
from enum import Enum
import requests # type: ignore
import os
import time
from typing import Callable, Optional, Union, List, Literal, Any
import types
import uuid
from enum import Enum
from typing import Any, Callable, List, Literal, Optional, Union
import httpx # type: ignore
import requests # type: ignore
from pydantic import BaseModel
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
import litellm, uuid
import httpx, inspect # type: ignore
from litellm.types.llms.vertex_ai import *
import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.prompt_templates.factory import (
convert_to_gemini_tool_call_result,
convert_to_anthropic_image_obj,
convert_to_gemini_tool_call_invoke,
convert_to_gemini_tool_call_result,
)
from litellm.types.files import (
get_file_mime_type_for_file_type,
@ -19,6 +24,8 @@ from litellm.types.files import (
is_gemini_1_5_accepted_file_type,
is_video_file_type,
)
from litellm.types.llms.vertex_ai import *
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
class VertexAIError(Exception):
@ -273,28 +280,6 @@ def _get_image_bytes_from_url(image_url: str) -> bytes:
raise Exception(f"An exception occurs with this image - {str(e)}")
def _load_image_from_url(image_url: str):
"""
Loads an image from a URL.
Args:
image_url (str): The URL of the image.
Returns:
Image: The loaded image.
"""
from vertexai.preview.generative_models import (
GenerativeModel,
Part,
GenerationConfig,
Image,
)
image_bytes = _get_image_bytes_from_url(image_url)
return Image.from_bytes(data=image_bytes)
def _convert_gemini_role(role: str) -> Literal["user", "model"]:
if role == "user":
return "user"
@ -322,28 +307,9 @@ def _process_gemini_image(image_url: str) -> PartType:
return PartType(file_data=file_data)
# Direct links
elif "https:/" in image_url:
image = _load_image_from_url(image_url)
_blob = BlobType(data=image.data, mime_type=image._mime_type)
return PartType(inline_data=_blob)
# Base64 encoding
elif "base64" in image_url:
import base64, re
# base 64 is passed as data:image/jpeg;base64,<base-64-encoded-image>
image_metadata, img_without_base_64 = image_url.split(",")
# read mime_type from img_without_base_64=data:image/jpeg;base64
# Extract MIME type using regular expression
mime_type_match = re.match(r"data:(.*?);base64", image_metadata)
if mime_type_match:
mime_type = mime_type_match.group(1)
else:
mime_type = "image/jpeg"
decoded_img = base64.b64decode(img_without_base_64)
_blob = BlobType(data=decoded_img, mime_type=mime_type)
elif "https:/" in image_url or "base64" in image_url:
image = convert_to_anthropic_image_obj(image_url)
_blob = BlobType(data=image["data"], mime_type=image["media_type"])
return PartType(inline_data=_blob)
raise Exception("Invalid image received - {}".format(image_url))
except Exception as e:
@ -371,7 +337,7 @@ def _gemini_convert_messages_with_history(messages: list) -> List[ContentType]:
_parts: List[PartType] = []
for element in messages[msg_i]["content"]:
if isinstance(element, dict):
if element["type"] == "text":
if element["type"] == "text" and len(element["text"]) > 0:
_part = PartType(text=element["text"])
_parts.append(_part)
elif element["type"] == "image_url":
@ -379,7 +345,10 @@ def _gemini_convert_messages_with_history(messages: list) -> List[ContentType]:
_part = _process_gemini_image(image_url=image_url)
_parts.append(_part) # type: ignore
user_content.extend(_parts)
else:
elif (
isinstance(messages[msg_i]["content"], str)
and len(messages[msg_i]["content"]) > 0
):
_part = PartType(text=messages[msg_i]["content"])
user_content.append(_part)
@ -479,23 +448,25 @@ def completion(
message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""",
)
try:
import google.auth # type: ignore
import proto # type: ignore
from google.cloud import aiplatform # type: ignore
from google.cloud.aiplatform_v1beta1.types import (
content as gapic_content_types, # type: ignore
)
from google.protobuf import json_format # type: ignore
from google.protobuf.struct_pb2 import Value # type: ignore
from vertexai.language_models import CodeGenerationModel, TextGenerationModel
from vertexai.preview.generative_models import (
GenerationConfig,
GenerativeModel,
Part,
)
from vertexai.preview.language_models import (
ChatModel,
CodeChatModel,
InputOutputTextPair,
)
from vertexai.language_models import TextGenerationModel, CodeGenerationModel
from vertexai.preview.generative_models import (
GenerativeModel,
Part,
GenerationConfig,
)
from google.cloud import aiplatform # type: ignore
from google.protobuf import json_format # type: ignore
from google.protobuf.struct_pb2 import Value # type: ignore
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types # type: ignore
import google.auth # type: ignore
import proto # type: ignore
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
print_verbose(
@ -617,7 +588,7 @@ def completion(
llm_model = None
# NOTE: async prediction and streaming under "private" mode isn't supported by aiplatform right now
if acompletion == True:
if acompletion is True:
data = {
"llm_model": llm_model,
"mode": mode,
@ -649,7 +620,7 @@ def completion(
tools = optional_params.pop("tools", None)
content = _gemini_convert_messages_with_history(messages=messages)
stream = optional_params.pop("stream", False)
if stream == True:
if stream is True:
request_str += f"response = llm_model.generate_content({content}, generation_config=GenerationConfig(**{optional_params}), safety_settings={safety_settings}, stream={stream})\n"
logging_obj.pre_call(
input=prompt,
@ -1411,8 +1382,8 @@ def embedding(
message="vertexai import failed please run `pip install google-cloud-aiplatform`",
)
from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput
import google.auth # type: ignore
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
try:

View file

@ -6,7 +6,8 @@ from enum import Enum
import requests, copy # type: ignore
import time, uuid
from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from .prompt_templates.factory import (
@ -237,7 +238,10 @@ def completion(
if vertex_credentials is not None and isinstance(vertex_credentials, str):
import google.oauth2.service_account
json_obj = json.loads(vertex_credentials)
try:
json_obj = json.loads(vertex_credentials)
except json.JSONDecodeError:
json_obj = json.load(open(vertex_credentials))
creds = (
google.oauth2.service_account.Credentials.from_service_account_info(

File diff suppressed because it is too large

View file

@ -7,107 +7,132 @@
#
# Thank you ! We ❤️ you! - Krrish & Ishaan
import os, openai, sys, json, inspect, uuid, datetime, threading
from typing import Any, Literal, Union, BinaryIO
from typing_extensions import overload
from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars
import asyncio
import contextvars
import datetime
import inspect
import json
import os
import random
import sys
import threading
import time
import traceback
import uuid
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
from functools import partial
from typing import (
Any,
BinaryIO,
Callable,
Dict,
List,
Literal,
Mapping,
Optional,
Union,
)
import dotenv
import httpx
import openai
import tiktoken
from typing_extensions import overload
import litellm
from ._logging import verbose_logger
from litellm import ( # type: ignore
Logging,
client,
exception_type,
get_optional_params,
get_litellm_params,
Logging,
get_optional_params,
)
from litellm.utils import (
get_secret,
CustomStreamWrapper,
read_config_args,
completion_with_fallbacks,
get_llm_provider,
get_api_key,
mock_completion_streaming_obj,
Usage,
async_mock_completion_streaming_obj,
completion_with_fallbacks,
convert_to_model_response_object,
token_counter,
create_pretrained_tokenizer,
create_tokenizer,
Usage,
get_api_key,
get_llm_provider,
get_optional_params_embeddings,
get_optional_params_image_gen,
get_secret,
mock_completion_streaming_obj,
read_config_args,
supports_httpx_timeout,
token_counter,
)
from ._logging import verbose_logger
from .caching import disable_cache, enable_cache, update_cache
from .llms import (
anthropic_text,
together_ai,
ai21,
sagemaker,
bedrock,
triton,
huggingface_restapi,
replicate,
aleph_alpha,
nlp_cloud,
anthropic_text,
baseten,
vllm,
ollama,
ollama_chat,
cloudflare,
bedrock,
clarifai,
cloudflare,
cohere,
cohere_chat,
petals,
gemini,
huggingface_restapi,
maritalk,
nlp_cloud,
ollama,
ollama_chat,
oobabooga,
openrouter,
palm,
gemini,
petals,
replicate,
sagemaker,
together_ai,
triton,
vertex_ai,
vertex_ai_anthropic,
maritalk,
vllm,
watsonx,
)
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
from .llms.azure import AzureChatCompletion
from .llms.databricks import DatabricksChatCompletion
from .llms.azure_text import AzureTextCompletion
from .llms.anthropic import AnthropicChatCompletion
from .llms.anthropic_text import AnthropicTextCompletion
from .llms.azure import AzureChatCompletion
from .llms.azure_text import AzureTextCompletion
from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM
from .llms.databricks import DatabricksChatCompletion
from .llms.huggingface_restapi import Huggingface
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
from .llms.predibase import PredibaseChatCompletion
from .llms.bedrock_httpx import BedrockLLM, BedrockConverseLLM
from .llms.vertex_httpx import VertexLLM
from .llms.triton import TritonChatCompletion
from .llms.prompt_templates.factory import (
prompt_factory,
custom_prompt,
function_call_prompt,
map_system_message_pt,
prompt_factory,
)
import tiktoken
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, List, Optional, Dict, Union, Mapping
from .caching import enable_cache, disable_cache, update_cache
from .llms.text_completion_codestral import CodestralTextCompletion
from .llms.triton import TritonChatCompletion
from .llms.vertex_httpx import VertexLLM
from .types.llms.openai import HttpxBinaryResponseContent
from .types.utils import ChatCompletionMessageToolCall
encoding = tiktoken.get_encoding("cl100k_base")
from litellm.utils import (
get_secret,
Choices,
CustomStreamWrapper,
TextCompletionStreamWrapper,
ModelResponse,
TextCompletionResponse,
TextChoices,
EmbeddingResponse,
ImageResponse,
read_config_args,
Choices,
Message,
ModelResponse,
TextChoices,
TextCompletionResponse,
TextCompletionStreamWrapper,
TranscriptionResponse,
get_secret,
read_config_args,
)
####### ENVIRONMENT VARIABLES ###################
@ -120,6 +145,7 @@ azure_chat_completions = AzureChatCompletion()
azure_text_completions = AzureTextCompletion()
huggingface = Huggingface()
predibase_chat_completions = PredibaseChatCompletion()
codestral_text_completions = CodestralTextCompletion()
triton_chat_completions = TritonChatCompletion()
bedrock_chat_completion = BedrockLLM()
bedrock_converse_chat_completion = BedrockConverseLLM()
@ -322,6 +348,8 @@ async def acompletion(
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
or custom_llm_provider == "codestral"
or custom_llm_provider == "text-completion-codestral"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "huggingface"
@ -329,6 +357,7 @@ async def acompletion(
or custom_llm_provider == "ollama_chat"
or custom_llm_provider == "replicate"
or custom_llm_provider == "vertex_ai"
or custom_llm_provider == "vertex_ai_beta"
or custom_llm_provider == "gemini"
or custom_llm_provider == "sagemaker"
or custom_llm_provider == "anthropic"
@ -350,9 +379,10 @@ async def acompletion(
else:
response = init_response # type: ignore
if custom_llm_provider == "text-completion-openai" and isinstance(
response, TextCompletionResponse
):
if (
custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "text-completion-codestral"
) and isinstance(response, TextCompletionResponse):
response = litellm.OpenAITextCompletionConfig().convert_to_chat_model_response_object(
response_object=response,
model_response_object=litellm.ModelResponse(),
@ -367,7 +397,9 @@ async def acompletion(
return response
except Exception as e:
verbose_logger.error(
"litellm.acompletion(): Exception occured - {}".format(str(e))
"litellm.acompletion(): Exception occured - {}\n{}".format(
str(e), traceback.format_exc()
)
)
verbose_logger.debug(traceback.format_exc())
custom_llm_provider = custom_llm_provider or "openai"
@ -397,7 +429,9 @@ def mock_completion(
messages: List,
stream: Optional[bool] = False,
mock_response: Union[str, Exception] = "This is a mock request",
mock_tool_calls: Optional[List] = None,
logging=None,
custom_llm_provider=None,
**kwargs,
):
"""
@ -435,7 +469,7 @@ def mock_completion(
raise litellm.APIError(
status_code=getattr(mock_response, "status_code", 500), # type: ignore
message=getattr(mock_response, "text", str(mock_response)),
llm_provider=getattr(mock_response, "llm_provider", "openai"), # type: ignore
llm_provider=getattr(mock_response, "llm_provider", custom_llm_provider or "openai"), # type: ignore
model=model, # type: ignore
request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
)
@ -464,6 +498,12 @@ def mock_completion(
model_response["created"] = int(time.time())
model_response["model"] = model
if mock_tool_calls:
model_response["choices"][0]["message"]["tool_calls"] = [
ChatCompletionMessageToolCall(**tool_call)
for tool_call in mock_tool_calls
]
setattr(
model_response,
"usage",
@ -577,6 +617,7 @@ def completion(
args = locals()
api_base = kwargs.get("api_base", None)
mock_response = kwargs.get("mock_response", None)
mock_tool_calls = kwargs.get("mock_tool_calls", None)
force_timeout = kwargs.get("force_timeout", 600) ## deprecated
logger_fn = kwargs.get("logger_fn", None)
verbose = kwargs.get("verbose", False)
@ -895,15 +936,17 @@ def completion(
litellm_params=litellm_params,
custom_llm_provider=custom_llm_provider,
)
if mock_response:
if mock_response or mock_tool_calls:
return mock_completion(
model,
messages,
stream=stream,
mock_response=mock_response,
mock_tool_calls=mock_tool_calls,
logging=logging,
acompletion=acompletion,
mock_delay=kwargs.get("mock_delay", None),
custom_llm_provider=custom_llm_provider,
)
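As a hedged illustration of the new mock_tool_calls kwarg handled here: the tool-call dict shape below mirrors ChatCompletionMessageToolCall (OpenAI-style id/type/function), and the function name and arguments are made up for the sketch.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "What's the weather in SF?"}],
    mock_tool_calls=[
        {
            "id": "call_abc123",  # hypothetical id
            "type": "function",
            "function": {"name": "get_weather", "arguments": '{"city": "SF"}'},
        }
    ],
)
print(response.choices[0].message.tool_calls)  # returns the mocked tool calls without hitting an API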
if custom_llm_provider == "azure":
# azure configs
@ -1035,91 +1078,6 @@ def completion(
"api_base": api_base,
},
)
elif (
model in litellm.open_ai_chat_completion_models
or custom_llm_provider == "custom_openai"
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "anyscale"
or custom_llm_provider == "mistral"
or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai"
or custom_llm_provider in litellm.openai_compatible_providers
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
): # allow user to make an openai call with a custom base
# note: if a user sets a custom base - we should ensure this works
# allow for the setting of dynamic and stateful api-bases
api_base = (
api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or get_secret("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
openai.organization = (
organization
or litellm.organization
or get_secret("OPENAI_ORGANIZATION")
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
api_key
or litellm.api_key # for deepinfra/perplexity/anyscale/friendliai we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or get_secret("OPENAI_API_KEY")
)
headers = headers or litellm.headers
## LOAD CONFIG - if set
config = litellm.OpenAIConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
): # completion(top_k=3) > openai_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
## COMPLETION CALL
try:
response = openai_chat_completions.completion(
model=model,
messages=messages,
headers=headers,
model_response=model_response,
print_verbose=print_verbose,
api_key=api_key,
api_base=api_base,
acompletion=acompletion,
logging_obj=logging,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
timeout=timeout, # type: ignore
custom_prompt_dict=custom_prompt_dict,
client=client, # pass AsyncOpenAI, OpenAI client
organization=organization,
custom_llm_provider=custom_llm_provider,
)
except Exception as e:
## LOGGING - log the original exception returned
logging.post_call(
input=messages,
api_key=api_key,
original_response=str(e),
additional_args={"headers": headers},
)
raise e
if optional_params.get("stream", False):
## LOGGING
logging.post_call(
input=messages,
api_key=api_key,
original_response=response,
additional_args={"headers": headers},
)
elif (
custom_llm_provider == "text-completion-openai"
or "ft:babbage-002" in model
@ -1203,6 +1161,93 @@ def completion(
additional_args={"headers": headers},
)
response = _response
elif (
model in litellm.open_ai_chat_completion_models
or custom_llm_provider == "custom_openai"
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
or custom_llm_provider == "codestral"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "anyscale"
or custom_llm_provider == "mistral"
or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai"
or custom_llm_provider in litellm.openai_compatible_providers
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
): # allow user to make an openai call with a custom base
# note: if a user sets a custom base - we should ensure this works
# allow for the setting of dynamic and stateful api-bases
api_base = (
api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or get_secret("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
openai.organization = (
organization
or litellm.organization
or get_secret("OPENAI_ORGANIZATION")
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
api_key
or litellm.api_key # for deepinfra/perplexity/anyscale/friendliai we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or get_secret("OPENAI_API_KEY")
)
headers = headers or litellm.headers
## LOAD CONFIG - if set
config = litellm.OpenAIConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
): # completion(top_k=3) > openai_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
## COMPLETION CALL
try:
response = openai_chat_completions.completion(
model=model,
messages=messages,
headers=headers,
model_response=model_response,
print_verbose=print_verbose,
api_key=api_key,
api_base=api_base,
acompletion=acompletion,
logging_obj=logging,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
timeout=timeout, # type: ignore
custom_prompt_dict=custom_prompt_dict,
client=client, # pass AsyncOpenAI, OpenAI client
organization=organization,
custom_llm_provider=custom_llm_provider,
)
except Exception as e:
## LOGGING - log the original exception returned
logging.post_call(
input=messages,
api_key=api_key,
original_response=str(e),
additional_args={"headers": headers},
)
raise e
if optional_params.get("stream", False):
## LOGGING
logging.post_call(
input=messages,
api_key=api_key,
original_response=response,
additional_args={"headers": headers},
)
elif (
"replicate" in model
or custom_llm_provider == "replicate"
@ -1840,7 +1885,25 @@ def completion(
)
return response
response = model_response
elif custom_llm_provider == "gemini":
elif custom_llm_provider == "vertex_ai_beta" or custom_llm_provider == "gemini":
vertex_ai_project = (
optional_params.pop("vertex_project", None)
or optional_params.pop("vertex_ai_project", None)
or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.pop("vertex_location", None)
or optional_params.pop("vertex_ai_location", None)
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
)
vertex_credentials = (
optional_params.pop("vertex_credentials", None)
or optional_params.pop("vertex_ai_credentials", None)
or get_secret("VERTEXAI_CREDENTIALS")
)
gemini_api_key = (
api_key
or get_secret("GEMINI_API_KEY")
@ -1848,34 +1911,28 @@ def completion(
or litellm.api_key
)
# palm does not support streaming as yet :(
model_response = gemini.completion(
new_params = deepcopy(optional_params)
response = vertex_chat_completion.completion( # type: ignore
model=model,
messages=messages,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
optional_params=new_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
encoding=encoding,
api_key=gemini_api_key,
vertex_location=vertex_ai_location,
vertex_project=vertex_ai_project,
vertex_credentials=vertex_credentials,
gemini_api_key=gemini_api_key,
logging_obj=logging,
acompletion=acompletion,
custom_prompt_dict=custom_prompt_dict,
timeout=timeout,
custom_llm_provider=custom_llm_provider,
client=client,
api_base=api_base,
)
if (
"stream" in optional_params
and optional_params["stream"] == True
and acompletion == False
):
response = CustomStreamWrapper(
iter(model_response),
model,
custom_llm_provider="gemini",
logging_obj=logging,
)
return response
response = model_response
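A minimal sketch of reaching this consolidated vertex_ai_beta / gemini branch; the model ids come from the pricing entries added later in this diff, and the env vars are the same secrets read above (GEMINI_API_KEY, VERTEXAI_PROJECT, VERTEXAI_LOCATION):
import os
import litellm

# Google AI Studio path ("gemini/" prefix), keyed by GEMINI_API_KEY
os.environ["GEMINI_API_KEY"] = "..."
response = litellm.completion(
    model="gemini/gemini-1.5-flash",
    messages=[{"role": "user", "content": "Hello"}],
)

# Vertex AI path ("vertex_ai_beta/" prefix), using project/location instead of an API key
response = litellm.completion(
    model="vertex_ai_beta/gemini-1.5-flash-001",
    messages=[{"role": "user", "content": "Hello"}],
    vertex_project=os.environ.get("VERTEXAI_PROJECT"),
    vertex_location=os.environ.get("VERTEXAI_LOCATION"),
)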
elif custom_llm_provider == "vertex_ai":
vertex_ai_project = (
optional_params.pop("vertex_project", None)
@ -1894,6 +1951,7 @@ def completion(
or optional_params.pop("vertex_ai_credentials", None)
or get_secret("VERTEXAI_CREDENTIALS")
)
new_params = deepcopy(optional_params)
if "claude-3" in model:
model_response = vertex_ai_anthropic.completion(
@ -1982,6 +2040,46 @@ def completion(
timeout=timeout,
)
if (
"stream" in optional_params
and optional_params["stream"] is True
and acompletion is False
):
return _model_response
response = _model_response
elif custom_llm_provider == "text-completion-codestral":
api_base = (
api_base
or optional_params.pop("api_base", None)
or optional_params.pop("base_url", None)
or litellm.api_base
or "https://codestral.mistral.ai/v1/fim/completions"
)
api_key = api_key or litellm.api_key or get_secret("CODESTRAL_API_KEY")
text_completion_model_response = litellm.TextCompletionResponse(
stream=stream
)
_model_response = codestral_text_completions.completion( # type: ignore
model=model,
messages=messages,
model_response=text_completion_model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
encoding=encoding,
logging_obj=logging,
acompletion=acompletion,
api_base=api_base,
custom_prompt_dict=custom_prompt_dict,
api_key=api_key,
timeout=timeout,
)
if (
"stream" in optional_params
and optional_params["stream"] is True
@ -3371,7 +3469,9 @@ def embedding(
###### Text Completion ################
@client
async def atext_completion(*args, **kwargs):
async def atext_completion(
*args, **kwargs
) -> Union[TextCompletionResponse, TextCompletionStreamWrapper]:
"""
Implemented to handle async streaming for the text completion endpoint
"""
@ -3403,6 +3503,7 @@ async def atext_completion(*args, **kwargs):
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
or custom_llm_provider == "text-completion-codestral"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "text-completion-openai"
@ -3664,6 +3765,7 @@ def text_completion(
custom_llm_provider == "openai"
or custom_llm_provider == "azure"
or custom_llm_provider == "azure_text"
or custom_llm_provider == "text-completion-codestral"
or custom_llm_provider == "text-completion-openai"
)
and isinstance(prompt, list)
@ -3680,6 +3782,12 @@ def text_completion(
)
kwargs.pop("prompt", None)
if model is not None and model.startswith(
"openai/"
): # for openai compatible endpoints - e.g. vllm, call the native /v1/completions endpoint for text completion calls
model = model.replace("openai/", "text-completion-openai/")
kwargs["text_completion"] = True
response = completion(
model=model,
@ -3842,6 +3950,7 @@ def image_generation(
proxy_server_request = kwargs.get("proxy_server_request", None)
model_info = kwargs.get("model_info", None)
metadata = kwargs.get("metadata", {})
client = kwargs.get("client", None)
model_response = litellm.utils.ImageResponse()
if model is not None or custom_llm_provider is not None:
@ -3980,6 +4089,7 @@ def image_generation(
model_response=model_response,
api_version=api_version,
aimg_generation=aimg_generation,
client=client,
)
elif custom_llm_provider == "openai":
model_response = openai_chat_completions.image_generation(
@ -3992,6 +4102,7 @@ def image_generation(
optional_params=optional_params,
model_response=model_response,
aimg_generation=aimg_generation,
client=client,
)
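For the client kwarg now threaded through image_generation, a hedged sketch; reusing a caller-constructed OpenAI client is the assumed intent, and the client object below is illustrative:
import litellm
from openai import OpenAI

# reuse one HTTP client / connection pool across image generation calls
openai_client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

image = litellm.image_generation(
    model="dall-e-3",
    prompt="a watercolor fox",
    client=openai_client,  # forwarded to openai_chat_completions.image_generation above
)
print(image.data[0].url)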
elif custom_llm_provider == "bedrock":
if model is None:

View file

@ -234,6 +234,30 @@
"litellm_provider": "openai",
"mode": "chat"
},
"ft:gpt-4-0613": {
"max_tokens": 4096,
"max_input_tokens": 8192,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00003,
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"source": "OpenAI needs to add pricing for this ft model, will be updated when added by OpenAI. Defaulting to base model pricing"
},
"ft:gpt-4o-2024-05-13": {
"max_tokens": 4096,
"max_input_tokens": 128000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000005,
"output_cost_per_token": 0.000015,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true,
"source": "OpenAI needs to add pricing for this ft model, will be updated when added by OpenAI. Defaulting to base model pricing"
},
"ft:davinci-002": {
"max_tokens": 16384,
"max_input_tokens": 16384,
@ -499,7 +523,7 @@
"max_tokens": 4096,
"max_input_tokens": 16384,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000015,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000002,
"litellm_provider": "azure",
"mode": "chat",
@ -841,7 +865,7 @@
},
"deepseek-coder": {
"max_tokens": 4096,
"max_input_tokens": 16000,
"max_input_tokens": 32000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000014,
"output_cost_per_token": 0.00000028,
@ -862,7 +886,7 @@
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000010,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000010,
"litellm_provider": "groq",
"mode": "chat",
@ -872,8 +896,8 @@
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000064,
"output_cost_per_token": 0.00000080,
"input_cost_per_token": 0.00000059,
"output_cost_per_token": 0.00000079,
"litellm_provider": "groq",
"mode": "chat",
"supports_function_calling": true
@ -991,6 +1015,18 @@
"supports_vision": true,
"tool_use_system_prompt_tokens": 159
},
"claude-3-5-sonnet-20240620": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"tool_use_system_prompt_tokens": 159
},
"text-bison": {
"max_tokens": 1024,
"max_input_tokens": 8192,
@ -1155,30 +1191,42 @@
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"input_cost_per_image": 0.0025,
"input_cost_per_video_per_second": 0.002,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing"
},
"gemini-1.0-pro": {
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"input_cost_per_image": 0.0025,
"input_cost_per_video_per_second": 0.002,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
},
"gemini-1.0-pro-001": {
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"input_cost_per_image": 0.0025,
"input_cost_per_video_per_second": 0.002,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
@ -1188,8 +1236,12 @@
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"input_cost_per_image": 0.0025,
"input_cost_per_video_per_second": 0.002,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
@ -1199,14 +1251,157 @@
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000000625,
"output_cost_per_token": 0.000001875,
"input_cost_per_image": 0.001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_video_per_second": 0.001315,
"input_cost_per_token": 0.000005,
"input_cost_per_character": 0.00000125,
"input_cost_per_token_above_128k_tokens": 0.00001,
"input_cost_per_character_above_128k_tokens": 0.0000025,
"output_cost_per_token": 0.000015,
"output_cost_per_character": 0.00000375,
"output_cost_per_token_above_128k_tokens": 0.00003,
"output_cost_per_character_above_128k_tokens": 0.0000075,
"output_cost_per_image": 0.00263,
"output_cost_per_video_per_second": 0.00263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-001": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_image": 0.001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_video_per_second": 0.001315,
"input_cost_per_token": 0.000005,
"input_cost_per_character": 0.00000125,
"input_cost_per_token_above_128k_tokens": 0.00001,
"input_cost_per_character_above_128k_tokens": 0.0000025,
"output_cost_per_token": 0.000015,
"output_cost_per_character": 0.00000375,
"output_cost_per_token_above_128k_tokens": 0.00003,
"output_cost_per_character_above_128k_tokens": 0.0000075,
"output_cost_per_image": 0.00263,
"output_cost_per_video_per_second": 0.00263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0514": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_image": 0.001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_video_per_second": 0.001315,
"input_cost_per_token": 0.000005,
"input_cost_per_character": 0.00000125,
"input_cost_per_token_above_128k_tokens": 0.00001,
"input_cost_per_character_above_128k_tokens": 0.0000025,
"output_cost_per_token": 0.000015,
"output_cost_per_character": 0.00000375,
"output_cost_per_token_above_128k_tokens": 0.00003,
"output_cost_per_character_above_128k_tokens": 0.0000075,
"output_cost_per_image": 0.00263,
"output_cost_per_video_per_second": 0.00263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0215": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_image": 0.001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_video_per_second": 0.001315,
"input_cost_per_token": 0.000005,
"input_cost_per_character": 0.00000125,
"input_cost_per_token_above_128k_tokens": 0.00001,
"input_cost_per_character_above_128k_tokens": 0.0000025,
"output_cost_per_token": 0.000015,
"output_cost_per_character": 0.00000375,
"output_cost_per_token_above_128k_tokens": 0.00003,
"output_cost_per_character_above_128k_tokens": 0.0000075,
"output_cost_per_image": 0.00263,
"output_cost_per_video_per_second": 0.00263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0409": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_image": 0.001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_video_per_second": 0.001315,
"input_cost_per_token": 0.000005,
"input_cost_per_character": 0.00000125,
"input_cost_per_token_above_128k_tokens": 0.00001,
"input_cost_per_character_above_128k_tokens": 0.0000025,
"output_cost_per_token": 0.000015,
"output_cost_per_character": 0.00000375,
"output_cost_per_token_above_128k_tokens": 0.00003,
"output_cost_per_character_above_128k_tokens": 0.0000075,
"output_cost_per_image": 0.00263,
"output_cost_per_video_per_second": 0.00263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-flash": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_image": 0.0001315,
"input_cost_per_video_per_second": 0.0001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"input_cost_per_token_above_128k_tokens": 0.000001,
"input_cost_per_character_above_128k_tokens": 0.00000025,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"output_cost_per_token_above_128k_tokens": 0.000003,
"output_cost_per_character_above_128k_tokens": 0.00000075,
"output_cost_per_image": 0.000263,
"output_cost_per_video_per_second": 0.000263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-flash-001": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
@ -1217,10 +1412,23 @@
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"input_cost_per_image": 0.0001315,
"input_cost_per_video_per_second": 0.0001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"input_cost_per_token_above_128k_tokens": 0.000001,
"input_cost_per_character_above_128k_tokens": 0.00000025,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"output_cost_per_token_above_128k_tokens": 0.000003,
"output_cost_per_character_above_128k_tokens": 0.00000075,
"output_cost_per_image": 0.000263,
"output_cost_per_video_per_second": 0.000263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
@ -1235,62 +1443,27 @@
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"input_cost_per_image": 0.0001315,
"input_cost_per_video_per_second": 0.0001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"input_cost_per_token_above_128k_tokens": 0.000001,
"input_cost_per_character_above_128k_tokens": 0.00000025,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"output_cost_per_token_above_128k_tokens": 0.000003,
"output_cost_per_character_above_128k_tokens": 0.00000075,
"output_cost_per_image": 0.000263,
"output_cost_per_video_per_second": 0.000263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-001": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000000625,
"output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0514": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000000625,
"output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0215": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000000625,
"output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0409": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000000625,
"output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-experimental": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
@ -1359,6 +1532,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"vertex_ai/claude-3-5-sonnet@20240620": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"vertex_ai/claude-3-haiku@20240307": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -1538,6 +1722,27 @@
"mode": "completion",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini/gemini-1.5-flash": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini/gemini-1.5-flash-latest": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
@ -1547,11 +1752,14 @@
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
@ -1560,8 +1768,10 @@
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.0,
"output_cost_per_token": 0.0,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_function_calling": true,
@ -1571,10 +1781,13 @@
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_tool_choice": true,
@ -1584,10 +1797,13 @@
"max_tokens": 8192,
"max_input_tokens": 1048576,
"max_output_tokens": 8192,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_tool_choice": true,
@ -1597,8 +1813,10 @@
"max_tokens": 2048,
"max_input_tokens": 30720,
"max_output_tokens": 2048,
"input_cost_per_token": 0.0,
"output_cost_per_token": 0.0,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_function_calling": true,
@ -1796,6 +2014,15 @@
"litellm_provider": "replicate",
"mode": "chat"
},
"openrouter/deepseek/deepseek-coder": {
"max_tokens": 4096,
"max_input_tokens": 32000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000014,
"output_cost_per_token": 0.00000028,
"litellm_provider": "openrouter",
"mode": "chat"
},
"openrouter/microsoft/wizardlm-2-8x22b:nitro": {
"max_tokens": 65536,
"input_cost_per_token": 0.000001,
@ -2349,6 +2576,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-5-sonnet-20240620-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -3377,6 +3615,24 @@
"litellm_provider": "deepinfra",
"mode": "chat"
},
"deepinfra/meta-llama/Meta-Llama-3-8B-Instruct": {
"max_tokens": 8191,
"max_input_tokens": 8191,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000008,
"output_cost_per_token": 0.00000008,
"litellm_provider": "deepinfra",
"mode": "chat"
},
"deepinfra/meta-llama/Meta-Llama-3-70B-Instruct": {
"max_tokens": 8191,
"max_input_tokens": 8191,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000059,
"output_cost_per_token": 0.00000079,
"litellm_provider": "deepinfra",
"mode": "chat"
},
"deepinfra/01-ai/Yi-34B-200K": {
"max_tokens": 4096,
"max_input_tokens": 200000,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);

Some files were not shown because too many files have changed in this diff