forked from phoenix/litellm-mirror
Merge branch 'main' into feat/friendliai
This commit is contained in: commit c4c7d1b367
201 changed files with 22438 additions and 13694 deletions
@@ -65,6 +65,7 @@ jobs:
pip install "pydantic==2.7.1"
pip install "diskcache==5.6.1"
pip install "Pillow==10.3.0"
pip install "ijson==3.2.3"
- save_cache:
    paths:
      - ./venv
@@ -126,6 +127,7 @@ jobs:
pip install jinja2
pip install tokenizers
pip install openai
pip install ijson
- run:
    name: Run tests
    command: |
@@ -180,6 +182,7 @@ jobs:
pip install numpydoc
pip install prisma
pip install fastapi
pip install ijson
pip install "httpx==0.24.1"
pip install "gunicorn==21.2.0"
pip install "anyio==3.7.1"
10 .github/dependabot.yaml vendored Normal file
@@ -0,0 +1,10 @@
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "daily"
    groups:
      github-actions:
        patterns:
          - "*"
22 .github/workflows/ghcr_deploy.yml vendored
@ -25,6 +25,11 @@ jobs:
|
|||
if: github.repository == 'BerriAI/litellm'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
-
|
||||
name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.commit_hash }}
|
||||
-
|
||||
name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
@ -41,12 +46,14 @@ jobs:
|
|||
name: Build and push
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: litellm/litellm:${{ github.event.inputs.tag || 'latest' }}
|
||||
-
|
||||
name: Build and push litellm-database image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
file: Dockerfile.database
|
||||
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
|
||||
|
@ -54,6 +61,7 @@ jobs:
|
|||
name: Build and push litellm-spend-logs image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
file: ./litellm-js/spend-logs/Dockerfile
|
||||
tags: litellm/litellm-spend_logs:${{ github.event.inputs.tag || 'latest' }}
|
||||
|
@ -68,6 +76,8 @@ jobs:
|
|||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.commit_hash }}
|
||||
# Uses the `docker/login-action` action to log in to the Container registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
|
||||
- name: Log in to the Container registry
|
||||
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
|
||||
|
@ -92,7 +102,7 @@ jobs:
|
|||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@4976231911ebf5f32aad765192d35f942aa48cb8
|
||||
with:
|
||||
context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
|
||||
context: .
|
||||
push: true
|
||||
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }} # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
|
@ -106,6 +116,8 @@ jobs:
|
|||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.commit_hash }}
|
||||
|
||||
- name: Log in to the Container registry
|
||||
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
|
||||
|
@ -128,7 +140,7 @@ jobs:
|
|||
- name: Build and push Database Docker image
|
||||
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
|
||||
with:
|
||||
context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
|
||||
context: .
|
||||
file: Dockerfile.database
|
||||
push: true
|
||||
tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
|
||||
|
@ -143,6 +155,8 @@ jobs:
|
|||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.commit_hash }}
|
||||
|
||||
- name: Log in to the Container registry
|
||||
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
|
||||
|
@ -165,7 +179,7 @@ jobs:
|
|||
- name: Build and push Database Docker image
|
||||
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
|
||||
with:
|
||||
context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
|
||||
context: .
|
||||
file: ./litellm-js/spend-logs/Dockerfile
|
||||
push: true
|
||||
tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
|
||||
|
@ -176,6 +190,8 @@ jobs:
|
|||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.commit_hash }}
|
||||
|
||||
- name: Log in to the Container registry
|
||||
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
|
||||
|
|
1 .gitignore vendored
@@ -60,3 +60,4 @@ litellm/proxy/_experimental/out/404/index.html
litellm/proxy/_experimental/out/model_hub/index.html
litellm/proxy/_experimental/out/onboarding/index.html
litellm/tests/log.txt
litellm/tests/langfuse.log
@@ -1,4 +1,19 @@
repos:
- repo: local
  hooks:
    - id: mypy
      name: mypy
      entry: python3 -m mypy --ignore-missing-imports
      language: system
      types: [python]
      files: ^litellm/
    - id: isort
      name: isort
      entry: isort
      language: system
      types: [python]
      files: litellm/.*\.py
      exclude: ^litellm/__init__.py$
- repo: https://github.com/psf/black
  rev: 24.2.0
  hooks:
@@ -16,11 +31,10 @@ repos:
      name: Check if files match
      entry: python3 ci_cd/check_files_match.py
      language: system
- repo: local
  hooks:
    - id: mypy
      name: mypy
      entry: python3 -m mypy --ignore-missing-imports
      language: system
      types: [python]
      files: ^litellm/
#    - id: check-file-length
#      name: Check file length
#      entry: python check_file_length.py
#      args: ["10000"] # set your desired maximum number of lines
#      language: python
#      files: litellm/.*\.py
#      exclude: ^litellm/tests/
28 check_file_length.py Normal file
@@ -0,0 +1,28 @@
import sys


def check_file_length(max_lines, filenames):
    bad_files = []
    for filename in filenames:
        with open(filename, "r") as file:
            lines = file.readlines()
            if len(lines) > max_lines:
                bad_files.append((filename, len(lines)))
    return bad_files


if __name__ == "__main__":
    max_lines = int(sys.argv[1])
    filenames = sys.argv[2:]

    bad_files = check_file_length(max_lines, filenames)
    if bad_files:
        bad_files.sort(
            key=lambda x: x[1], reverse=True
        )  # Sort files by length in descending order
        for filename, length in bad_files:
            print(f"{filename}: {length} lines")

        sys.exit(1)
    else:
        sys.exit(0)
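For reference, here's a minimal sketch of exercising this helper directly (the file paths are just examples, and it assumes `check_file_length.py` is importable from the repo root):

```python
from check_file_length import check_file_length

# hypothetical local run: flag any listed file longer than 10,000 lines
too_long = check_file_length(10000, ["litellm/main.py", "litellm/utils.py"])
for path, n_lines in too_long:
    print(f"{path}: {n_lines} lines")
```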
110 docs/my-website/docs/completion/drop_params.md Normal file
@ -0,0 +1,110 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Drop Unsupported Params
|
||||
|
||||
Drop unsupported OpenAI params by your LLM Provider.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```python
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# set keys
|
||||
os.environ["COHERE_API_KEY"] = "co-.."
|
||||
|
||||
litellm.drop_params = True # 👈 KEY CHANGE
|
||||
|
||||
response = litellm.completion(
|
||||
model="command-r",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
response_format={"key": "value"},
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
LiteLLM maps all supported OpenAI params by provider + model (e.g. function calling is supported by Anthropic on Bedrock, but not by Titan).

See `litellm.get_supported_openai_params("command-r")` [**Code**](https://github.com/BerriAI/litellm/blob/main/litellm/utils.py#L3584)

If a provider/model doesn't support a particular param, you can drop it.
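As a quick sanity check before deciding what to drop, you can call the helper mentioned above directly — a minimal sketch (the printed list is illustrative):

```python
import litellm

# returns the OpenAI params LiteLLM maps for this provider/model
supported = litellm.get_supported_openai_params("command-r")
print(supported)  # e.g. ["stream", "temperature", "max_tokens", ...]
```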
|
||||
## OpenAI Proxy Usage
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
drop_params: true
|
||||
```
|
||||
|
||||
## Pass drop_params in `completion(..)`
|
||||
|
||||
Just set `drop_params=True` when calling specific models
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# set keys
|
||||
os.environ["COHERE_API_KEY"] = "co-.."
|
||||
|
||||
response = litellm.completion(
|
||||
model="command-r",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
response_format={"key": "value"},
|
||||
drop_params=True
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```yaml
|
||||
- litellm_params:
|
||||
api_base: my-base
|
||||
model: openai/my-model
|
||||
drop_params: true # 👈 KEY CHANGE
|
||||
model_name: my-model
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Specify params to drop
|
||||
|
||||
To drop specific params when calling a provider (E.g. 'logit_bias' for vllm)
|
||||
|
||||
Use `additional_drop_params`
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# set keys
|
||||
os.environ["COHERE_API_KEY"] = "co-.."
|
||||
|
||||
response = litellm.completion(
|
||||
model="command-r",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
response_format={"key": "value"},
|
||||
additional_drop_params=["response_format"]
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```yaml
|
||||
- litellm_params:
|
||||
api_base: my-base
|
||||
model: openai/my-model
|
||||
additional_drop_params: ["response_format"] # 👈 KEY CHANGE
|
||||
model_name: my-model
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
**additional_drop_params**: List or null - A list of OpenAI params you want to drop when making a call to the model.
|
|
@ -67,6 +67,10 @@ By default, LiteLLM raises an exception if the openai param being passed in isn'
|
|||
|
||||
To drop the param instead, set `litellm.drop_params = True` or `completion(..drop_params=True)`.
|
||||
|
||||
This **ONLY DROPS UNSUPPORTED OPENAI PARAMS**.
|
||||
|
||||
LiteLLM assumes any non-OpenAI param is provider-specific and passes it in as a kwarg in the request body.
|
||||
|
||||
:::
|
||||
|
||||
## Input Params
|
||||
|
@ -162,7 +166,7 @@ def completion(
|
|||
|
||||
- `function`: *object* - Required.
|
||||
|
||||
- `tool_choice`: *string or object (optional)* - Controls which (if any) function is called by the model. none means the model will not call a function and instead generates a message. auto means the model can pick between generating a message or calling a function. Specifying a particular function via {"type: "function", "function": {"name": "my_function"}} forces the model to call that function.
|
||||
- `tool_choice`: *string or object (optional)* - Controls which (if any) function is called by the model. none means the model will not call a function and instead generates a message. auto means the model can pick between generating a message or calling a function. Specifying a particular function via `{"type: "function", "function": {"name": "my_function"}}` forces the model to call that function.
|
||||
|
||||
- `none` is the default when no functions are present. `auto` is the default if functions are present.
|
||||
|
||||
|
|
|
@ -1,90 +0,0 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
import QueryParamReader from '../../src/components/queryParamReader.js'
|
||||
|
||||
# [Beta] Monitor Logs in Production
|
||||
|
||||
:::note
|
||||
|
||||
This is in beta. Expect frequent updates, as we improve based on your feedback.
|
||||
|
||||
:::
|
||||
|
||||
LiteLLM provides an integration to let you monitor logs in production.
|
||||
|
||||
👉 Jump to our sample LiteLLM Dashboard: https://admin.litellm.ai/
|
||||
|
||||
|
||||
<Image img={require('../../img/alt_dashboard.png')} alt="Dashboard" />
|
||||
|
||||
## Debug your first logs
|
||||
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_OpenAI.ipynb">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||
</a>
|
||||
|
||||
|
||||
### 1. Get your LiteLLM Token
|
||||
|
||||
Go to [admin.litellm.ai](https://admin.litellm.ai/) and copy the code snippet with your unique token
|
||||
|
||||
<Image img={require('../../img/hosted_debugger_usage_page.png')} alt="Usage" />
|
||||
|
||||
### 2. Set up your environment
|
||||
|
||||
**Add it to your .env**
|
||||
|
||||
```python
|
||||
import os
|
||||
|
||||
os.env["LITELLM_TOKEN"] = "e24c4c06-d027-4c30-9e78-18bc3a50aebb" # replace with your unique token
|
||||
|
||||
```
|
||||
|
||||
**Turn on LiteLLM Client**
|
||||
```python
|
||||
import litellm
|
||||
litellm.client = True
|
||||
```
|
||||
|
||||
### 3. Make a normal `completion()` call
|
||||
```python
|
||||
import litellm
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
# set env variables
|
||||
os.environ["LITELLM_TOKEN"] = "e24c4c06-d027-4c30-9e78-18bc3a50aebb" # replace with your unique token
|
||||
os.environ["OPENAI_API_KEY"] = "openai key"
|
||||
|
||||
litellm.use_client = True # enable logging dashboard
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
|
||||
# openai call
|
||||
response = completion(model="gpt-3.5-turbo", messages=messages)
|
||||
```
|
||||
|
||||
Your `completion()` call will print a link to your session dashboard (https://admin.litellm.ai/<your_unique_token>)
|
||||
|
||||
In the above case it would be: [`admin.litellm.ai/e24c4c06-d027-4c30-9e78-18bc3a50aebb`](https://admin.litellm.ai/e24c4c06-d027-4c30-9e78-18bc3a50aebb)
|
||||
|
||||
Click on your personal dashboard link. Here's how you can find it 👇
|
||||
|
||||
<Image img={require('../../img/dash_output.png')} alt="Dashboard" />
|
||||
|
||||
[👋 Tell us if you need better privacy controls](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version?month=2023-08)
|
||||
|
||||
### 3. Review request log
|
||||
|
||||
Oh! Looks like our request was made successfully. Let's click on it and see exactly what got sent to the LLM provider.
|
||||
|
||||
|
||||
|
||||
|
||||
Ah! So we can see that this request was made to a **Baseten** (see litellm_params > custom_llm_provider) for a model with ID - **7qQNLDB** (see model). The message sent was - `"Hey, how's it going?"` and the response received was - `"As an AI language model, I don't have feelings or emotions, but I can assist you with your queries. How can I assist you today?"`
|
||||
|
||||
<Image img={require('../../img/dashboard_log.png')} alt="Dashboard Log Row" />
|
||||
|
||||
:::info
|
||||
|
||||
🎉 Congratulations! You've successfully debugged your first log!
|
||||
|
||||
:::
|
|
@ -2,6 +2,15 @@ import Image from '@theme/IdealImage';
|
|||
|
||||
# Athina
|
||||
|
||||
|
||||
:::tip
|
||||
|
||||
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
|
||||
|
||||
:::
|
||||
|
||||
|
||||
[Athina](https://athina.ai/) is an evaluation framework and production monitoring platform for your LLM-powered app. Athina is designed to enhance the performance and reliability of AI applications through real-time monitoring, granular analytics, and plug-and-play evaluations.
|
||||
|
||||
<Image img={require('../../img/athina_dashboard.png')} />
|
||||
|
|
|
@ -1,5 +1,14 @@
|
|||
# Greenscale - Track LLM Spend and Responsible Usage
|
||||
|
||||
|
||||
:::tip
|
||||
|
||||
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
|
||||
|
||||
:::
|
||||
|
||||
|
||||
[Greenscale](https://greenscale.ai/) is a production monitoring platform for your LLM-powered app that provides you granular key insights into your GenAI spending and responsible usage. Greenscale only captures metadata to minimize the exposure risk of personally identifiable information (PII).
|
||||
|
||||
## Getting Started
|
||||
|
|
|
@ -1,4 +1,13 @@
|
|||
# Helicone Tutorial
|
||||
|
||||
:::tip
|
||||
|
||||
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
|
||||
|
||||
:::
|
||||
|
||||
|
||||
[Helicone](https://helicone.ai/) is an open source observability platform that proxies your OpenAI traffic and provides you key insights into your spend, latency and usage.
|
||||
|
||||
## Use Helicone to log requests across all LLM Providers (OpenAI, Azure, Anthropic, Cohere, Replicate, PaLM)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# Langfuse - Logging LLM Input/Output
|
||||
# 🔥 Langfuse - Logging LLM Input/Output
|
||||
|
||||
LangFuse is open-source observability & analytics for LLM apps, with detailed production traces and a granular view on quality, cost, and latency.
|
||||
|
@ -122,10 +122,12 @@ response = completion(
|
|||
metadata={
    "generation_name": "ishaan-test-generation", # set langfuse Generation Name
    "generation_id": "gen-id22", # set langfuse Generation ID
    "parent_observation_id": "obs-id9", # set langfuse Parent Observation ID
    "version": "test-generation-version", # set langfuse Generation Version
    "trace_user_id": "user-id2", # set langfuse Trace User ID
    "session_id": "session-1", # set langfuse Session ID
    "tags": ["tag1", "tag2"], # set langfuse Tags
    "trace_name": "new-trace-name", # set langfuse Trace Name
    "trace_id": "trace-id22", # set langfuse Trace ID
    "trace_metadata": {"key": "value"}, # set langfuse Trace Metadata
    "trace_version": "test-trace-version", # set langfuse Trace Version (if not set, defaults to Generation Version)
|
||||
|
@ -147,9 +149,10 @@ print(response)
|
|||
You can also pass `metadata` as part of the request header with a `langfuse_*` prefix:
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'langfuse_trace_id: trace-id22' \
|
||||
curl --location --request POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'langfuse_trace_id: trace-id2' \
|
||||
--header 'langfuse_trace_user_id: user-id2' \
|
||||
--header 'langfuse_trace_metadata: {"key":"value"}' \
|
||||
--data '{
|
||||
|
@ -190,9 +193,10 @@ The following parameters can be updated on a continuation of a trace by passing
|
|||
|
||||
#### Generation Specific Parameters
|
||||
|
||||
* `generation_id` - Identifier for the generation, auto-generated by default
|
||||
* `generation_name` - Identifier for the generation, auto-generated by default
|
||||
* `prompt` - Langfuse prompt object used for the generation, defaults to None
|
||||
* `generation_id` - Identifier for the generation, auto-generated by default
|
||||
* `generation_name` - Identifier for the generation, auto-generated by default
|
||||
* `parent_observation_id` - Identifier for the parent observation, defaults to `None`
|
||||
* `prompt` - Langfuse prompt object used for the generation, defaults to `None`
|
||||
|
||||
Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
|
||||
|
||||
|
|
|
@ -1,6 +1,16 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# Langsmith - Logging LLM Input/Output
|
||||
|
||||
|
||||
:::tip
|
||||
|
||||
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
|
||||
|
||||
:::
|
||||
|
||||
|
||||
An all-in-one developer platform for every step of the application lifecycle
|
||||
https://smith.langchain.com/
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# Logfire - Logging LLM Input/Output
|
||||
# 🔥 Logfire - Logging LLM Input/Output
|
||||
|
||||
Logfire is open-source observability & analytics for LLM apps, with detailed production traces and a granular view on quality, cost, and latency.
|
||||
|
@ -14,10 +14,14 @@ join our [discord](https://discord.gg/wuPM9dRgDw)
|
|||
|
||||
## Pre-Requisites
|
||||
|
||||
Ensure you have run `pip install logfire` for this integration
|
||||
Ensure you have installed the following packages to use this integration
|
||||
|
||||
```shell
|
||||
pip install logfire litellm
|
||||
pip install litellm
|
||||
|
||||
pip install opentelemetry-api==1.25.0
|
||||
pip install opentelemetry-sdk==1.25.0
|
||||
pip install opentelemetry-exporter-otlp==1.25.0
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
@ -25,8 +29,7 @@ pip install logfire litellm
|
|||
Get your Logfire token from [Logfire](https://logfire.pydantic.dev/)
|
||||
|
||||
```python
|
||||
litellm.success_callback = ["logfire"]
|
||||
litellm.failure_callback = ["logfire"] # logs errors to logfire
|
||||
litellm.callbacks = ["logfire"]
|
||||
```
|
||||
|
||||
```python
|
||||
|
|
|
@ -1,5 +1,13 @@
|
|||
# Lunary - Logging and tracing LLM input/output
|
||||
|
||||
:::tip
|
||||
|
||||
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
|
||||
|
||||
:::
|
||||
|
||||
|
||||
[Lunary](https://lunary.ai/) is an open-source AI developer platform providing observability, prompt management, and evaluation tools for AI developers.
|
||||
|
||||
<video controls width='900' >
|
||||
|
|
|
@ -1,5 +1,16 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# Promptlayer Tutorial
|
||||
|
||||
|
||||
:::tip
|
||||
|
||||
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
|
||||
|
||||
:::
|
||||
|
||||
|
||||
Promptlayer is a platform for prompt engineers. Log OpenAI requests. Search usage history. Track performance. Visually manage prompt templates.
|
||||
|
||||
<Image img={require('../../img/promptlayer.png')} />
|
||||
|
|
|
@ -1,5 +1,14 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
|
||||
:::tip
|
||||
|
||||
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
|
||||
|
||||
:::
|
||||
|
||||
|
||||
# Sentry - Log LLM Exceptions
|
||||
[Sentry](https://sentry.io/) provides error monitoring for production. LiteLLM can add breadcrumbs and send exceptions to Sentry with this integration
|
||||
|
||||
|
|
|
@ -1,4 +1,12 @@
|
|||
# Supabase Tutorial
|
||||
|
||||
:::tip
|
||||
|
||||
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
|
||||
|
||||
:::
|
||||
|
||||
[Supabase](https://supabase.com/) is an open source Firebase alternative.
|
||||
Start your project with a Postgres database, Authentication, instant APIs, Edge Functions, Realtime subscriptions, Storage, and Vector embeddings.
|
||||
|
||||
|
|
|
@ -1,6 +1,16 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# Weights & Biases - Logging LLM Input/Output
|
||||
|
||||
|
||||
:::tip
|
||||
|
||||
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
|
||||
|
||||
:::
|
||||
|
||||
|
||||
Weights & Biases helps AI developers build better models faster https://wandb.ai
|
||||
|
||||
<Image img={require('../../img/wandb.png')} />
|
||||
|
|
|
@ -4,6 +4,7 @@ import TabItem from '@theme/TabItem';
|
|||
# Anthropic
|
||||
LiteLLM supports
|
||||
|
||||
- `claude-3.5`
|
||||
- `claude-3` (`claude-3-haiku-20240307`, `claude-3-opus-20240229`, `claude-3-sonnet-20240229`)
|
||||
- `claude-2`
|
||||
- `claude-2.1`
|
||||
|
@ -171,6 +172,7 @@ print(response)
|
|||
|------------------|--------------------------------------------|
|
||||
| claude-3-haiku | `completion('claude-3-haiku-20240307', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
|
||||
| claude-3-opus | `completion('claude-3-opus-20240229', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
|
||||
| claude-3-5-sonnet | `completion('claude-3-5-sonnet-20240620', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
|
||||
| claude-3-sonnet | `completion('claude-3-sonnet-20240229', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
|
||||
| claude-2.1 | `completion('claude-2.1', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
|
||||
| claude-2 | `completion('claude-2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
|
||||
|
|
|
@ -68,6 +68,7 @@ response = litellm.completion(
|
|||
|
||||
| Model Name | Function Call |
|
||||
|------------------|----------------------------------------|
|
||||
| gpt-4o | `completion('azure/<your deployment name>', messages)` |
|
||||
| gpt-4 | `completion('azure/<your deployment name>', messages)` |
|
||||
| gpt-4-0314 | `completion('azure/<your deployment name>', messages)` |
|
||||
| gpt-4-0613 | `completion('azure/<your deployment name>', messages)` |
|
||||
|
@ -85,7 +86,8 @@ response = litellm.completion(
|
|||
## Azure OpenAI Vision Models
|
||||
| Model Name | Function Call |
|
||||
|-----------------------|-----------------------------------------------------------------|
|
||||
| gpt-4-vision | `response = completion(model="azure/<your deployment name>", messages=messages)` |
|
||||
| gpt-4-vision | `completion(model="azure/<your deployment name>", messages=messages)` |
|
||||
| gpt-4o | `completion('azure/<your deployment name>', messages)` |
|
||||
|
||||
#### Usage
|
||||
```python
|
||||
|
|
|
@ -623,6 +623,7 @@ Here's an example of using a bedrock model with LiteLLM
|
|||
|
||||
| Model Name | Command |
|
||||
|----------------------------|------------------------------------------------------------------|
|
||||
| Anthropic Claude-V3.5 Sonnet | `completion(model='bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
|
||||
| Anthropic Claude-V3 sonnet | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
|
||||
| Anthropic Claude-V3 Haiku | `completion(model='bedrock/anthropic.claude-3-haiku-20240307-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
|
||||
| Anthropic Claude-V3 Opus | `completion(model='bedrock/anthropic.claude-3-opus-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
|
||||
|
|
255 docs/my-website/docs/providers/codestral.md Normal file
@ -0,0 +1,255 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Codestral API [Mistral AI]
|
||||
|
||||
Codestral is available in select code-completion plugins but can also be queried directly. See the documentation for more details.
|
||||
|
||||
## API Key
|
||||
```python
|
||||
# env variable
|
||||
os.environ['CODESTRAL_API_KEY']
|
||||
```
|
||||
|
||||
## FIM / Completions
|
||||
|
||||
:::info
|
||||
|
||||
Official Mistral API Docs: https://docs.mistral.ai/api/#operation/createFIMCompletion
|
||||
|
||||
:::
|
||||
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="no-streaming" label="No Streaming">
|
||||
|
||||
#### Sample Usage
|
||||
|
||||
```python
|
||||
import os
|
||||
import litellm
|
||||
|
||||
os.environ['CODESTRAL_API_KEY']
|
||||
|
||||
response = await litellm.atext_completion(
|
||||
model="text-completion-codestral/codestral-2405",
|
||||
prompt="def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
|
||||
suffix="return True", # optional
|
||||
temperature=0, # optional
|
||||
top_p=1, # optional
|
||||
max_tokens=10, # optional
|
||||
min_tokens=10, # optional
|
||||
seed=10, # optional
|
||||
stop=["return"], # optional
|
||||
)
|
||||
```
|
||||
|
||||
#### Expected Response
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "b41e0df599f94bc1a46ea9fcdbc2aabe",
|
||||
"object": "text_completion",
|
||||
"created": 1589478378,
|
||||
"model": "codestral-latest",
|
||||
"choices": [
|
||||
{
|
||||
"text": "\n assert is_odd(1)\n assert",
|
||||
"index": 0,
|
||||
"logprobs": null,
|
||||
"finish_reason": "length"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 5,
|
||||
"completion_tokens": 7,
|
||||
"total_tokens": 12
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="stream" label="Streaming">
|
||||
|
||||
#### Sample Usage - Streaming
|
||||
|
||||
```python
|
||||
import os
|
||||
import litellm
|
||||
|
||||
os.environ['CODESTRAL_API_KEY']
|
||||
|
||||
response = await litellm.atext_completion(
|
||||
model="text-completion-codestral/codestral-2405",
|
||||
prompt="def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
|
||||
suffix="return True", # optional
|
||||
temperature=0, # optional
|
||||
top_p=1, # optional
|
||||
stream=True,
|
||||
seed=10, # optional
|
||||
stop=["return"], # optional
|
||||
)
|
||||
|
||||
async for chunk in response:
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
#### Expected Response
|
||||
|
||||
```json
|
||||
{
    "id": "726025d3e2d645d09d475bb0d29e3640",
    "object": "text_completion",
    "created": 1718659669,
    "choices": [
        {
            "text": "This",
            "index": 0,
            "logprobs": null,
            "finish_reason": null
        }
    ],
    "model": "codestral-2405"
}
|
||||
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Supported Models
|
||||
All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
|
||||
|
||||
| Model Name | Function Call |
|
||||
|----------------|--------------------------------------------------------------|
|
||||
| Codestral Latest | `completion(model="text-completion-codestral/codestral-latest", messages)` |
|
||||
| Codestral 2405 | `completion(model="text-completion-codestral/codestral-2405", messages)`|
|
||||
|
||||
|
||||
|
||||
|
||||
## Chat Completions
|
||||
|
||||
:::info
|
||||
|
||||
Official Mistral API Docs: https://docs.mistral.ai/api/#operation/createChatCompletion
|
||||
:::
|
||||
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="no-streaming" label="No Streaming">
|
||||
|
||||
#### Sample Usage
|
||||
|
||||
```python
|
||||
import os
|
||||
import litellm
|
||||
|
||||
os.environ['CODESTRAL_API_KEY']
|
||||
|
||||
response = await litellm.acompletion(
|
||||
model="codestral/codestral-latest",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hey, how's it going?",
|
||||
}
|
||||
],
|
||||
temperature=0.0, # optional
|
||||
top_p=1, # optional
|
||||
max_tokens=10, # optional
|
||||
safe_prompt=False, # optional
|
||||
seed=12, # optional
|
||||
)
|
||||
```
|
||||
|
||||
#### Expected Response
|
||||
|
||||
```json
|
||||
{
    "id": "chatcmpl-123",
    "object": "chat.completion",
    "created": 1677652288,
    "model": "codestral/codestral-latest",
    "system_fingerprint": null,
|
||||
"choices": [{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "\n\nHello there, how may I assist you today?",
|
||||
},
|
||||
"logprobs": null,
|
||||
"finish_reason": "stop"
|
||||
}],
|
||||
"usage": {
|
||||
"prompt_tokens": 9,
|
||||
"completion_tokens": 12,
|
||||
"total_tokens": 21
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
```
|
||||
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="stream" label="Streaming">
|
||||
|
||||
#### Sample Usage - Streaming
|
||||
|
||||
```python
|
||||
import os
|
||||
import litellm
|
||||
|
||||
os.environ['CODESTRAL_API_KEY']
|
||||
|
||||
response = await litellm.acompletion(
|
||||
model="codestral/codestral-latest",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hey, how's it going?",
|
||||
}
|
||||
],
|
||||
stream=True, # optional
|
||||
temperature=0.0, # optional
|
||||
top_p=1, # optional
|
||||
max_tokens=10, # optional
|
||||
safe_prompt=False, # optional
|
||||
seed=12, # optional
|
||||
)
|
||||
async for chunk in response:
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
#### Expected Response
|
||||
|
||||
```json
|
||||
{
    "id": "chatcmpl-123",
    "object": "chat.completion.chunk",
    "created": 1694268190,
    "model": "codestral/codestral-latest",
    "system_fingerprint": null,
    "choices": [
        {
            "index": 0,
            "delta": {"role": "assistant", "content": "gm"},
            "logprobs": null,
            "finish_reason": null
        }
    ]
}
|
||||
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Supported Models
|
||||
All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
|
||||
|
||||
| Model Name | Function Call |
|
||||
|----------------|--------------------------------------------------------------|
|
||||
| Codestral Latest | `completion(model="codestral/codestral-latest", messages)` |
|
||||
| Codestral 2405 | `completion(model="codestral/codestral-2405", messages)`|
|
|
@ -1,6 +1,13 @@
|
|||
# DeepInfra
|
||||
https://deepinfra.com/
|
||||
|
||||
:::tip
|
||||
|
||||
**We support ALL DeepInfra models, just set `model=deepinfra/<any-model-on-deepinfra>` as a prefix when sending litellm requests**
|
||||
|
||||
:::
|
||||
|
||||
|
||||
## API Key
|
||||
```python
|
||||
# env variable
|
||||
|
@ -38,13 +45,11 @@ for chunk in response:
|
|||
## Chat Models
|
||||
| Model Name | Function Call |
|
||||
|------------------|--------------------------------------|
|
||||
| meta-llama/Meta-Llama-3-8B-Instruct | `completion(model="deepinfra/meta-llama/Meta-Llama-3-8B-Instruct", messages)` |
|
||||
| meta-llama/Meta-Llama-3-70B-Instruct | `completion(model="deepinfra/meta-llama/Meta-Llama-3-70B-Instruct", messages)` |
|
||||
| meta-llama/Llama-2-70b-chat-hf | `completion(model="deepinfra/meta-llama/Llama-2-70b-chat-hf", messages)` |
|
||||
| meta-llama/Llama-2-7b-chat-hf | `completion(model="deepinfra/meta-llama/Llama-2-7b-chat-hf", messages)` |
|
||||
| meta-llama/Llama-2-13b-chat-hf | `completion(model="deepinfra/meta-llama/Llama-2-13b-chat-hf", messages)` |
|
||||
| codellama/CodeLlama-34b-Instruct-hf | `completion(model="deepinfra/codellama/CodeLlama-34b-Instruct-hf", messages)` |
|
||||
| mistralai/Mistral-7B-Instruct-v0.1 | `completion(model="deepinfra/mistralai/Mistral-7B-Instruct-v0.1", messages)` |
|
||||
| jondurbin/airoboros-l2-70b-gpt4-1.4.1 | `completion(model="deepinfra/jondurbin/airoboros-l2-70b-gpt4-1.4.1", messages)` |
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -49,6 +49,6 @@ We support ALL Deepseek models, just set `deepseek/` as a prefix when sending co
|
|||
| Model Name | Function Call |
|
||||
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| deepseek-chat | `completion(model="deepseek/deepseek-chat", messages)` |
|
||||
| deepseek-coder | `completion(model="deepseek/deepseek-chat", messages)` |
|
||||
| deepseek-coder | `completion(model="deepseek/deepseek-coder", messages)` |
|
||||
|
||||
|
||||
|
|
|
@ -45,6 +45,52 @@ response = completion(
|
|||
)
|
||||
```
|
||||
|
||||
## Tool Calling
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
# set env
|
||||
os.environ["GEMINI_API_KEY"] = ".."
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||
},
|
||||
"required": ["location"],
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
|
||||
|
||||
response = completion(
|
||||
model="gemini/gemini-1.5-flash",
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
)
|
||||
# Add any assertions here, to check the response args
|
||||
print(response)
|
||||
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
|
||||
assert isinstance(
|
||||
response.choices[0].message.tool_calls[0].function.arguments, str
|
||||
)
|
||||
|
||||
|
||||
```
|
||||
|
||||
|
||||
# Gemini-Pro-Vision
|
||||
LiteLLM Supports the following image types passed in `url`
|
||||
- Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
# Groq
|
||||
https://groq.com/
|
||||
|
||||
**We support ALL Groq models, just set `groq/` as a prefix when sending completion requests**
|
||||
:::tip
|
||||
|
||||
**We support ALL Groq models, just set `model=groq/<any-model-on-groq>` as a prefix when sending litellm requests**
|
||||
|
||||
:::
|
||||
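Since any Groq-hosted model can be called with the `groq/` prefix, a minimal sketch looks like this (the model name and `GROQ_API_KEY` value are assumptions for illustration):

```python
import os
from litellm import completion

os.environ["GROQ_API_KEY"] = "gsk-..."  # assumed: your Groq API key

# any Groq-hosted model works, prefixed with groq/
response = completion(
    model="groq/llama3-8b-8192",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```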
|
||||
## API Key
|
||||
```python
|
||||
|
|
|
@ -223,6 +223,17 @@ response = completion(
|
|||
|
||||
```
|
||||
|
||||
## OpenAI Fine Tuned Models
|
||||
|
||||
| Model Name | Function Call |
|
||||
|---------------------------|-----------------------------------------------------------------|
|
||||
| fine tuned `gpt-4-0613` | `response = completion(model="ft:gpt-4-0613", messages=messages)` |
|
||||
| fine tuned `gpt-4o-2024-05-13` | `response = completion(model="ft:gpt-4o-2024-05-13", messages=messages)` |
|
||||
| fine tuned `gpt-3.5-turbo-0125` | `response = completion(model="ft:gpt-3.5-turbo-0125", messages=messages)` |
|
||||
| fine tuned `gpt-3.5-turbo-1106` | `response = completion(model="ft:gpt-3.5-turbo-1106", messages=messages)` |
|
||||
| fine tuned `gpt-3.5-turbo-0613` | `response = completion(model="ft:gpt-3.5-turbo-0613", messages=messages)` |
|
||||
|
||||
|
||||
## Advanced
|
||||
|
||||
### Parallel Function calling
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# OpenAI (Text Completion)
|
||||
|
||||
LiteLLM supports OpenAI text completion models
|
||||
|
|
|
@ -208,7 +208,7 @@ print(response)
|
|||
|
||||
Instead of using the `custom_llm_provider` arg to specify which provider you're using (e.g. together ai), you can just pass the provider name as part of the model name, and LiteLLM will parse it out.
|
||||
|
||||
Expected format: <custom_llm_provider>/<model_name>
|
||||
Expected format: `<custom_llm_provider>/<model_name>`
|
||||
|
||||
e.g. completion(model="together_ai/togethercomputer/Llama-2-7B-32K-Instruct", ...)
|
||||
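For example, a provider-prefixed call might look like the sketch below (the `TOGETHERAI_API_KEY` env var value is an illustrative assumption):

```python
import os
from litellm import completion

os.environ["TOGETHERAI_API_KEY"] = "..."  # assumed: your Together AI key

# LiteLLM parses "together_ai" out of the model string and routes the request accordingly
response = completion(
    model="together_ai/togethercomputer/Llama-2-7B-32K-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```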
|
||||
|
|
|
@ -8,6 +8,152 @@ import TabItem from '@theme/TabItem';
|
|||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||
</a>
|
||||
|
||||
## 🆕 `vertex_ai_beta/` route
|
||||
|
||||
New `vertex_ai_beta/` route. Adds support for system messages, tool_choice params, etc. by moving to httpx client (instead of vertex sdk).
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import json
|
||||
|
||||
## GET CREDENTIALS
|
||||
file_path = 'path/to/vertex_ai_service_account.json'
|
||||
|
||||
# Load the JSON file
|
||||
with open(file_path, 'r') as file:
|
||||
vertex_credentials = json.load(file)
|
||||
|
||||
# Convert to JSON string
|
||||
vertex_credentials_json = json.dumps(vertex_credentials)
|
||||
|
||||
## COMPLETION CALL
|
||||
response = completion(
|
||||
model="vertex_ai_beta/gemini-pro",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
vertex_credentials=vertex_credentials_json
|
||||
)
|
||||
```
|
||||
|
||||
### **System Message**
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import json
|
||||
|
||||
## GET CREDENTIALS
|
||||
file_path = 'path/to/vertex_ai_service_account.json'
|
||||
|
||||
# Load the JSON file
|
||||
with open(file_path, 'r') as file:
|
||||
vertex_credentials = json.load(file)
|
||||
|
||||
# Convert to JSON string
|
||||
vertex_credentials_json = json.dumps(vertex_credentials)
|
||||
|
||||
|
||||
response = completion(
|
||||
model="vertex_ai_beta/gemini-pro",
|
||||
messages=[{"content": "You are a good bot.","role": "system"}, {"content": "Hello, how are you?","role": "user"}],
|
||||
vertex_credentials=vertex_credentials_json
|
||||
)
|
||||
```
|
||||
|
||||
### **Function Calling**
|
||||
|
||||
Force Gemini to make tool calls with `tool_choice="required"`.
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import json
|
||||
|
||||
## GET CREDENTIALS
|
||||
file_path = 'path/to/vertex_ai_service_account.json'
|
||||
|
||||
# Load the JSON file
|
||||
with open(file_path, 'r') as file:
|
||||
vertex_credentials = json.load(file)
|
||||
|
||||
# Convert to JSON string
|
||||
vertex_credentials_json = json.dumps(vertex_credentials)
|
||||
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Your name is Litellm Bot, you are a helpful assistant",
|
||||
},
|
||||
# User asks for their name and weather in San Francisco
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello, what is your name and can you tell me the weather?",
|
||||
},
|
||||
]
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
}
|
||||
},
|
||||
"required": ["location"],
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
|
||||
data = {
    "model": "vertex_ai_beta/gemini-1.5-pro-preview-0514",
|
||||
"messages": messages,
|
||||
"tools": tools,
|
||||
"tool_choice": "required",
|
||||
"vertex_credentials": vertex_credentials_json
|
||||
}
|
||||
|
||||
## COMPLETION CALL
|
||||
print(completion(**data))
|
||||
```
|
||||
|
||||
### **JSON Schema**
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
## GET CREDENTIALS
|
||||
file_path = 'path/to/vertex_ai_service_account.json'
|
||||
|
||||
# Load the JSON file
|
||||
with open(file_path, 'r') as file:
|
||||
vertex_credentials = json.load(file)
|
||||
|
||||
# Convert to JSON string
|
||||
vertex_credentials_json = json.dumps(vertex_credentials)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": """
|
||||
List 5 popular cookie recipes.
|
||||
|
||||
Using this JSON schema:
|
||||
|
||||
Recipe = {"recipe_name": str}
|
||||
|
||||
Return a `list[Recipe]`
|
||||
"""
|
||||
}
|
||||
]
|
||||
|
||||
completion(model="vertex_ai_beta/gemini-1.5-flash-preview-0514", messages=messages, response_format={ "type": "json_object" })
|
||||
```
|
||||
|
||||
## Pre-requisites
|
||||
* `pip install google-cloud-aiplatform` (pre-installed on proxy docker image)
|
||||
* Authentication:
|
||||
|
@ -140,7 +286,7 @@ In certain use-cases you may need to make calls to the models and pass [safety s
|
|||
|
||||
```python
|
||||
response = completion(
|
||||
model="gemini/gemini-pro",
|
||||
model="vertex_ai/gemini-pro",
|
||||
messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}]
|
||||
safety_settings=[
|
||||
{
|
||||
|
@ -254,6 +400,7 @@ litellm.vertex_location = "us-central1 # Your Location
|
|||
| Model Name | Function Call |
|
||||
|------------------|--------------------------------------|
|
||||
| claude-3-opus@20240229 | `completion('vertex_ai/claude-3-opus@20240229', messages)` |
|
||||
| claude-3-5-sonnet@20240620 | `completion('vertex_ai/claude-3-5-sonnet@20240620', messages)` |
|
||||
| claude-3-sonnet@20240229 | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
|
||||
| claude-3-haiku@20240307 | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |
|
||||
|
||||
|
@ -363,8 +510,8 @@ response = completion(
|
|||
## Gemini 1.5 Pro (and Vision)
|
||||
| Model Name | Function Call |
|
||||
|------------------|--------------------------------------|
|
||||
| gemini-1.5-pro | `completion('gemini-1.5-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
|
||||
| gemini-1.5-flash-preview-0514 | `completion('gemini-1.5-flash-preview-0514', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
|
||||
| gemini-1.5-pro | `completion('gemini-1.5-pro', messages)`, `completion('vertex_ai/gemini-1.5-pro', messages)` |
|
||||
| gemini-1.5-flash-preview-0514 | `completion('gemini-1.5-flash-preview-0514', messages)`, `completion('vertex_ai/gemini-1.5-flash-preview-0514', messages)` |
|
||||
| gemini-1.5-pro-preview-0514 | `completion('gemini-1.5-pro-preview-0514', messages)`, `completion('vertex_ai/gemini-1.5-pro-preview-0514', messages)` |
|
||||
|
||||
|
||||
|
@ -680,6 +827,3 @@ s/o @[Darien Kindlund](https://www.linkedin.com/in/kindlund/) for this tutorial
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# 🚨 Alerting / Webhooks
|
||||
|
||||
Get alerts for:
|
||||
|
@ -15,6 +17,11 @@ Get alerts for:
|
|||
- **Spend** Weekly & Monthly spend per Team, Tag
|
||||
|
||||
|
||||
Works across:
|
||||
- [Slack](#quick-start)
|
||||
- [Discord](#advanced---using-discord-webhooks)
|
||||
- [Microsoft Teams](#advanced---using-ms-teams-webhooks)
|
||||
|
||||
## Quick Start
|
||||
|
||||
Set up a slack alert channel to receive alerts from proxy.
|
||||
|
@ -25,41 +32,33 @@ Get a slack webhook url from https://api.slack.com/messaging/webhooks
|
|||
|
||||
You can also use Discord Webhooks, see [here](#using-discord-webhooks)
|
||||
|
||||
### Step 2: Update config.yaml
|
||||
|
||||
- Set `SLACK_WEBHOOK_URL` in your proxy env to enable Slack alerts.
|
||||
- Just for testing purposes, let's save a bad key to our proxy.
|
||||
Set `SLACK_WEBHOOK_URL` in your proxy env to enable Slack alerts.
|
||||
|
||||
```bash
|
||||
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/<>/<>/<>"
|
||||
```
|
||||
|
||||
### Step 2: Setup Proxy
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
model_name: "azure-model"
|
||||
litellm_params:
|
||||
model: "azure/gpt-35-turbo"
|
||||
api_key: "my-bad-key" # 👈 bad key
|
||||
|
||||
general_settings:
|
||||
alerting: ["slack"]
|
||||
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
|
||||
|
||||
environment_variables:
|
||||
SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"
|
||||
SLACK_DAILY_REPORT_FREQUENCY: "86400" # 24 hours; Optional: defaults to 12 hours
|
||||
```
|
||||
|
||||
|
||||
### Step 3: Start proxy
|
||||
|
||||
Start proxy
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
## Testing Alerting is Setup Correctly
|
||||
|
||||
Make a GET request to `/health/services`, expect to see a test slack alert in your provided webhook slack channel
|
||||
### Step 3: Test it!
|
||||
|
||||
```shell
|
||||
curl -X GET 'http://localhost:4000/health/services?service=slack' \
|
||||
-H 'Authorization: Bearer sk-1234'
|
||||
|
||||
```bash
|
||||
curl -X GET 'http://0.0.0.0:4000/health/services?service=slack' \
|
||||
-H 'Authorization: Bearer sk-1234'
|
||||
```
|
||||
|
||||
## Advanced - Redacting Messages from Alerts
|
||||
|
@ -77,7 +76,34 @@ litellm_settings:
|
|||
```
|
||||
|
||||
|
||||
## Advanced - Add Metadata to alerts
|
||||
|
||||
Add alerting metadata to proxy calls for debugging.
|
||||
|
||||
```python
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
# request sent to model set on litellm proxy, `litellm --model`
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages = [],
|
||||
extra_body={
|
||||
"metadata": {
|
||||
"alerting_metadata": {
|
||||
"hello": "world"
|
||||
}
|
||||
}
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
**Expected Response**
|
||||
|
||||
<Image img={require('../../img/alerting_metadata.png')}/>
|
||||
|
||||
## Advanced - Opting into specific alert types
|
||||
|
||||
|
@ -108,6 +134,48 @@ AlertType = Literal[
|
|||
```
|
||||
|
||||
|
||||
## Advanced - Using MS Teams Webhooks
|
||||
|
||||
MS Teams provides a slack compatible webhook url that you can use for alerting
|
||||
|
||||
##### Quick Start
|
||||
|
||||
1. [Get a webhook url](https://learn.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook?tabs=newteams%2Cdotnet#create-an-incoming-webhook) for your Microsoft Teams channel
|
||||
|
||||
2. Add it to your .env
|
||||
|
||||
```bash
|
||||
SLACK_WEBHOOK_URL="https://berriai.webhook.office.com/webhookb2/...6901/IncomingWebhook/b55fa0c2a48647be8e6effedcd540266/e04b1092-4a3e-44a2-ab6b-29a0a4854d1d"
|
||||
```
|
||||
|
||||
3. Add it to your litellm config
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
model_name: "azure-model"
|
||||
litellm_params:
|
||||
model: "azure/gpt-35-turbo"
|
||||
api_key: "my-bad-key" # 👈 bad key
|
||||
|
||||
general_settings:
|
||||
alerting: ["slack"]
|
||||
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
|
||||
```
|
||||
|
||||
4. Run health check!
|
||||
|
||||
Call the proxy `/health/services` endpoint to test if your alerting connection is correctly setup.
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:4000/health/services?service=slack' \
|
||||
--header 'Authorization: Bearer sk-1234'
|
||||
```
|
||||
|
||||
|
||||
**Expected Response**
|
||||
|
||||
<Image img={require('../../img/ms_teams_alerting.png')}/>
|
||||
|
||||
## Advanced - Using Discord Webhooks
|
||||
|
||||
Discord provides a slack compatible webhook url that you can use for alerting
|
||||
|
@ -139,7 +207,6 @@ environment_variables:
|
|||
SLACK_WEBHOOK_URL: "https://discord.com/api/webhooks/1240030362193760286/cTLWt5ATn1gKmcy_982rl5xmYHsrM1IWJdmCL1AyOmU9JdQXazrp8L1_PYgUtgxj8x4f/slack"
|
||||
```
|
||||
|
||||
That's it! You're ready to go!
|
||||
|
||||
## Advanced - [BETA] Webhooks for Budget Alerts
|
||||
|
||||
|
|
|
@ -252,6 +252,31 @@ $ litellm --config /path/to/config.yaml
|
|||
```
|
||||
|
||||
|
||||
## Multiple OpenAI Organizations
|
||||
|
||||
Add all OpenAI models across all OpenAI organizations with just one model definition
|
||||
|
||||
```yaml
|
||||
- model_name: *
|
||||
litellm_params:
|
||||
model: openai/*
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
organization:
|
||||
- org-1
|
||||
- org-2
|
||||
- org-3
|
||||
```
|
||||
|
||||
LiteLLM will automatically create separate deployments for each org.
|
||||
|
||||
Confirm this via
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:4000/v1/model/info' \
|
||||
--header 'Authorization: Bearer ${LITELLM_KEY}' \
|
||||
--data ''
|
||||
```
|
||||
|
||||
## Load Balancing
|
||||
|
||||
:::info
|
||||
|
|
|
@ -27,7 +27,7 @@ docker-compose up
|
|||
|
||||
<Tabs>
|
||||
|
||||
<TabItem value="basic" label="Basic">
|
||||
<TabItem value="basic" label="Basic (No DB)">
|
||||
|
||||
### Step 1. CREATE config.yaml
|
||||
|
||||
|
@ -98,7 +98,13 @@ docker run ghcr.io/berriai/litellm:main-latest --port 8002 --num_workers 8
|
|||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="terraform" label="Terraform">
|
||||
|
||||
s/o [Nicholas Cecere](https://www.linkedin.com/in/nicholas-cecere-24243549/) for his LiteLLM User Management Terraform
|
||||
|
||||
👉 [Go here for Terraform](https://github.com/ncecere/terraform-litellm-user-mgmt)
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="base-image" label="use litellm as a base image">
|
||||
|
||||
```shell
|
||||
|
@ -380,6 +386,7 @@ kubectl port-forward service/litellm-service 4000:4000
|
|||
Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="helm-deploy" label="Helm">
|
||||
|
||||
|
||||
|
@ -425,7 +432,6 @@ If you need to set your litellm proxy config.yaml, you can find this in [values.
|
|||
|
||||
</TabItem>
|
||||
|
||||
|
||||
<TabItem value="helm-oci" label="Helm OCI Registry (GHCR)">
|
||||
|
||||
:::info
|
||||
|
@ -669,7 +675,7 @@ Once the stack is created, get the DatabaseURL of the Database resource, copy th
|
|||
#### 3. Connect to the EC2 Instance and deploy litellm on the EC2 container
|
||||
From the EC2 console, connect to the instance created by the stack (e.g., using SSH).
|
||||
|
||||
Run the following command, replacing <database_url> with the value you copied in step 2
|
||||
Run the following command, replacing `<database_url>` with the value you copied in step 2
|
||||
|
||||
```shell
|
||||
docker run --name litellm-proxy \
|
||||
|
|
|
@ -5,6 +5,7 @@ import Image from '@theme/IdealImage';
|
|||
Send an Email to your users when:
|
||||
- A Proxy API Key is created for them
|
||||
- Their API Key crosses its Budget
- All Team members of a LiteLLM Team -> when the team crosses its budget
|
||||
|
||||
<Image img={require('../../img/email_notifs.png')} style={{ width: '500px' }}/>
|
||||
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Model Management
|
||||
Add new models + Get model info without restarting proxy.
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# LiteLLM Proxy Performance
|
||||
|
||||
### Throughput - 30% Increase
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Grafana, Prometheus metrics [BETA]
|
||||
# 📈 Prometheus metrics [BETA]
|
||||
|
||||
LiteLLM exposes a `/metrics` endpoint for Prometheus to poll
|
||||
|
||||
|
@ -54,6 +54,13 @@ http://localhost:4000/metrics
|
|||
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
|
||||
| `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
|
||||
|
||||
### Budget Metrics
|
||||
| Metric Name | Description |
|
||||
|----------------------|--------------------------------------|
|
||||
| `litellm_remaining_team_budget_metric` | Remaining Budget for Team (A team created on LiteLLM) |
|
||||
| `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)|
|
||||
|
||||
|
||||
## Monitor System Health
|
||||
|
||||
To monitor the health of litellm adjacent services (redis / postgres), do:
|
||||
|
|
|
@ -409,6 +409,28 @@ print(response)
|
|||
</Tabs>
|
||||
|
||||
|
||||
### Content Policy Fallbacks
|
||||
|
||||
Fallback across providers (e.g. from Azure OpenAI to Anthropic) if you hit content policy violation errors.
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-3.5-turbo-small
|
||||
litellm_params:
|
||||
model: azure/chatgpt-v-2
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: "2023-07-01-preview"
|
||||
|
||||
- model_name: claude-opus
|
||||
litellm_params:
|
||||
model: claude-3-opus-20240229
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
|
||||
litellm_settings:
|
||||
content_policy_fallbacks: [{"gpt-3.5-turbo-small": ["claude-opus"]}]
|
||||
```
|
||||
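To see the fallback from the client side, one option is a normal request through the proxy — a sketch assuming the proxy above runs locally on port 4000 with key `sk-1234`:

```python
import openai

# assumed: LiteLLM proxy started with the config above
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# if azure/chatgpt-v-2 hits a content policy violation, the proxy retries on claude-opus
response = client.chat.completions.create(
    model="gpt-3.5-turbo-small",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response.choices[0].message.content)
```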
|
||||
### EU-Region Filtering (Pre-Call Checks)
|
||||
|
||||
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.
|
||||
|
|
|
@ -123,4 +123,18 @@ LiteLLM Enterprise: Enable [SSO login](./ui.md#setup-ssoauth-for-ui)
|
|||
4. User can now create their own keys
|
||||
|
||||
|
||||
<Image img={require('../../img/ui_self_serve_create_key.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
<Image img={require('../../img/ui_self_serve_create_key.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
|
||||
## Advanced
|
||||
### Setting custom logout URLs
|
||||
|
||||
Set `PROXY_LOGOUT_URL` in your .env if you want users to get redirected to a specific URL when they click logout
|
||||
|
||||
```
|
||||
export PROXY_LOGOUT_URL="https://www.google.com"
|
||||
```
|
||||
|
||||
<Image img={require('../../img/ui_logout.png')} style={{ width: '400px', height: 'auto' }} />
|
||||
|
||||
|
||||
|
|
154 docs/my-website/docs/proxy/team_budgets.md (new file)
|
@ -0,0 +1,154 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# 💰 Setting Team Budgets
|
||||
|
||||
Track spend and set budgets for your internal teams.
|
||||
|
||||
## Setting Monthly Team Budgets
|
||||
|
||||
### 1. Create a team
|
||||
- Set `max_budget=0.000000001` (the $ value the team is allowed to spend)
|
||||
- Set `budget_duration="1d"` (how frequently the budget should reset)
|
||||
|
||||
<Tabs>
|
||||
|
||||
<TabItem value="API" label="API">
|
||||
|
||||
Create a new team and set `max_budget` and `budget_duration`
|
||||
```shell
|
||||
curl -X POST 'http://0.0.0.0:4000/team/new' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"team_alias": "QA Prod Bot",
|
||||
"max_budget": 0.000000001,
|
||||
"budget_duration": "1d"
|
||||
}'
|
||||
```
|
||||
|
||||
Response
|
||||
```shell
|
||||
{
|
||||
"team_alias": "QA Prod Bot",
|
||||
"team_id": "de35b29e-6ca8-4f47-b804-2b79d07aa99a",
|
||||
"max_budget": 0.0001,
|
||||
"budget_duration": "1d",
|
||||
"budget_reset_at": "2024-06-14T22:48:36.594000Z"
|
||||
}
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="UI" label="Admin UI">
|
||||
<Image img={require('../../img/create_team_gif_good.gif')} />
|
||||
|
||||
</TabItem>
|
||||
|
||||
|
||||
</Tabs>
|
||||
|
||||
Possible values for `budget_duration`
|
||||
|
||||
| `budget_duration` | When Budget will reset |
|
||||
| --- | --- |
|
||||
| `budget_duration="1s"` | every 1 second |
|
||||
| `budget_duration="1m"` | every 1 min |
|
||||
| `budget_duration="1h"` | every 1 hour |
|
||||
| `budget_duration="1d"` | every 1 day |
|
||||
| `budget_duration="1mo"` | every 1 month |
|
||||
|
||||
|
||||
### 2. Create a key for the `team`
|
||||
|
||||
Create a key for Team=`QA Prod Bot` and `team_id="de35b29e-6ca8-4f47-b804-2b79d07aa99a"` from Step 1
|
||||
|
||||
<Tabs>
|
||||
|
||||
<TabItem value="api" label="API">
|
||||
|
||||
💡 **The budget for Team="QA Prod Bot" will apply to all keys in this team**
|
||||
|
||||
```shell
|
||||
curl -X POST 'http://0.0.0.0:4000/key/generate' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"team_id": "de35b29e-6ca8-4f47-b804-2b79d07aa99a"}'
|
||||
```
|
||||
|
||||
Response
|
||||
|
||||
```shell
|
||||
{"team_id":"de35b29e-6ca8-4f47-b804-2b79d07aa99a", "key":"sk-5qtncoYjzRcxMM4bDRktNQ"}
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="UI" label="Admin UI">
|
||||
<Image img={require('../../img/create_key_in_team.gif')} />
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
|
||||
### 3. Test It
|
||||
|
||||
Use the key from step 2 and run this request twice
|
||||
<Tabs>
|
||||
|
||||
<TabItem value="api" label="API">
|
||||
|
||||
```shell
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Authorization: Bearer sk-5qtncoYjzRcxMM4bDRktNQ' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d ' {
|
||||
"model": "llama3",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hi"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
On the 2nd request, expect to see the following exception
|
||||
|
||||
```shell
|
||||
{
|
||||
"error": {
|
||||
"message": "Budget has been exceeded! Current cost: 3.5e-06, Max budget: 1e-09",
|
||||
"type": "auth_error",
|
||||
"param": null,
|
||||
"code": 400
|
||||
}
|
||||
}
|
||||
```
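If you call the proxy through the OpenAI SDK instead of curl, the same budget error surfaces as an API error you can catch. A minimal sketch (the proxy URL and the key from step 2 are assumptions):

```python
import openai

# Use the team key from step 2 against the local proxy (both are placeholders).
client = openai.OpenAI(
    api_key="sk-5qtncoYjzRcxMM4bDRktNQ", base_url="http://localhost:4000"
)

try:
    client.chat.completions.create(
        model="llama3", messages=[{"role": "user", "content": "hi"}]
    )
except openai.APIStatusError as e:
    # The 2nd call past the budget returns the "Budget has been exceeded!" error.
    print("Request rejected:", e.status_code)
```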
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="UI" label="Admin UI">
|
||||
<Image img={require('../../img/test_key_budget.gif')} />
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Advanced
|
||||
|
||||
### Prometheus metrics for `remaining_budget`
|
||||
|
||||
[More info about Prometheus metrics here](https://docs.litellm.ai/docs/proxy/prometheus)
|
||||
|
||||
You'll need the following in your proxy config.yaml
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
success_callback: ["prometheus"]
|
||||
failure_callback: ["prometheus"]
|
||||
```
|
||||
|
||||
Expect to see this metric on Prometheus, tracking the remaining budget for the team
|
||||
|
||||
```shell
|
||||
litellm_remaining_team_budget_metric{team_alias="QA Prod Bot",team_id="de35b29e-6ca8-4f47-b804-2b79d07aa99a"} 9.699999999999992e-06
|
||||
```
|
||||
|
||||
|
|
@ -62,6 +62,14 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
|
|||
You can:
|
||||
- Add budgets to Teams
|
||||
|
||||
:::info
|
||||
|
||||
**Step-by-step tutorial on setting and resetting budgets on Teams (API or using the Admin UI):**
|
||||
|
||||
👉 [https://docs.litellm.ai/docs/proxy/team_budgets](https://docs.litellm.ai/docs/proxy/team_budgets)
|
||||
|
||||
:::
|
||||
|
||||
|
||||
#### **Add budgets to teams**
|
||||
```shell
|
||||
|
@ -413,6 +421,63 @@ curl 'http://0.0.0.0:4000/key/generate' \
|
|||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Reset Budgets
|
||||
|
||||
Reset budgets across keys/internal users/teams/customers
|
||||
|
||||
`budget_duration`: The budget is reset at the end of the specified duration. If not set, the budget is never reset. You can set the duration in seconds ("30s"), minutes ("30m"), hours ("30h"), or days ("30d").
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="users" label="Internal Users">
|
||||
|
||||
```bash
|
||||
curl 'http://0.0.0.0:4000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"max_budget": 10,
|
||||
"budget_duration": 10s, # 👈 KEY CHANGE
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="keys" label="Keys">
|
||||
|
||||
```bash
|
||||
curl 'http://0.0.0.0:4000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"max_budget": 10,
|
||||
"budget_duration": 10s, # 👈 KEY CHANGE
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="teams" label="Teams">
|
||||
|
||||
```bash
|
||||
curl 'http://0.0.0.0:4000/team/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"max_budget": 10,
|
||||
"budget_duration": 10s, # 👈 KEY CHANGE
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
**Note:** By default, the server checks for resets every 10 minutes, to minimize DB calls.
|
||||
|
||||
To change this, set `proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time`
|
||||
|
||||
E.g.: check every 1 second
|
||||
```yaml
|
||||
general_settings:
|
||||
proxy_budget_rescheduler_min_time: 1
|
||||
proxy_budget_rescheduler_max_time: 1
|
||||
```
|
||||
|
||||
## Set Rate Limits
|
||||
|
||||
You can set:
|
||||
|
|
|
@ -95,7 +95,7 @@ print(response)
|
|||
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
|
||||
- `router.aimage_generation()` - async image generation calls
|
||||
|
||||
## Advanced - Routing Strategies
|
||||
## Advanced - Routing Strategies ⭐️
|
||||
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based
|
||||
|
||||
Router provides 4 strategies for routing your calls across multiple deployments:
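Each strategy is chosen when the Router is constructed. A minimal sketch (model names and keys are placeholders; the `routing_strategy` value shown is just one of the options covered below):

```python
from litellm import Router

# Placeholder deployment list - swap in your real models and keys.
model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "sk-..."},
    },
]

# Pick the routing strategy at construction time (value shown is illustrative).
router = Router(model_list=model_list, routing_strategy="latency-based-routing")
```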
|
||||
|
@ -262,7 +262,7 @@ if response is not None:
|
|||
)
|
||||
```
|
||||
|
||||
### Set Time Window
|
||||
#### Set Time Window
|
||||
|
||||
Set time window for how far back to consider when averaging latency for a deployment.
|
||||
|
||||
|
@ -278,7 +278,7 @@ router_settings:
|
|||
routing_strategy_args: {"ttl": 10}
|
||||
```
|
||||
|
||||
### Set Lowest Latency Buffer
|
||||
#### Set Lowest Latency Buffer
|
||||
|
||||
Set a buffer within which deployments are candidates for making calls to.
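A sketch of what this typically looks like when configuring the Router in Python (the `lowest_latency_buffer` key inside `routing_strategy_args` is an assumption here - confirm the exact name against the settings shown below):

```python
from litellm import Router

# A buffer of 0.5 would keep deployments within 50% of the lowest observed
# latency in the candidate pool (parameter name assumed, not confirmed).
router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "sk-..."},
        },
    ],
    routing_strategy="latency-based-routing",
    routing_strategy_args={"lowest_latency_buffer": 0.5},
)
```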
|
||||
|
||||
|
@ -468,6 +468,122 @@ asyncio.run(router_acompletion())
|
|||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="custom" label="Custom Routing Strategy">
|
||||
|
||||
**Plug in a custom routing strategy to select deployments**
|
||||
|
||||
|
||||
Step 1. Define your custom routing strategy
|
||||
|
||||
```python
|
||||
|
||||
from litellm.router import CustomRoutingStrategyBase
|
||||
class CustomRoutingStrategy(CustomRoutingStrategyBase):
|
||||
async def async_get_available_deployment(
|
||||
self,
|
||||
model: str,
|
||||
messages: Optional[List[Dict[str, str]]] = None,
|
||||
input: Optional[Union[str, List]] = None,
|
||||
specific_deployment: Optional[bool] = False,
|
||||
request_kwargs: Optional[Dict] = None,
|
||||
):
|
||||
"""
|
||||
Asynchronously retrieves the available deployment based on the given parameters.
|
||||
|
||||
Args:
|
||||
model (str): The name of the model.
|
||||
messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.
|
||||
input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.
|
||||
specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.
|
||||
request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.
|
||||
|
||||
Returns:
|
||||
Returns an element from litellm.router.model_list
|
||||
|
||||
"""
|
||||
print("In CUSTOM async get available deployment")
|
||||
model_list = router.model_list
|
||||
print("router model list=", model_list)
|
||||
for model in model_list:
|
||||
if isinstance(model, dict):
|
||||
if model["litellm_params"]["model"] == "openai/very-special-endpoint":
|
||||
return model
|
||||
pass
|
||||
|
||||
def get_available_deployment(
|
||||
self,
|
||||
model: str,
|
||||
messages: Optional[List[Dict[str, str]]] = None,
|
||||
input: Optional[Union[str, List]] = None,
|
||||
specific_deployment: Optional[bool] = False,
|
||||
request_kwargs: Optional[Dict] = None,
|
||||
):
|
||||
"""
|
||||
Synchronously retrieves the available deployment based on the given parameters.
|
||||
|
||||
Args:
|
||||
model (str): The name of the model.
|
||||
messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.
|
||||
input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.
|
||||
specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.
|
||||
request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.
|
||||
|
||||
Returns:
|
||||
Returns an element from litellm.router.model_list
|
||||
|
||||
"""
|
||||
pass
|
||||
```
|
||||
|
||||
Step 2. Initialize Router with custom routing strategy
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "openai/very-special-endpoint",
|
||||
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/", # If you are Krrish, this is OpenAI Endpoint3 on our Railway endpoint :)
|
||||
"api_key": "fake-key",
|
||||
},
|
||||
"model_info": {"id": "very-special-endpoint"},
|
||||
},
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "openai/fast-endpoint",
|
||||
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
|
||||
"api_key": "fake-key",
|
||||
},
|
||||
"model_info": {"id": "fast-endpoint"},
|
||||
},
|
||||
],
|
||||
set_verbose=True,
|
||||
debug_level="DEBUG",
|
||||
timeout=1,
|
||||
) # type: ignore
|
||||
|
||||
router.set_custom_routing_strategy(CustomRoutingStrategy()) # 👈 Set your routing strategy here
|
||||
```
|
||||
|
||||
Step 3. Test your routing strategy. Expect your custom routing strategy to be called when running `router.acompletion` requests
|
||||
```python
|
||||
for _ in range(10):
|
||||
response = await router.acompletion(
|
||||
model="azure-model", messages=[{"role": "user", "content": "hello"}]
|
||||
)
|
||||
print(response)
|
||||
_picked_model_id = response._hidden_params["model_id"]
|
||||
print("picked model=", _picked_model_id)
|
||||
```
|
||||
|
||||
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="lowest-cost" label="Lowest Cost Routing (Async)">
|
||||
|
||||
Picks a deployment based on the lowest cost
|
||||
|
@ -563,7 +679,6 @@ asyncio.run(router_acompletion())
|
|||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
|
||||
## Basic Reliability
|
||||
|
@ -790,85 +905,205 @@ If the error is a context window exceeded error, fall back to a larger model gro
|
|||
|
||||
Fallbacks are done in order - `["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"]` will try 'gpt-3.5-turbo' first, then 'gpt-4', etc.
|
||||
|
||||
You can also set 'default_fallbacks', in case a specific model group is misconfigured / bad.
|
||||
You can also set `default_fallbacks`, in case a specific model group is misconfigured / bad.
|
||||
|
||||
There are 3 types of fallbacks:
|
||||
- `content_policy_fallbacks`: For litellm.ContentPolicyViolationError - LiteLLM maps content policy violation errors across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8495C27-L8495C54)
|
||||
- `context_window_fallbacks`: For litellm.ContextWindowExceededErrors - LiteLLM maps context window error messages across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8469)
|
||||
- `fallbacks`: For all remaining errors - e.g. litellm.RateLimitError
|
||||
|
||||
**Content Policy Violation Fallback**
|
||||
|
||||
Key change:
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
model_list = [
|
||||
{ # list of model deployments
|
||||
"model_name": "azure/gpt-3.5-turbo", # openai model name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"api_key": "bad-key",
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE")
|
||||
},
|
||||
"tpm": 240000,
|
||||
"rpm": 1800
|
||||
},
|
||||
{ # list of model deployments
|
||||
"model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"api_key": "bad-key",
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE")
|
||||
},
|
||||
"tpm": 240000,
|
||||
"rpm": 1800
|
||||
},
|
||||
{
|
||||
"model_name": "azure/gpt-3.5-turbo", # openai model name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "azure/chatgpt-functioncalling",
|
||||
"api_key": "bad-key",
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE")
|
||||
},
|
||||
"tpm": 240000,
|
||||
"rpm": 1800
|
||||
},
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo", # openai model name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "gpt-3.5-turbo",
|
||||
"api_key": os.getenv("OPENAI_API_KEY"),
|
||||
},
|
||||
"tpm": 1000000,
|
||||
"rpm": 9000
|
||||
},
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo-16k", # openai model name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "gpt-3.5-turbo-16k",
|
||||
"api_key": os.getenv("OPENAI_API_KEY"),
|
||||
},
|
||||
"tpm": 1000000,
|
||||
"rpm": 9000
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
router = Router(model_list=model_list,
|
||||
fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}],
|
||||
default_fallbacks=["gpt-3.5-turbo-16k"],
|
||||
context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
|
||||
set_verbose=True)
|
||||
|
||||
|
||||
user_message = "Hello, whats the weather in San Francisco??"
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
# normal fallback call
|
||||
response = router.completion(model="azure/gpt-3.5-turbo", messages=messages)
|
||||
|
||||
# context window fallback call
|
||||
response = router.completion(model="azure/gpt-3.5-turbo-context-fallback", messages=messages)
|
||||
|
||||
print(f"response: {response}")
|
||||
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "claude-2",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": Exception("content filtering policy"),
|
||||
},
|
||||
},
|
||||
{
|
||||
"model_name": "my-fallback-model",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": "This works!",
|
||||
},
|
||||
},
|
||||
],
|
||||
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
|
||||
# fallbacks=[..], # [OPTIONAL]
|
||||
# context_window_fallbacks=[..], # [OPTIONAL]
|
||||
)
|
||||
|
||||
response = router.completion(
|
||||
model="claude-2",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
In your proxy config.yaml just add this line 👇
|
||||
|
||||
```yaml
|
||||
router_settings:
|
||||
  content_policy_fallbacks: [{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
**Context Window Exceeded Fallback**
|
||||
|
||||
Key change:
|
||||
|
||||
```python
|
||||
context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "claude-2",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": Exception("prompt is too long"),
|
||||
},
|
||||
},
|
||||
{
|
||||
"model_name": "my-fallback-model",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": "This works!",
|
||||
},
|
||||
},
|
||||
],
|
||||
context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
|
||||
# fallbacks=[..], # [OPTIONAL]
|
||||
# content_policy_fallbacks=[..], # [OPTIONAL]
|
||||
)
|
||||
|
||||
response = router.completion(
|
||||
model="claude-2",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
In your proxy config.yaml just add this line 👇
|
||||
|
||||
```yaml
|
||||
router_settings:
|
||||
  context_window_fallbacks: [{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
**Regular Fallbacks**
|
||||
|
||||
Key change:
|
||||
|
||||
```python
|
||||
fallbacks=[{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "claude-2",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": Exception("this is a rate limit error"),
|
||||
},
|
||||
},
|
||||
{
|
||||
"model_name": "my-fallback-model",
|
||||
"litellm_params": {
|
||||
"model": "claude-2",
|
||||
"api_key": "",
|
||||
"mock_response": "This works!",
|
||||
},
|
||||
},
|
||||
],
|
||||
fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
|
||||
# context_window_fallbacks=[..], # [OPTIONAL]
|
||||
# content_policy_fallbacks=[..], # [OPTIONAL]
|
||||
)
|
||||
|
||||
response = router.completion(
|
||||
model="claude-2",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
In your proxy config.yaml just add this line 👇
|
||||
|
||||
```yaml
|
||||
router_settings:
|
||||
  fallbacks: [{"claude-2": ["my-fallback-model"]}]
|
||||
```
|
||||
|
||||
Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Caching
|
||||
|
||||
In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching.
|
||||
|
|
|
@ -23,9 +23,13 @@ https://api.together.xyz/playground/chat?model=togethercomputer%2Fllama-2-70b-ch
|
|||
model_name = "together_ai/togethercomputer/llama-2-70b-chat"
|
||||
response = completion(model=model_name, messages=messages)
|
||||
print(response)
|
||||
```
|
||||
|
||||
|
||||
```
|
||||
|
||||
{'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': "\n\nI'm not able to provide real-time weather information. However, I can suggest"}}], 'created': 1691629657.9288375, 'model': 'togethercomputer/llama-2-70b-chat', 'usage': {'prompt_tokens': 9, 'completion_tokens': 17, 'total_tokens': 26}}
|
||||
```
|
||||
|
||||
|
||||
LiteLLM handles the prompt formatting for Together AI's Llama2 models as well, converting your message to the
|
||||
|
|
|
@ -38,9 +38,6 @@ const config = {
|
|||
disableInDev: false,
|
||||
},
|
||||
],
|
||||
[ require.resolve('docusaurus-lunr-search'), {
|
||||
languages: ['en'] // language codes
|
||||
}],
|
||||
() => ({
|
||||
name: 'cripchat',
|
||||
injectHtmlTags() {
|
||||
|
@ -90,6 +87,15 @@ const config = {
|
|||
({
|
||||
// Replace with your project's social card
|
||||
image: 'img/docusaurus-social-card.png',
|
||||
algolia: {
|
||||
// The application ID provided by Algolia
|
||||
appId: 'NU85Y4NU0B',
|
||||
|
||||
// Public API key: it is safe to commit it
|
||||
apiKey: '4e0cf8c3020d0c876ad9174cea5c01fb',
|
||||
|
||||
indexName: 'litellm',
|
||||
},
|
||||
navbar: {
|
||||
title: '🚅 LiteLLM',
|
||||
items: [
|
||||
|
@ -138,8 +144,8 @@ const config = {
|
|||
title: 'Docs',
|
||||
items: [
|
||||
{
|
||||
label: 'Tutorial',
|
||||
to: '/docs/index',
|
||||
label: 'Getting Started',
|
||||
to: 'https://docs.litellm.ai/docs/',
|
||||
},
|
||||
],
|
||||
},
|
||||
|
|
BIN docs/my-website/img/alerting_metadata.png (new file, binary not shown, 207 KiB)
BIN docs/my-website/img/create_key_in_team.gif (new file, binary not shown, 3.2 MiB)
BIN docs/my-website/img/create_team_gif_good.gif (new file, binary not shown, 2.7 MiB)
BIN docs/my-website/img/ms_teams_alerting.png (new file, binary not shown, 241 KiB)
BIN docs/my-website/img/test_key_budget.gif (new file, binary not shown, 1.2 MiB)
BIN docs/my-website/img/ui_logout.png (new file, binary not shown, 27 KiB)
2308 docs/my-website/package-lock.json (generated, diff suppressed because it is too large)
|
@ -23,8 +23,8 @@
|
|||
"docusaurus": "^1.14.7",
|
||||
"docusaurus-lunr-search": "^2.4.1",
|
||||
"prism-react-renderer": "^1.3.5",
|
||||
"react": "^17.0.2",
|
||||
"react-dom": "^17.0.2",
|
||||
"react": "^18.1.0",
|
||||
"react-dom": "^18.1.0",
|
||||
"sharp": "^0.32.6",
|
||||
"uuid": "^9.0.1"
|
||||
},
|
||||
|
|
|
@ -43,6 +43,7 @@ const sidebars = {
|
|||
"proxy/cost_tracking",
|
||||
"proxy/self_serve",
|
||||
"proxy/users",
|
||||
"proxy/team_budgets",
|
||||
"proxy/customers",
|
||||
"proxy/billing",
|
||||
"proxy/user_keys",
|
||||
|
@ -54,6 +55,7 @@ const sidebars = {
|
|||
items: ["proxy/logging", "proxy/streaming_logging"],
|
||||
},
|
||||
"proxy/ui",
|
||||
"proxy/prometheus",
|
||||
"proxy/email",
|
||||
"proxy/multiple_admins",
|
||||
"proxy/team_based_routing",
|
||||
|
@ -70,7 +72,6 @@ const sidebars = {
|
|||
"proxy/pii_masking",
|
||||
"proxy/prompt_injection",
|
||||
"proxy/caching",
|
||||
"proxy/prometheus",
|
||||
"proxy/call_hooks",
|
||||
"proxy/rules",
|
||||
"proxy/cli",
|
||||
|
@ -87,6 +88,7 @@ const sidebars = {
|
|||
},
|
||||
items: [
|
||||
"completion/input",
|
||||
"completion/drop_params",
|
||||
"completion/prompt_formatting",
|
||||
"completion/output",
|
||||
"exception_mapping",
|
||||
|
@ -133,10 +135,11 @@ const sidebars = {
|
|||
"providers/vertex",
|
||||
"providers/palm",
|
||||
"providers/gemini",
|
||||
"providers/mistral",
|
||||
"providers/anthropic",
|
||||
"providers/aws_sagemaker",
|
||||
"providers/bedrock",
|
||||
"providers/mistral",
|
||||
"providers/codestral",
|
||||
"providers/cohere",
|
||||
"providers/anyscale",
|
||||
"providers/huggingface",
|
||||
|
@ -170,10 +173,8 @@ const sidebars = {
|
|||
"proxy/custom_pricing",
|
||||
"routing",
|
||||
"scheduler",
|
||||
"rules",
|
||||
"set_keys",
|
||||
"budget_manager",
|
||||
"contributing",
|
||||
"secret",
|
||||
"completion/token_usage",
|
||||
"load_test",
|
||||
|
@ -181,11 +182,11 @@ const sidebars = {
|
|||
type: "category",
|
||||
label: "Logging & Observability",
|
||||
items: [
|
||||
"observability/langfuse_integration",
|
||||
"observability/logfire_integration",
|
||||
"debugging/local_debugging",
|
||||
"observability/raw_request_response",
|
||||
"observability/callbacks",
|
||||
"observability/custom_callback",
|
||||
"observability/langfuse_integration",
|
||||
"observability/sentry",
|
||||
"observability/lago",
|
||||
"observability/openmeter",
|
||||
|
@ -223,14 +224,16 @@ const sidebars = {
|
|||
},
|
||||
{
|
||||
type: "category",
|
||||
label: "LangChain, LlamaIndex Integration",
|
||||
items: ["langchain/langchain"],
|
||||
label: "LangChain, LlamaIndex, Instructor Integration",
|
||||
items: ["langchain/langchain", "tutorials/instructor"],
|
||||
},
|
||||
{
|
||||
type: "category",
|
||||
label: "Extras",
|
||||
items: [
|
||||
"extras/contributing",
|
||||
"contributing",
|
||||
"rules",
|
||||
"proxy_server",
|
||||
{
|
||||
type: "category",
|
||||
|
|
File diff suppressed because it is too large
|
@ -93,7 +93,7 @@ class _ENTERPRISE_BannedKeywords(CustomLogger):
|
|||
response.choices[0], litellm.utils.Choices
|
||||
):
|
||||
for word in self.banned_keywords_list:
|
||||
self.test_violation(test_str=response.choices[0].message.content)
|
||||
self.test_violation(test_str=response.choices[0].message.content or "")
|
||||
|
||||
async def async_post_call_streaming_hook(
|
||||
self,
|
||||
|
|
|
@ -122,236 +122,6 @@ async def ui_get_spend_by_tags(
|
|||
return {"spend_per_tag": ui_tags}
|
||||
|
||||
|
||||
async def view_spend_logs_from_clickhouse(
|
||||
api_key=None, user_id=None, request_id=None, start_date=None, end_date=None
|
||||
):
|
||||
verbose_logger.debug("Reading logs from Clickhouse")
|
||||
import os
|
||||
|
||||
# if user has setup clickhouse
|
||||
# TODO: Move this to be a helper function
|
||||
# querying clickhouse for this data
|
||||
import clickhouse_connect
|
||||
from datetime import datetime
|
||||
|
||||
port = os.getenv("CLICKHOUSE_PORT")
|
||||
if port is not None and isinstance(port, str):
|
||||
port = int(port)
|
||||
|
||||
client = clickhouse_connect.get_client(
|
||||
host=os.getenv("CLICKHOUSE_HOST"),
|
||||
port=port,
|
||||
username=os.getenv("CLICKHOUSE_USERNAME", ""),
|
||||
password=os.getenv("CLICKHOUSE_PASSWORD", ""),
|
||||
)
|
||||
if (
|
||||
start_date is not None
|
||||
and isinstance(start_date, str)
|
||||
and end_date is not None
|
||||
and isinstance(end_date, str)
|
||||
):
|
||||
# Convert the date strings to datetime objects
|
||||
start_date_obj = datetime.strptime(start_date, "%Y-%m-%d")
|
||||
end_date_obj = datetime.strptime(end_date, "%Y-%m-%d")
|
||||
|
||||
# get top spend per day
|
||||
response = client.query(
|
||||
f"""
|
||||
SELECT
|
||||
toDate(startTime) AS day,
|
||||
sum(spend) AS total_spend
|
||||
FROM
|
||||
spend_logs
|
||||
WHERE
|
||||
toDate(startTime) BETWEEN toDate('2024-02-01') AND toDate('2024-02-29')
|
||||
GROUP BY
|
||||
day
|
||||
ORDER BY
|
||||
total_spend
|
||||
"""
|
||||
)
|
||||
|
||||
results = []
|
||||
result_rows = list(response.result_rows)
|
||||
for response in result_rows:
|
||||
current_row = {}
|
||||
current_row["users"] = {"example": 0.0}
|
||||
current_row["models"] = {}
|
||||
|
||||
current_row["spend"] = float(response[1])
|
||||
current_row["startTime"] = str(response[0])
|
||||
|
||||
# stubbed api_key
|
||||
current_row[""] = 0.0 # type: ignore
|
||||
results.append(current_row)
|
||||
|
||||
return results
|
||||
else:
|
||||
# check if spend logs exist, if it does then return last 10 logs, sorted in descending order of startTime
|
||||
response = client.query(
|
||||
"""
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
default.spend_logs
|
||||
ORDER BY
|
||||
startTime DESC
|
||||
LIMIT
|
||||
10
|
||||
"""
|
||||
)
|
||||
|
||||
# get size of spend logs
|
||||
num_rows = client.query("SELECT count(*) FROM default.spend_logs")
|
||||
num_rows = num_rows.result_rows[0][0]
|
||||
|
||||
# safely access num_rows.result_rows[0][0]
|
||||
if num_rows is None:
|
||||
num_rows = 0
|
||||
|
||||
raw_rows = list(response.result_rows)
|
||||
response_data = {
|
||||
"logs": raw_rows,
|
||||
"log_count": num_rows,
|
||||
}
|
||||
return response_data
|
||||
|
||||
|
||||
def _create_clickhouse_material_views(client=None, table_names=[]):
|
||||
# Create Materialized Views if they don't exist
|
||||
# Materialized Views send new inserted rows to the aggregate tables
|
||||
|
||||
verbose_logger.debug("Clickhouse: Creating Materialized Views")
|
||||
if "daily_aggregated_spend_per_model_mv" not in table_names:
|
||||
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_model_mv")
|
||||
client.command(
|
||||
"""
|
||||
CREATE MATERIALIZED VIEW daily_aggregated_spend_per_model_mv
|
||||
TO daily_aggregated_spend_per_model
|
||||
AS
|
||||
SELECT
|
||||
toDate(startTime) as day,
|
||||
sumState(spend) AS DailySpend,
|
||||
model as model
|
||||
FROM spend_logs
|
||||
GROUP BY
|
||||
day, model
|
||||
"""
|
||||
)
|
||||
if "daily_aggregated_spend_per_api_key_mv" not in table_names:
|
||||
verbose_logger.debug(
|
||||
"Clickhouse: Creating daily_aggregated_spend_per_api_key_mv"
|
||||
)
|
||||
client.command(
|
||||
"""
|
||||
CREATE MATERIALIZED VIEW daily_aggregated_spend_per_api_key_mv
|
||||
TO daily_aggregated_spend_per_api_key
|
||||
AS
|
||||
SELECT
|
||||
toDate(startTime) as day,
|
||||
sumState(spend) AS DailySpend,
|
||||
api_key as api_key
|
||||
FROM spend_logs
|
||||
GROUP BY
|
||||
day, api_key
|
||||
"""
|
||||
)
|
||||
if "daily_aggregated_spend_per_user_mv" not in table_names:
|
||||
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_user_mv")
|
||||
client.command(
|
||||
"""
|
||||
CREATE MATERIALIZED VIEW daily_aggregated_spend_per_user_mv
|
||||
TO daily_aggregated_spend_per_user
|
||||
AS
|
||||
SELECT
|
||||
toDate(startTime) as day,
|
||||
sumState(spend) AS DailySpend,
|
||||
user as user
|
||||
FROM spend_logs
|
||||
GROUP BY
|
||||
day, user
|
||||
"""
|
||||
)
|
||||
if "daily_aggregated_spend_mv" not in table_names:
|
||||
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_mv")
|
||||
client.command(
|
||||
"""
|
||||
CREATE MATERIALIZED VIEW daily_aggregated_spend_mv
|
||||
TO daily_aggregated_spend
|
||||
AS
|
||||
SELECT
|
||||
toDate(startTime) as day,
|
||||
sumState(spend) AS DailySpend
|
||||
FROM spend_logs
|
||||
GROUP BY
|
||||
day
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def _create_clickhouse_aggregate_tables(client=None, table_names=[]):
|
||||
# Basic Logging works without this - this is only used for low latency reporting apis
|
||||
verbose_logger.debug("Clickhouse: Creating Aggregate Tables")
|
||||
|
||||
# Create Aggregate Tables if they don't exist
|
||||
if "daily_aggregated_spend_per_model" not in table_names:
|
||||
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_model")
|
||||
client.command(
|
||||
"""
|
||||
CREATE TABLE daily_aggregated_spend_per_model
|
||||
(
|
||||
`day` Date,
|
||||
`DailySpend` AggregateFunction(sum, Float64),
|
||||
`model` String
|
||||
)
|
||||
ENGINE = SummingMergeTree()
|
||||
ORDER BY (day, model);
|
||||
"""
|
||||
)
|
||||
if "daily_aggregated_spend_per_api_key" not in table_names:
|
||||
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_api_key")
|
||||
client.command(
|
||||
"""
|
||||
CREATE TABLE daily_aggregated_spend_per_api_key
|
||||
(
|
||||
`day` Date,
|
||||
`DailySpend` AggregateFunction(sum, Float64),
|
||||
`api_key` String
|
||||
)
|
||||
ENGINE = SummingMergeTree()
|
||||
ORDER BY (day, api_key);
|
||||
"""
|
||||
)
|
||||
if "daily_aggregated_spend_per_user" not in table_names:
|
||||
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_user")
|
||||
client.command(
|
||||
"""
|
||||
CREATE TABLE daily_aggregated_spend_per_user
|
||||
(
|
||||
`day` Date,
|
||||
`DailySpend` AggregateFunction(sum, Float64),
|
||||
`user` String
|
||||
)
|
||||
ENGINE = SummingMergeTree()
|
||||
ORDER BY (day, user);
|
||||
"""
|
||||
)
|
||||
if "daily_aggregated_spend" not in table_names:
|
||||
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend")
|
||||
client.command(
|
||||
"""
|
||||
CREATE TABLE daily_aggregated_spend
|
||||
(
|
||||
`day` Date,
|
||||
`DailySpend` AggregateFunction(sum, Float64),
|
||||
)
|
||||
ENGINE = SummingMergeTree()
|
||||
ORDER BY (day);
|
||||
"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
def _forecast_daily_cost(data: list):
|
||||
import requests # type: ignore
|
||||
from datetime import datetime, timedelta
|
||||
|
|
|
@ -13,7 +13,10 @@ from litellm._logging import (
|
|||
verbose_logger,
|
||||
json_logs,
|
||||
_turn_on_json,
|
||||
log_level,
|
||||
)
|
||||
|
||||
|
||||
from litellm.proxy._types import (
|
||||
KeyManagementSystem,
|
||||
KeyManagementSettings,
|
||||
|
@ -34,7 +37,7 @@ input_callback: List[Union[str, Callable]] = []
|
|||
success_callback: List[Union[str, Callable]] = []
|
||||
failure_callback: List[Union[str, Callable]] = []
|
||||
service_callback: List[Union[str, Callable]] = []
|
||||
_custom_logger_compatible_callbacks_literal = Literal["lago", "openmeter"]
|
||||
_custom_logger_compatible_callbacks_literal = Literal["lago", "openmeter", "logfire"]
|
||||
callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = []
|
||||
_langfuse_default_tags: Optional[
|
||||
List[
|
||||
|
@ -73,7 +76,7 @@ token: Optional[str] = (
|
|||
)
|
||||
telemetry = True
|
||||
max_tokens = 256 # OpenAI Defaults
|
||||
drop_params = False
|
||||
drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
|
||||
modify_params = False
|
||||
retry = True
|
||||
### AUTH ###
|
||||
|
@ -240,6 +243,7 @@ num_retries: Optional[int] = None # per model endpoint
|
|||
default_fallbacks: Optional[List] = None
|
||||
fallbacks: Optional[List] = None
|
||||
context_window_fallbacks: Optional[List] = None
|
||||
content_policy_fallbacks: Optional[List] = None
|
||||
allowed_fails: int = 0
|
||||
num_retries_per_request: Optional[int] = (
|
||||
None # for the request overall (incl. fallbacks + model retries)
|
||||
|
@ -337,6 +341,7 @@ bedrock_models: List = []
|
|||
deepinfra_models: List = []
|
||||
perplexity_models: List = []
|
||||
watsonx_models: List = []
|
||||
gemini_models: List = []
|
||||
for key, value in model_cost.items():
|
||||
if value.get("litellm_provider") == "openai":
|
||||
open_ai_chat_completion_models.append(key)
|
||||
|
@ -383,13 +388,16 @@ for key, value in model_cost.items():
|
|||
perplexity_models.append(key)
|
||||
elif value.get("litellm_provider") == "watsonx":
|
||||
watsonx_models.append(key)
|
||||
|
||||
elif value.get("litellm_provider") == "gemini":
|
||||
gemini_models.append(key)
|
||||
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
|
||||
openai_compatible_endpoints: List = [
|
||||
"api.perplexity.ai",
|
||||
"api.endpoints.anyscale.com/v1",
|
||||
"api.deepinfra.com/v1/openai",
|
||||
"api.mistral.ai/v1",
|
||||
"codestral.mistral.ai/v1/chat/completions",
|
||||
"codestral.mistral.ai/v1/fim/completions",
|
||||
"api.groq.com/openai/v1",
|
||||
"api.deepseek.com/v1",
|
||||
"api.together.xyz/v1",
|
||||
|
@ -401,6 +409,7 @@ openai_compatible_providers: List = [
|
|||
"anyscale",
|
||||
"mistral",
|
||||
"groq",
|
||||
"codestral",
|
||||
"deepseek",
|
||||
"deepinfra",
|
||||
"perplexity",
|
||||
|
@ -592,6 +601,7 @@ model_list = (
|
|||
+ maritalk_models
|
||||
+ vertex_language_models
|
||||
+ watsonx_models
|
||||
+ gemini_models
|
||||
)
|
||||
|
||||
provider_list: List = [
|
||||
|
@ -607,6 +617,7 @@ provider_list: List = [
|
|||
"together_ai",
|
||||
"openrouter",
|
||||
"vertex_ai",
|
||||
"vertex_ai_beta",
|
||||
"palm",
|
||||
"gemini",
|
||||
"ai21",
|
||||
|
@ -627,6 +638,8 @@ provider_list: List = [
|
|||
"anyscale",
|
||||
"mistral",
|
||||
"groq",
|
||||
"codestral",
|
||||
"text-completion-codestral",
|
||||
"deepseek",
|
||||
"maritalk",
|
||||
"voyage",
|
||||
|
@ -664,6 +677,7 @@ models_by_provider: dict = {
|
|||
"perplexity": perplexity_models,
|
||||
"maritalk": maritalk_models,
|
||||
"watsonx": watsonx_models,
|
||||
"gemini": gemini_models,
|
||||
}
|
||||
|
||||
# mapping for those models which have larger equivalents
|
||||
|
@ -716,6 +730,7 @@ openai_image_generation_models = ["dall-e-2", "dall-e-3"]
|
|||
|
||||
from .timeout import timeout
|
||||
from .cost_calculator import completion_cost
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging
|
||||
from .utils import (
|
||||
client,
|
||||
exception_type,
|
||||
|
@ -724,12 +739,11 @@ from .utils import (
|
|||
token_counter,
|
||||
create_pretrained_tokenizer,
|
||||
create_tokenizer,
|
||||
cost_per_token,
|
||||
supports_function_calling,
|
||||
supports_parallel_function_calling,
|
||||
supports_vision,
|
||||
supports_system_messages,
|
||||
get_litellm_params,
|
||||
Logging,
|
||||
acreate,
|
||||
get_model_list,
|
||||
get_max_tokens,
|
||||
|
@ -749,9 +763,10 @@ from .utils import (
|
|||
get_first_chars_messages,
|
||||
ModelResponse,
|
||||
ImageResponse,
|
||||
ImageObject,
|
||||
get_provider_fields,
|
||||
)
|
||||
|
||||
from .types.utils import ImageObject
|
||||
from .llms.huggingface_restapi import HuggingfaceConfig
|
||||
from .llms.anthropic import AnthropicConfig
|
||||
from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig
|
||||
|
@ -768,6 +783,7 @@ from .llms.gemini import GeminiConfig
|
|||
from .llms.nlp_cloud import NLPCloudConfig
|
||||
from .llms.aleph_alpha import AlephAlphaConfig
|
||||
from .llms.petals import PetalsConfig
|
||||
from .llms.vertex_httpx import VertexGeminiConfig
|
||||
from .llms.vertex_ai import VertexAIConfig, VertexAITextEmbeddingConfig
|
||||
from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
|
||||
from .llms.sagemaker import SagemakerConfig
|
||||
|
@ -792,7 +808,9 @@ from .llms.openai import (
|
|||
MistralConfig,
|
||||
MistralEmbeddingConfig,
|
||||
DeepInfraConfig,
|
||||
AzureAIStudioConfig,
|
||||
)
|
||||
from .llms.text_completion_codestral import MistralTextCompletionConfig
|
||||
from .llms.azure import (
|
||||
AzureOpenAIConfig,
|
||||
AzureOpenAIError,
|
||||
|
@ -826,4 +844,4 @@ from .router import Router
|
|||
from .assistants.main import *
|
||||
from .batches.main import *
|
||||
from .scheduler import *
|
||||
from .cost_calculator import response_cost_calculator
|
||||
from .cost_calculator import response_cost_calculator, cost_per_token
|
||||
|
|
|
@ -1,21 +1,40 @@
|
|||
import logging, os, json
|
||||
from logging import Formatter
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import traceback
|
||||
from datetime import datetime
|
||||
from logging import Formatter
|
||||
|
||||
set_verbose = False
|
||||
|
||||
if set_verbose is True:
|
||||
logging.warning(
|
||||
"`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."
|
||||
)
|
||||
json_logs = bool(os.getenv("JSON_LOGS", False))
|
||||
# Create a handler for the logger (you may need to adapt this based on your needs)
|
||||
log_level = os.getenv("LITELLM_LOG", "DEBUG")
|
||||
numeric_level: str = getattr(logging, log_level.upper())
|
||||
handler = logging.StreamHandler()
|
||||
handler.setLevel(logging.DEBUG)
|
||||
handler.setLevel(numeric_level)
|
||||
|
||||
|
||||
class JsonFormatter(Formatter):
|
||||
def __init__(self):
|
||||
super(JsonFormatter, self).__init__()
|
||||
|
||||
def formatTime(self, record, datefmt=None):
|
||||
# Use datetime to format the timestamp in ISO 8601 format
|
||||
dt = datetime.fromtimestamp(record.created)
|
||||
return dt.isoformat()
|
||||
|
||||
def format(self, record):
|
||||
json_record = {}
|
||||
json_record["message"] = record.getMessage()
|
||||
json_record = {
|
||||
"message": record.getMessage(),
|
||||
"level": record.levelname,
|
||||
"timestamp": self.formatTime(record),
|
||||
}
|
||||
|
||||
return json.dumps(json_record)
|
||||
|
||||
|
||||
|
|
|
@ -1192,7 +1192,7 @@ class S3Cache(BaseCache):
|
|||
return cached_response
|
||||
except botocore.exceptions.ClientError as e:
|
||||
if e.response["Error"]["Code"] == "NoSuchKey":
|
||||
verbose_logger.error(
|
||||
verbose_logger.debug(
|
||||
f"S3 Cache: The specified key '{key}' does not exist in the S3 bucket."
|
||||
)
|
||||
return None
|
||||
|
|
|
@ -1,21 +1,292 @@
|
|||
# What is this?
|
||||
## File for 'response_cost' calculation in Logging
|
||||
from typing import Optional, Union, Literal, List
|
||||
import time
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import litellm
|
||||
import litellm._logging
|
||||
from litellm import verbose_logger
|
||||
from litellm.litellm_core_utils.llm_cost_calc.google import (
|
||||
cost_per_character as google_cost_per_character,
|
||||
)
|
||||
from litellm.litellm_core_utils.llm_cost_calc.google import (
|
||||
cost_per_token as google_cost_per_token,
|
||||
)
|
||||
from litellm.utils import (
|
||||
ModelResponse,
|
||||
CallTypes,
|
||||
CostPerToken,
|
||||
EmbeddingResponse,
|
||||
ImageResponse,
|
||||
TranscriptionResponse,
|
||||
ModelResponse,
|
||||
TextCompletionResponse,
|
||||
CallTypes,
|
||||
cost_per_token,
|
||||
TranscriptionResponse,
|
||||
print_verbose,
|
||||
CostPerToken,
|
||||
token_counter,
|
||||
)
|
||||
import litellm
|
||||
from litellm import verbose_logger
|
||||
|
||||
|
||||
def _cost_per_token_custom_pricing_helper(
|
||||
prompt_tokens: float = 0,
|
||||
completion_tokens: float = 0,
|
||||
response_time_ms=None,
|
||||
### CUSTOM PRICING ###
|
||||
custom_cost_per_token: Optional[CostPerToken] = None,
|
||||
custom_cost_per_second: Optional[float] = None,
|
||||
) -> Optional[Tuple[float, float]]:
|
||||
"""Internal helper function for calculating cost, if custom pricing given"""
|
||||
if custom_cost_per_token is None and custom_cost_per_second is None:
|
||||
return None
|
||||
|
||||
if custom_cost_per_token is not None:
|
||||
input_cost = custom_cost_per_token["input_cost_per_token"] * prompt_tokens
|
||||
output_cost = custom_cost_per_token["output_cost_per_token"] * completion_tokens
|
||||
return input_cost, output_cost
|
||||
elif custom_cost_per_second is not None:
|
||||
output_cost = custom_cost_per_second * response_time_ms / 1000 # type: ignore
|
||||
return 0, output_cost
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def cost_per_token(
|
||||
model: str = "",
|
||||
prompt_tokens: float = 0,
|
||||
completion_tokens: float = 0,
|
||||
response_time_ms=None,
|
||||
custom_llm_provider: Optional[str] = None,
|
||||
region_name=None,
|
||||
### CHARACTER PRICING ###
|
||||
prompt_characters: float = 0,
|
||||
completion_characters: float = 0,
|
||||
### CUSTOM PRICING ###
|
||||
custom_cost_per_token: Optional[CostPerToken] = None,
|
||||
custom_cost_per_second: Optional[float] = None,
|
||||
) -> Tuple[float, float]:
|
||||
"""
|
||||
Calculates the cost per token for a given model, prompt tokens, and completion tokens.
|
||||
|
||||
Parameters:
|
||||
model (str): The name of the model to use. Default is ""
|
||||
prompt_tokens (int): The number of tokens in the prompt.
|
||||
completion_tokens (int): The number of tokens in the completion.
|
||||
response_time (float): The amount of time, in milliseconds, it took the call to complete.
|
||||
prompt_characters (float): The number of characters in the prompt. Used for vertex ai cost calculation.
|
||||
completion_characters (float): The number of characters in the completion response. Used for vertex ai cost calculation.
|
||||
custom_llm_provider (str): The llm provider to whom the call was made (see init.py for full list)
|
||||
custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
|
||||
custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
|
||||
|
||||
Returns:
|
||||
tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively.
|
||||
"""
|
||||
args = locals()
|
||||
if model is None:
|
||||
raise Exception("Invalid arg. Model cannot be none.")
|
||||
## CUSTOM PRICING ##
|
||||
response_cost = _cost_per_token_custom_pricing_helper(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
response_time_ms=response_time_ms,
|
||||
custom_cost_per_second=custom_cost_per_second,
|
||||
custom_cost_per_token=custom_cost_per_token,
|
||||
)
|
||||
if response_cost is not None:
|
||||
return response_cost[0], response_cost[1]
|
||||
|
||||
# given
|
||||
prompt_tokens_cost_usd_dollar: float = 0
|
||||
completion_tokens_cost_usd_dollar: float = 0
|
||||
model_cost_ref = litellm.model_cost
|
||||
model_with_provider = model
|
||||
if custom_llm_provider is not None:
|
||||
model_with_provider = custom_llm_provider + "/" + model
|
||||
if region_name is not None:
|
||||
model_with_provider_and_region = (
|
||||
f"{custom_llm_provider}/{region_name}/{model}"
|
||||
)
|
||||
if (
|
||||
model_with_provider_and_region in model_cost_ref
|
||||
): # use region based pricing, if it's available
|
||||
model_with_provider = model_with_provider_and_region
|
||||
else:
|
||||
_, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
|
||||
model_without_prefix = model
|
||||
model_parts = model.split("/")
|
||||
if len(model_parts) > 1:
|
||||
model_without_prefix = model_parts[1]
|
||||
else:
|
||||
model_without_prefix = model
|
||||
"""
|
||||
Code block that formats model to lookup in litellm.model_cost
|
||||
Option1. model = "bedrock/ap-northeast-1/anthropic.claude-instant-v1". This is the most accurate since it is region based. Should always be option 1
|
||||
Option2. model = "openai/gpt-4" - model = provider/model
|
||||
Option3. model = "anthropic.claude-3" - model = model
|
||||
"""
|
||||
if (
|
||||
model_with_provider in model_cost_ref
|
||||
): # Option 2. use model with provider, model = "openai/gpt-4"
|
||||
model = model_with_provider
|
||||
elif model in model_cost_ref: # Option 1. use model passed, model="gpt-4"
|
||||
model = model
|
||||
elif (
|
||||
model_without_prefix in model_cost_ref
|
||||
): # Option 3. if user passed model="bedrock/anthropic.claude-3", use model="anthropic.claude-3"
|
||||
model = model_without_prefix
|
||||
|
||||
# see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
|
||||
print_verbose(f"Looking up model={model} in model_cost_map")
|
||||
if custom_llm_provider == "vertex_ai":
|
||||
return google_cost_per_character(
|
||||
model=model_without_prefix,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
prompt_characters=prompt_characters,
|
||||
completion_characters=completion_characters,
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
)
|
||||
elif custom_llm_provider == "gemini":
|
||||
return google_cost_per_token(
|
||||
model=model_without_prefix,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
)
|
||||
elif model in model_cost_ref:
|
||||
print_verbose(f"Success: model={model} in model_cost_map")
|
||||
print_verbose(
|
||||
f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}"
|
||||
)
|
||||
if (
|
||||
model_cost_ref[model].get("input_cost_per_token", None) is not None
|
||||
and model_cost_ref[model].get("output_cost_per_token", None) is not None
|
||||
):
|
||||
## COST PER TOKEN ##
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
|
||||
)
|
||||
elif (
|
||||
model_cost_ref[model].get("output_cost_per_second", None) is not None
|
||||
and response_time_ms is not None
|
||||
):
|
||||
print_verbose(
|
||||
f"For model={model} - output_cost_per_second: {model_cost_ref[model].get('output_cost_per_second')}; response time: {response_time_ms}"
|
||||
)
|
||||
## COST PER SECOND ##
|
||||
prompt_tokens_cost_usd_dollar = 0
|
||||
completion_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["output_cost_per_second"]
|
||||
* response_time_ms
|
||||
/ 1000
|
||||
)
|
||||
elif (
|
||||
model_cost_ref[model].get("input_cost_per_second", None) is not None
|
||||
and response_time_ms is not None
|
||||
):
|
||||
print_verbose(
|
||||
f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}"
|
||||
)
|
||||
## COST PER SECOND ##
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = 0.0
|
||||
print_verbose(
|
||||
f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
|
||||
elif "ft:gpt-3.5-turbo" in model:
|
||||
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
|
||||
# fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = (
|
||||
model_cost_ref["ft:gpt-3.5-turbo"]["output_cost_per_token"]
|
||||
* completion_tokens
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
|
||||
elif "ft:gpt-4-0613" in model:
|
||||
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
|
||||
# fuzzy match ft:gpt-4-0613:abcd-id-cool-litellm
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref["ft:gpt-4-0613"]["input_cost_per_token"] * prompt_tokens
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = (
|
||||
model_cost_ref["ft:gpt-4-0613"]["output_cost_per_token"] * completion_tokens
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
|
||||
elif "ft:gpt-4o-2024-05-13" in model:
|
||||
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
|
||||
# fuzzy match ft:gpt-4o-2024-05-13:abcd-id-cool-litellm
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref["ft:gpt-4o-2024-05-13"]["input_cost_per_token"]
|
||||
* prompt_tokens
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = (
|
||||
model_cost_ref["ft:gpt-4o-2024-05-13"]["output_cost_per_token"]
|
||||
* completion_tokens
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
|
||||
|
||||
elif "ft:davinci-002" in model:
|
||||
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
|
||||
# fuzzy match ft:davinci-002:abcd-id-cool-litellm
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref["ft:davinci-002"]["input_cost_per_token"] * prompt_tokens
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = (
|
||||
model_cost_ref["ft:davinci-002"]["output_cost_per_token"]
|
||||
* completion_tokens
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
|
||||
elif "ft:babbage-002" in model:
|
||||
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
|
||||
# fuzzy match ft:babbage-002:abcd-id-cool-litellm
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref["ft:babbage-002"]["input_cost_per_token"] * prompt_tokens
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = (
|
||||
model_cost_ref["ft:babbage-002"]["output_cost_per_token"]
|
||||
* completion_tokens
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
|
||||
elif model in litellm.azure_llms:
|
||||
verbose_logger.debug(f"Cost Tracking: {model} is an Azure LLM")
|
||||
model = litellm.azure_llms[model]
|
||||
verbose_logger.debug(
|
||||
f"applying cost={model_cost_ref[model]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
|
||||
)
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
|
||||
)
|
||||
verbose_logger.debug(
|
||||
f"applying cost={model_cost_ref[model]['output_cost_per_token']} for completion_tokens={completion_tokens}"
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
|
||||
elif model in litellm.azure_embedding_models:
|
||||
verbose_logger.debug(f"Cost Tracking: {model} is an Azure Embedding Model")
|
||||
model = litellm.azure_embedding_models[model]
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
|
||||
else:
|
||||
# if model is not in model_prices_and_context_window.json. Raise an exception-let users know
|
||||
error_str = f"Model not in model_prices_and_context_window.json. You passed model={model}. Register pricing for model - https://docs.litellm.ai/docs/proxy/custom_pricing\n"
|
||||
raise litellm.exceptions.NotFoundError( # type: ignore
|
||||
message=error_str,
|
||||
model=model,
|
||||
llm_provider="",
|
||||
)
|
||||
|
||||
|
||||
# Extract the number of billion parameters from the model name
|
||||
|
@ -147,7 +418,9 @@ def completion_cost(
|
|||
model = "dall-e-2" # for dall-e-2, azure expects an empty model name
|
||||
# Handle Inputs to completion_cost
|
||||
prompt_tokens = 0
|
||||
prompt_characters = 0
|
||||
completion_tokens = 0
|
||||
completion_characters = 0
|
||||
custom_llm_provider = None
|
||||
if completion_response is not None:
|
||||
# get input/output tokens from completion_response
|
||||
|
@ -264,6 +537,30 @@ def completion_cost(
|
|||
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
|
||||
)
|
||||
|
||||
if (
|
||||
custom_llm_provider is not None
|
||||
and custom_llm_provider == "vertex_ai"
|
||||
and completion_response is not None
|
||||
and isinstance(completion_response, ModelResponse)
|
||||
):
|
||||
# Calculate the prompt characters + response characters
|
||||
if len("messages") > 0:
|
||||
prompt_string = litellm.utils.get_formatted_prompt(
|
||||
data={"messages": messages}, call_type="completion"
|
||||
)
|
||||
else:
|
||||
prompt_string = ""
|
||||
|
||||
prompt_characters = litellm.utils._count_characters(text=prompt_string)
|
||||
|
||||
completion_string = litellm.utils.get_response_string(
|
||||
response_obj=completion_response
|
||||
)
|
||||
|
||||
completion_characters = litellm.utils._count_characters(
|
||||
text=completion_string
|
||||
)
|
||||
|
||||
(
|
||||
prompt_tokens_cost_usd_dollar,
|
||||
completion_tokens_cost_usd_dollar,
|
||||
|
@ -276,6 +573,8 @@ def completion_cost(
|
|||
region_name=region_name,
|
||||
custom_cost_per_second=custom_cost_per_second,
|
||||
custom_cost_per_token=custom_cost_per_token,
|
||||
prompt_characters=prompt_characters,
|
||||
completion_characters=completion_characters,
|
||||
)
|
||||
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
|
||||
print_verbose(
|
||||
|
|
|
@ -26,7 +26,7 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore
|
|||
num_retries: Optional[int] = None,
|
||||
):
|
||||
self.status_code = 401
|
||||
self.message = message
|
||||
self.message = "litellm.AuthenticationError: {}".format(message)
|
||||
self.llm_provider = llm_provider
|
||||
self.model = model
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
|
@ -72,7 +72,7 @@ class NotFoundError(openai.NotFoundError): # type: ignore
|
|||
num_retries: Optional[int] = None,
|
||||
):
|
||||
self.status_code = 404
|
||||
self.message = message
|
||||
self.message = "litellm.NotFoundError: {}".format(message)
|
||||
self.model = model
|
||||
self.llm_provider = llm_provider
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
|
@ -117,7 +117,7 @@ class BadRequestError(openai.BadRequestError): # type: ignore
|
|||
num_retries: Optional[int] = None,
|
||||
):
|
||||
self.status_code = 400
|
||||
self.message = message
|
||||
self.message = "litellm.BadRequestError: {}".format(message)
|
||||
self.model = model
|
||||
self.llm_provider = llm_provider
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
|
@ -162,7 +162,7 @@ class UnprocessableEntityError(openai.UnprocessableEntityError): # type: ignore
|
|||
num_retries: Optional[int] = None,
|
||||
):
|
||||
self.status_code = 422
|
||||
self.message = message
|
||||
self.message = "litellm.UnprocessableEntityError: {}".format(message)
|
||||
self.model = model
|
||||
self.llm_provider = llm_provider
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
|
@ -204,7 +204,7 @@ class Timeout(openai.APITimeoutError): # type: ignore
|
|||
request=request
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
self.status_code = 408
|
||||
self.message = message
|
||||
self.message = "litellm.Timeout: {}".format(message)
|
||||
self.model = model
|
||||
self.llm_provider = llm_provider
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
|
@ -241,7 +241,7 @@ class PermissionDeniedError(openai.PermissionDeniedError): # type:ignore
|
|||
num_retries: Optional[int] = None,
|
||||
):
|
||||
self.status_code = 403
|
||||
self.message = message
|
||||
self.message = "litellm.PermissionDeniedError: {}".format(message)
|
||||
self.llm_provider = llm_provider
|
||||
self.model = model
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
|
@ -280,7 +280,7 @@ class RateLimitError(openai.RateLimitError): # type: ignore
|
|||
num_retries: Optional[int] = None,
|
||||
):
|
||||
self.status_code = 429
|
||||
self.message = message
|
||||
self.message = "litellm.RateLimitError: {}".format(message)
|
||||
self.llm_provider = llm_provider
|
||||
self.model = model
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
|
@ -324,19 +324,21 @@ class ContextWindowExceededError(BadRequestError): # type: ignore
|
|||
message,
|
||||
model,
|
||||
llm_provider,
|
||||
response: httpx.Response,
|
||||
response: Optional[httpx.Response] = None,
|
||||
litellm_debug_info: Optional[str] = None,
|
||||
):
|
||||
self.status_code = 400
|
||||
self.message = message
|
||||
self.message = "litellm.ContextWindowExceededError: {}".format(message)
|
||||
self.model = model
|
||||
self.llm_provider = llm_provider
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
|
||||
self.response = response or httpx.Response(status_code=400, request=request)
|
||||
super().__init__(
|
||||
message=self.message,
|
||||
model=self.model, # type: ignore
|
||||
llm_provider=self.llm_provider, # type: ignore
|
||||
response=response,
|
||||
response=self.response,
|
||||
litellm_debug_info=self.litellm_debug_info,
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
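# Usage sketch for the relaxed signature above: `response` is now optional, so callers
# can raise the error without building an httpx.Response themselves (a 400 response is
# synthesized internally). Model/provider values here are placeholders; this assumes the
# exception is re-exported at the package top level, as litellm does for its exceptions.
import litellm

def _example_raise_context_window_error() -> None:
    raise litellm.ContextWindowExceededError(
        message="prompt of 250k tokens exceeds the model's 200k context window",
        model="example-model",
        llm_provider="openai",
    )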
|
@ -368,7 +370,7 @@ class RejectedRequestError(BadRequestError): # type: ignore
|
|||
litellm_debug_info: Optional[str] = None,
|
||||
):
|
||||
self.status_code = 400
|
||||
self.message = message
|
||||
self.message = "litellm.RejectedRequestError: {}".format(message)
|
||||
self.model = model
|
||||
self.llm_provider = llm_provider
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
|
@ -407,19 +409,21 @@ class ContentPolicyViolationError(BadRequestError): # type: ignore
|
|||
message,
|
||||
model,
|
||||
llm_provider,
|
||||
response: httpx.Response,
|
||||
response: Optional[httpx.Response] = None,
|
||||
litellm_debug_info: Optional[str] = None,
|
||||
):
|
||||
self.status_code = 400
|
||||
self.message = message
|
||||
self.message = "litellm.ContentPolicyViolationError: {}".format(message)
|
||||
self.model = model
|
||||
self.llm_provider = llm_provider
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
|
||||
self.response = response or httpx.Response(status_code=500, request=request)
|
||||
super().__init__(
|
||||
message=self.message,
|
||||
model=self.model, # type: ignore
|
||||
llm_provider=self.llm_provider, # type: ignore
|
||||
response=response,
|
||||
response=self.response,
|
||||
litellm_debug_info=self.litellm_debug_info,
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
|
@ -452,7 +456,7 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
|
|||
num_retries: Optional[int] = None,
|
||||
):
|
||||
self.status_code = 503
|
||||
self.message = message
|
||||
self.message = "litellm.ServiceUnavailableError: {}".format(message)
|
||||
self.llm_provider = llm_provider
|
||||
self.model = model
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
|
@ -501,7 +505,7 @@ class InternalServerError(openai.InternalServerError): # type: ignore
|
|||
num_retries: Optional[int] = None,
|
||||
):
|
||||
self.status_code = 500
|
||||
self.message = message
|
||||
self.message = "litellm.InternalServerError: {}".format(message)
|
||||
self.llm_provider = llm_provider
|
||||
self.model = model
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
|
@ -552,7 +556,7 @@ class APIError(openai.APIError): # type: ignore
|
|||
num_retries: Optional[int] = None,
|
||||
):
|
||||
self.status_code = status_code
|
||||
self.message = message
|
||||
self.message = "litellm.APIError: {}".format(message)
|
||||
self.llm_provider = llm_provider
|
||||
self.model = model
|
||||
self.litellm_debug_info = litellm_debug_info
|
||||
|
@ -589,7 +593,7 @@ class APIConnectionError(openai.APIConnectionError): # type: ignore
|
|||
max_retries: Optional[int] = None,
|
||||
num_retries: Optional[int] = None,
|
||||
):
|
||||
self.message = message
|
||||
self.message = "litellm.APIConnectionError: {}".format(message)
|
||||
self.llm_provider = llm_provider
|
||||
self.model = model
|
||||
self.status_code = 500
|
||||
|
@ -626,7 +630,7 @@ class APIResponseValidationError(openai.APIResponseValidationError): # type: ig
|
|||
max_retries: Optional[int] = None,
|
||||
num_retries: Optional[int] = None,
|
||||
):
|
||||
self.message = message
|
||||
self.message = "litellm.APIResponseValidationError: {}".format(message)
|
||||
self.llm_provider = llm_provider
|
||||
self.model = model
|
||||
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
|
||||
|
|
|
@ -226,14 +226,6 @@ def _start_clickhouse():
|
|||
response = client.query("DESCRIBE default.spend_logs")
|
||||
verbose_logger.debug(f"spend logs schema ={response.result_rows}")
|
||||
|
||||
# RUN Enterprise Clickhouse Setup
|
||||
# TLDR: For Enterprise - we create views / aggregate tables for low latency reporting APIs
|
||||
from litellm.proxy.enterprise.utils import _create_clickhouse_aggregate_tables
|
||||
from litellm.proxy.enterprise.utils import _create_clickhouse_material_views
|
||||
|
||||
_create_clickhouse_aggregate_tables(client=client, table_names=table_names)
|
||||
_create_clickhouse_material_views(client=client, table_names=table_names)
|
||||
|
||||
|
||||
class ClickhouseLogger:
|
||||
# Class variables or attributes
|
||||
|
|
|
@ -10,7 +10,7 @@ import traceback
|
|||
|
||||
class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callback#callback-class
|
||||
# Class variables or attributes
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
pass
|
||||
|
||||
def log_pre_api_call(self, model, messages, kwargs):
|
||||
|
|
136 litellm/integrations/email_alerting.py Normal file
|
@ -0,0 +1,136 @@
|
|||
"""
|
||||
Functions for sending Email Alerts
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Optional, List
|
||||
from litellm.proxy._types import WebhookEvent
|
||||
import asyncio
|
||||
from litellm._logging import verbose_logger, verbose_proxy_logger
|
||||
|
||||
# we use this for the email header; if you change it, send a test email and verify it looks good in the email client
|
||||
LITELLM_LOGO_URL = "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
|
||||
LITELLM_SUPPORT_CONTACT = "support@berri.ai"
|
||||
|
||||
|
||||
async def get_all_team_member_emails(team_id: Optional[str] = None) -> list:
|
||||
verbose_logger.debug(
|
||||
"Email Alerting: Getting all team members for team_id=%s", team_id
|
||||
)
|
||||
if team_id is None:
|
||||
return []
|
||||
from litellm.proxy.proxy_server import premium_user, prisma_client
|
||||
|
||||
if prisma_client is None:
|
||||
raise Exception("Not connected to DB!")
|
||||
|
||||
team_row = await prisma_client.db.litellm_teamtable.find_unique(
|
||||
where={
|
||||
"team_id": team_id,
|
||||
}
|
||||
)
|
||||
|
||||
if team_row is None:
|
||||
return []
|
||||
|
||||
_team_members = team_row.members_with_roles
|
||||
verbose_logger.debug(
|
||||
"Email Alerting: Got team members for team_id=%s Team Members: %s",
|
||||
team_id,
|
||||
_team_members,
|
||||
)
|
||||
_team_member_user_ids: List[str] = []
|
||||
for member in _team_members:
|
||||
if member and isinstance(member, dict) and member.get("user_id") is not None:
|
||||
_team_member_user_ids.append(member.get("user_id"))
|
||||
|
||||
sql_query = """
|
||||
SELECT user_email
|
||||
FROM "LiteLLM_UserTable"
|
||||
WHERE user_id = ANY($1::TEXT[]);
|
||||
"""
|
||||
|
||||
_result = await prisma_client.db.query_raw(sql_query, _team_member_user_ids)
|
||||
|
||||
verbose_logger.debug("Email Alerting: Got all Emails for team, emails=%s", _result)
|
||||
|
||||
if _result is None:
|
||||
return []
|
||||
|
||||
emails = []
|
||||
for user in _result:
|
||||
if user and isinstance(user, dict) and user.get("user_email", None) is not None:
|
||||
emails.append(user.get("user_email"))
|
||||
return emails
|
||||
|
||||
|
||||
async def send_team_budget_alert(webhook_event: WebhookEvent) -> bool:
|
||||
"""
|
||||
Send an Email Alert to All Team Members when the Team Budget is crossed
|
||||
Returns -> True if sent, False if not.
|
||||
"""
|
||||
from litellm.proxy.utils import send_email
|
||||
|
||||
from litellm.proxy.proxy_server import premium_user, prisma_client
|
||||
|
||||
_team_id = webhook_event.team_id
|
||||
team_alias = webhook_event.team_alias
|
||||
verbose_logger.debug(
|
||||
"Email Alerting: Sending Team Budget Alert for team=%s", team_alias
|
||||
)
|
||||
|
||||
email_logo_url = os.getenv("SMTP_SENDER_LOGO", os.getenv("EMAIL_LOGO_URL", None))
|
||||
email_support_contact = os.getenv("EMAIL_SUPPORT_CONTACT", None)
|
||||
|
||||
# await self._check_if_using_premium_email_feature(
|
||||
# premium_user, email_logo_url, email_support_contact
|
||||
# )
|
||||
|
||||
if email_logo_url is None:
|
||||
email_logo_url = LITELLM_LOGO_URL
|
||||
if email_support_contact is None:
|
||||
email_support_contact = LITELLM_SUPPORT_CONTACT
|
||||
recipient_emails = await get_all_team_member_emails(_team_id)
|
||||
recipient_emails_str: str = ",".join(recipient_emails)
|
||||
verbose_logger.debug(
|
||||
"Email Alerting: Sending team budget alert to %s", recipient_emails_str
|
||||
)
|
||||
|
||||
event_name = webhook_event.event_message
|
||||
max_budget = webhook_event.max_budget
|
||||
email_html_content = "Alert from LiteLLM Server"
|
||||
|
||||
if recipient_emails_str is None:
|
||||
verbose_proxy_logger.error(
|
||||
"Email Alerting: Trying to send email alert to no recipient, got recipient_emails=%s",
|
||||
recipient_emails_str,
|
||||
)
|
||||
|
||||
email_html_content = f"""
|
||||
<img src="{email_logo_url}" alt="LiteLLM Logo" width="150" height="50" /> <br/><br/><br/>
|
||||
|
||||
Budget Crossed for Team <b> {team_alias} </b> <br/> <br/>
|
||||
|
||||
Your Team's LLM API usage has crossed its <b> budget of ${max_budget} </b>; current spend is <b>${webhook_event.spend}</b><br /> <br />
|
||||
|
||||
API requests will be rejected until either (a) you increase your budget or (b) your budget gets reset <br /> <br />
|
||||
|
||||
If you have any questions, please send an email to {email_support_contact} <br /> <br />
|
||||
|
||||
Best, <br />
|
||||
The LiteLLM team <br />
|
||||
"""
|
||||
|
||||
email_event = {
|
||||
"to": recipient_emails_str,
|
||||
"subject": f"LiteLLM {event_name} for Team {team_alias}",
|
||||
"html": email_html_content,
|
||||
}
|
||||
|
||||
await send_email(
|
||||
receiver_email=email_event["to"],
|
||||
subject=email_event["subject"],
|
||||
html=email_event["html"],
|
||||
)
|
||||
|
||||
return True
|
|
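# Minimal sketch of the member -> user_id filtering performed in
# get_all_team_member_emails above, pulled out as a pure function so it can be
# exercised without a Prisma client. Assumes members are dicts shaped like
# {"user_id": ..., "role": ...}; the helper name is illustrative, not part of litellm.
from typing import List, Optional

def _example_extract_member_user_ids(members: Optional[list]) -> List[str]:
    user_ids: List[str] = []
    for member in members or []:
        if isinstance(member, dict) and member.get("user_id") is not None:
            user_ids.append(member["user_id"])
    return user_ids

# _example_extract_member_user_ids([{"user_id": "u1"}, {"role": "admin"}, None]) -> ["u1"]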
@ -1,13 +1,19 @@
|
|||
# What is this?
|
||||
## On Success events log cost to Lago - https://github.com/BerriAI/litellm/issues/3639
|
||||
|
||||
import dotenv, os, json
|
||||
import json
|
||||
import os
|
||||
import traceback
|
||||
import uuid
|
||||
from typing import Literal, Optional
|
||||
|
||||
import dotenv
|
||||
import httpx
|
||||
|
||||
import litellm
|
||||
import traceback, httpx
|
||||
from litellm import verbose_logger
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
|
||||
import uuid
|
||||
from typing import Optional, Literal
|
||||
|
||||
|
||||
def get_utc_datetime():
|
||||
|
@ -143,6 +149,7 @@ class LagoLogger(CustomLogger):
|
|||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
verbose_logger.debug("ENTERS LAGO CALLBACK")
|
||||
_url = os.getenv("LAGO_API_BASE")
|
||||
assert _url is not None and isinstance(
|
||||
_url, str
|
||||
|
|
|
@ -1,21 +1,27 @@
|
|||
#### What this does ####
|
||||
# On success, logs events to Langfuse
|
||||
import os
|
||||
import copy
|
||||
import os
|
||||
import traceback
|
||||
|
||||
from packaging.version import Version
|
||||
from litellm._logging import verbose_logger
|
||||
|
||||
import litellm
|
||||
from litellm._logging import verbose_logger
|
||||
|
||||
|
||||
class LangFuseLogger:
|
||||
# Class variables or attributes
|
||||
def __init__(
|
||||
self, langfuse_public_key=None, langfuse_secret=None, flush_interval=1
|
||||
self,
|
||||
langfuse_public_key=None,
|
||||
langfuse_secret=None,
|
||||
langfuse_host=None,
|
||||
flush_interval=1,
|
||||
):
|
||||
try:
|
||||
from langfuse import Langfuse
|
||||
import langfuse
|
||||
from langfuse import Langfuse
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\n{traceback.format_exc()}\033[0m"
|
||||
|
@ -23,7 +29,9 @@ class LangFuseLogger:
|
|||
# Instance variables
|
||||
self.secret_key = langfuse_secret or os.getenv("LANGFUSE_SECRET_KEY")
|
||||
self.public_key = langfuse_public_key or os.getenv("LANGFUSE_PUBLIC_KEY")
|
||||
self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
|
||||
self.langfuse_host = langfuse_host or os.getenv(
|
||||
"LANGFUSE_HOST", "https://cloud.langfuse.com"
|
||||
)
|
||||
self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
|
||||
self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
|
||||
|
||||
|
@ -167,7 +175,7 @@ class LangFuseLogger:
|
|||
or isinstance(response_obj, litellm.EmbeddingResponse)
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj["data"]
|
||||
output = None
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.ModelResponse
|
||||
):
|
||||
|
@ -251,7 +259,7 @@ class LangFuseLogger:
|
|||
input,
|
||||
response_obj,
|
||||
):
|
||||
from langfuse.model import CreateTrace, CreateGeneration
|
||||
from langfuse.model import CreateGeneration, CreateTrace
|
||||
|
||||
verbose_logger.warning(
|
||||
"Please upgrade langfuse to v2.0.0 or higher: https://github.com/langfuse/langfuse-python/releases/tag/v2.0.1"
|
||||
|
@ -528,31 +536,14 @@ class LangFuseLogger:
|
|||
"version": clean_metadata.pop("version", None),
|
||||
}
|
||||
|
||||
parent_observation_id = metadata.get("parent_observation_id", None)
|
||||
if parent_observation_id is not None:
|
||||
generation_params["parent_observation_id"] = parent_observation_id
|
||||
|
||||
if supports_prompt:
|
||||
user_prompt = clean_metadata.pop("prompt", None)
|
||||
if user_prompt is None:
|
||||
pass
|
||||
elif isinstance(user_prompt, dict):
|
||||
from langfuse.model import (
|
||||
TextPromptClient,
|
||||
ChatPromptClient,
|
||||
Prompt_Text,
|
||||
Prompt_Chat,
|
||||
)
|
||||
|
||||
if user_prompt.get("type", "") == "chat":
|
||||
_prompt_chat = Prompt_Chat(**user_prompt)
|
||||
generation_params["prompt"] = ChatPromptClient(
|
||||
prompt=_prompt_chat
|
||||
)
|
||||
elif user_prompt.get("type", "") == "text":
|
||||
_prompt_text = Prompt_Text(**user_prompt)
|
||||
generation_params["prompt"] = TextPromptClient(
|
||||
prompt=_prompt_text
|
||||
)
|
||||
else:
|
||||
generation_params["prompt"] = user_prompt
|
||||
|
||||
generation_params = _add_prompt_to_generation_params(
|
||||
generation_params=generation_params, clean_metadata=clean_metadata
|
||||
)
|
||||
if output is not None and isinstance(output, str) and level == "ERROR":
|
||||
generation_params["status_message"] = output
|
||||
|
||||
|
@ -565,5 +556,58 @@ class LangFuseLogger:
|
|||
|
||||
return generation_client.trace_id, generation_id
|
||||
except Exception as e:
|
||||
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||
verbose_logger.error(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||
return None, None
|
||||
|
||||
|
||||
def _add_prompt_to_generation_params(
|
||||
generation_params: dict, clean_metadata: dict
|
||||
) -> dict:
|
||||
from langfuse.model import (
|
||||
ChatPromptClient,
|
||||
Prompt_Chat,
|
||||
Prompt_Text,
|
||||
TextPromptClient,
|
||||
)
|
||||
|
||||
user_prompt = clean_metadata.pop("prompt", None)
|
||||
if user_prompt is None:
|
||||
pass
|
||||
elif isinstance(user_prompt, dict):
|
||||
if user_prompt.get("type", "") == "chat":
|
||||
_prompt_chat = Prompt_Chat(**user_prompt)
|
||||
generation_params["prompt"] = ChatPromptClient(prompt=_prompt_chat)
|
||||
elif user_prompt.get("type", "") == "text":
|
||||
_prompt_text = Prompt_Text(**user_prompt)
|
||||
generation_params["prompt"] = TextPromptClient(prompt=_prompt_text)
|
||||
elif "version" in user_prompt and "prompt" in user_prompt:
|
||||
# prompts
|
||||
if isinstance(user_prompt["prompt"], str):
|
||||
_prompt_obj = Prompt_Text(
|
||||
name=user_prompt["name"],
|
||||
prompt=user_prompt["prompt"],
|
||||
version=user_prompt["version"],
|
||||
config=user_prompt.get("config", None),
|
||||
)
|
||||
generation_params["prompt"] = TextPromptClient(prompt=_prompt_obj)
|
||||
|
||||
elif isinstance(user_prompt["prompt"], list):
|
||||
_prompt_obj = Prompt_Chat(
|
||||
name=user_prompt["name"],
|
||||
prompt=user_prompt["prompt"],
|
||||
version=user_prompt["version"],
|
||||
config=user_prompt.get("config", None),
|
||||
)
|
||||
generation_params["prompt"] = ChatPromptClient(prompt=_prompt_obj)
|
||||
else:
|
||||
verbose_logger.error(
|
||||
"[Non-blocking] Langfuse Logger: Invalid prompt format"
|
||||
)
|
||||
else:
|
||||
verbose_logger.error(
|
||||
"[Non-blocking] Langfuse Logger: Invalid prompt format. No prompt logged to Langfuse"
|
||||
)
|
||||
else:
|
||||
generation_params["prompt"] = user_prompt
|
||||
|
||||
return generation_params
|
||||
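# Illustrative shapes for metadata["prompt"] that _add_prompt_to_generation_params
# accepts. Field names follow the branches above; the exact required fields of
# Prompt_Chat / Prompt_Text depend on the installed langfuse version, so treat these
# dicts as an approximation rather than validated payloads.
example_chat_prompt_metadata = {
    "type": "chat",
    "name": "support-bot",
    "prompt": [{"role": "system", "content": "You are a helpful assistant."}],
    "version": 3,
}
example_text_prompt_metadata = {
    "type": "text",
    "name": "summarizer",
    "prompt": "Summarize: {{document}}",
    "version": 1,
}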
|
|
|
@ -105,7 +105,6 @@ class LunaryLogger:
|
|||
end_time=datetime.now(timezone.utc),
|
||||
error=None,
|
||||
):
|
||||
# Method definition
|
||||
try:
|
||||
print_verbose(f"Lunary Logging - Logging request for model {model}")
|
||||
|
||||
|
@ -114,10 +113,9 @@ class LunaryLogger:
|
|||
metadata = litellm_params.get("metadata", {}) or {}
|
||||
|
||||
if optional_params:
|
||||
# merge into extra
|
||||
extra = {**extra, **optional_params}
|
||||
|
||||
tags = litellm_params.pop("tags", None) or []
|
||||
tags = metadata.get("tags", None)
|
||||
|
||||
if extra:
|
||||
extra.pop("extra_body", None)
|
||||
|
|
|
@ -1,20 +1,21 @@
|
|||
import os
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
import litellm
|
||||
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.types.services import ServiceLoggerPayload
|
||||
from functools import wraps
|
||||
from typing import Union, Optional, TYPE_CHECKING, Any
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
import litellm
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
from litellm.types.services import ServiceLoggerPayload
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from opentelemetry.trace import Span as _Span
|
||||
from litellm.proxy.proxy_server import UserAPIKeyAuth as _UserAPIKeyAuth
|
||||
|
||||
from litellm.proxy._types import (
|
||||
ManagementEndpointLoggingPayload as _ManagementEndpointLoggingPayload,
|
||||
)
|
||||
from litellm.proxy.proxy_server import UserAPIKeyAuth as _UserAPIKeyAuth
|
||||
|
||||
Span = _Span
|
||||
UserAPIKeyAuth = _UserAPIKeyAuth
|
||||
|
@ -107,8 +108,9 @@ class OpenTelemetry(CustomLogger):
|
|||
start_time: Optional[datetime] = None,
|
||||
end_time: Optional[datetime] = None,
|
||||
):
|
||||
from opentelemetry import trace
|
||||
from datetime import datetime
|
||||
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.trace import Status, StatusCode
|
||||
|
||||
_start_time_ns = start_time
|
||||
|
@ -145,8 +147,9 @@ class OpenTelemetry(CustomLogger):
|
|||
start_time: Optional[datetime] = None,
|
||||
end_time: Optional[datetime] = None,
|
||||
):
|
||||
from opentelemetry import trace
|
||||
from datetime import datetime
|
||||
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.trace import Status, StatusCode
|
||||
|
||||
_start_time_ns = start_time
|
||||
|
@ -179,8 +182,8 @@ class OpenTelemetry(CustomLogger):
|
|||
async def async_post_call_failure_hook(
|
||||
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
|
||||
):
|
||||
from opentelemetry.trace import Status, StatusCode
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.trace import Status, StatusCode
|
||||
|
||||
parent_otel_span = user_api_key_dict.parent_otel_span
|
||||
if parent_otel_span is not None:
|
||||
|
@ -202,8 +205,8 @@ class OpenTelemetry(CustomLogger):
|
|||
parent_otel_span.end(end_time=self._to_ns(datetime.now()))
|
||||
|
||||
def _handle_sucess(self, kwargs, response_obj, start_time, end_time):
|
||||
from opentelemetry.trace import Status, StatusCode
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.trace import Status, StatusCode
|
||||
|
||||
verbose_logger.debug(
|
||||
"OpenTelemetry Logger: Logging kwargs: %s, OTEL config settings=%s",
|
||||
|
@ -253,9 +256,10 @@ class OpenTelemetry(CustomLogger):
|
|||
span.end(end_time=self._to_ns(end_time))
|
||||
|
||||
def set_tools_attributes(self, span: Span, tools):
|
||||
from litellm.proxy._types import SpanAttributes
|
||||
import json
|
||||
|
||||
from litellm.proxy._types import SpanAttributes
|
||||
|
||||
if not tools:
|
||||
return
|
||||
|
||||
|
@ -320,7 +324,7 @@ class OpenTelemetry(CustomLogger):
|
|||
)
|
||||
|
||||
span.set_attribute(
|
||||
SpanAttributes.LLM_IS_STREAMING, optional_params.get("stream", False)
|
||||
SpanAttributes.LLM_IS_STREAMING, str(optional_params.get("stream", False))
|
||||
)
|
||||
|
||||
if optional_params.get("tools"):
|
||||
|
@ -439,7 +443,7 @@ class OpenTelemetry(CustomLogger):
|
|||
#############################################
|
||||
########## LLM Response Attributes ##########
|
||||
#############################################
|
||||
if _raw_response:
|
||||
if _raw_response and isinstance(_raw_response, str):
|
||||
# cast str -> dict
|
||||
import json
|
||||
|
||||
|
@ -478,10 +482,10 @@ class OpenTelemetry(CustomLogger):
|
|||
return _parent_context
|
||||
|
||||
def _get_span_context(self, kwargs):
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.trace.propagation.tracecontext import (
|
||||
TraceContextTextMapPropagator,
|
||||
)
|
||||
from opentelemetry import trace
|
||||
|
||||
litellm_params = kwargs.get("litellm_params", {}) or {}
|
||||
proxy_server_request = litellm_params.get("proxy_server_request", {}) or {}
|
||||
|
@ -505,17 +509,17 @@ class OpenTelemetry(CustomLogger):
|
|||
return TraceContextTextMapPropagator().extract(carrier=carrier), None
|
||||
|
||||
def _get_span_processor(self):
|
||||
from opentelemetry.sdk.trace.export import (
|
||||
SpanExporter,
|
||||
SimpleSpanProcessor,
|
||||
BatchSpanProcessor,
|
||||
ConsoleSpanExporter,
|
||||
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
|
||||
OTLPSpanExporter as OTLPSpanExporterGRPC,
|
||||
)
|
||||
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
|
||||
OTLPSpanExporter as OTLPSpanExporterHTTP,
|
||||
)
|
||||
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
|
||||
OTLPSpanExporter as OTLPSpanExporterGRPC,
|
||||
from opentelemetry.sdk.trace.export import (
|
||||
BatchSpanProcessor,
|
||||
ConsoleSpanExporter,
|
||||
SimpleSpanProcessor,
|
||||
SpanExporter,
|
||||
)
|
||||
|
||||
verbose_logger.debug(
|
||||
|
@ -574,8 +578,9 @@ class OpenTelemetry(CustomLogger):
|
|||
logging_payload: ManagementEndpointLoggingPayload,
|
||||
parent_otel_span: Optional[Span] = None,
|
||||
):
|
||||
from opentelemetry import trace
|
||||
from datetime import datetime
|
||||
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.trace import Status, StatusCode
|
||||
|
||||
_start_time_ns = logging_payload.start_time
|
||||
|
@ -619,8 +624,9 @@ class OpenTelemetry(CustomLogger):
|
|||
logging_payload: ManagementEndpointLoggingPayload,
|
||||
parent_otel_span: Optional[Span] = None,
|
||||
):
|
||||
from opentelemetry import trace
|
||||
from datetime import datetime
|
||||
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.trace import Status, StatusCode
|
||||
|
||||
_start_time_ns = logging_payload.start_time
|
||||
|
|
|
@ -8,6 +8,7 @@ import traceback
|
|||
import datetime, subprocess, sys
|
||||
import litellm, uuid
|
||||
from litellm._logging import print_verbose, verbose_logger
|
||||
from typing import Optional, Union
|
||||
|
||||
|
||||
class PrometheusLogger:
|
||||
|
@ -17,33 +18,76 @@ class PrometheusLogger:
|
|||
**kwargs,
|
||||
):
|
||||
try:
|
||||
from prometheus_client import Counter
|
||||
from prometheus_client import Counter, Gauge
|
||||
|
||||
self.litellm_llm_api_failed_requests_metric = Counter(
|
||||
name="litellm_llm_api_failed_requests_metric",
|
||||
documentation="Total number of failed LLM API calls via litellm",
|
||||
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
|
||||
labelnames=[
|
||||
"end_user",
|
||||
"hashed_api_key",
|
||||
"model",
|
||||
"team",
|
||||
"team_alias",
|
||||
"user",
|
||||
],
|
||||
)
|
||||
|
||||
self.litellm_requests_metric = Counter(
|
||||
name="litellm_requests_metric",
|
||||
documentation="Total number of LLM calls to litellm",
|
||||
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
|
||||
labelnames=[
|
||||
"end_user",
|
||||
"hashed_api_key",
|
||||
"model",
|
||||
"team",
|
||||
"team_alias",
|
||||
"user",
|
||||
],
|
||||
)
|
||||
|
||||
# Counter for spend
|
||||
self.litellm_spend_metric = Counter(
|
||||
"litellm_spend_metric",
|
||||
"Total spend on LLM requests",
|
||||
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
|
||||
labelnames=[
|
||||
"end_user",
|
||||
"hashed_api_key",
|
||||
"model",
|
||||
"team",
|
||||
"team_alias",
|
||||
"user",
|
||||
],
|
||||
)
|
||||
|
||||
# Counter for total_output_tokens
|
||||
self.litellm_tokens_metric = Counter(
|
||||
"litellm_total_tokens",
|
||||
"Total number of input + output tokens from LLM requests",
|
||||
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
|
||||
labelnames=[
|
||||
"end_user",
|
||||
"hashed_api_key",
|
||||
"model",
|
||||
"team",
|
||||
"team_alias",
|
||||
"user",
|
||||
],
|
||||
)
|
||||
|
||||
# Remaining Budget for Team
|
||||
self.litellm_remaining_team_budget_metric = Gauge(
|
||||
"litellm_remaining_team_budget_metric",
|
||||
"Remaining budget for team",
|
||||
labelnames=["team_id", "team_alias"],
|
||||
)
|
||||
|
||||
# Remaining Budget for API Key
|
||||
self.litellm_remaining_api_key_budget_metric = Gauge(
|
||||
"litellm_remaining_api_key_budget_metric",
|
||||
"Remaining budget for api key",
|
||||
labelnames=["hashed_api_key", "api_key_alias"],
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
print_verbose(f"Got exception on init prometheus client {str(e)}")
|
||||
raise e
|
||||
|
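# Illustration (not litellm code): how the new budget gauges above surface in a
# Prometheus scrape. Label values are placeholders.
EXAMPLE_SCRAPE_LINES = [
    'litellm_remaining_team_budget_metric{team_id="t-123",team_alias="research"} 42.5',
    'litellm_remaining_api_key_budget_metric{hashed_api_key="ab12cd",api_key_alias="ci-key"} 7.0',
]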
@ -51,7 +95,9 @@ class PrometheusLogger:
|
|||
async def _async_log_event(
|
||||
self, kwargs, response_obj, start_time, end_time, print_verbose, user_id
|
||||
):
|
||||
self.log_event(kwargs, response_obj, start_time, end_time, print_verbose)
|
||||
self.log_event(
|
||||
kwargs, response_obj, start_time, end_time, user_id, print_verbose
|
||||
)
|
||||
|
||||
def log_event(
|
||||
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
|
||||
|
@ -72,9 +118,36 @@ class PrometheusLogger:
|
|||
"user_api_key_user_id", None
|
||||
)
|
||||
user_api_key = litellm_params.get("metadata", {}).get("user_api_key", None)
|
||||
user_api_key_alias = litellm_params.get("metadata", {}).get(
|
||||
"user_api_key_alias", None
|
||||
)
|
||||
user_api_team = litellm_params.get("metadata", {}).get(
|
||||
"user_api_key_team_id", None
|
||||
)
|
||||
user_api_team_alias = litellm_params.get("metadata", {}).get(
|
||||
"user_api_key_team_alias", None
|
||||
)
|
||||
|
||||
_team_spend = litellm_params.get("metadata", {}).get(
|
||||
"user_api_key_team_spend", None
|
||||
)
|
||||
_team_max_budget = litellm_params.get("metadata", {}).get(
|
||||
"user_api_key_team_max_budget", None
|
||||
)
|
||||
_remaining_team_budget = safe_get_remaining_budget(
|
||||
max_budget=_team_max_budget, spend=_team_spend
|
||||
)
|
||||
|
||||
_api_key_spend = litellm_params.get("metadata", {}).get(
|
||||
"user_api_key_spend", None
|
||||
)
|
||||
_api_key_max_budget = litellm_params.get("metadata", {}).get(
|
||||
"user_api_key_max_budget", None
|
||||
)
|
||||
_remaining_api_key_budget = safe_get_remaining_budget(
|
||||
max_budget=_api_key_max_budget, spend=_api_key_spend
|
||||
)
|
||||
|
||||
if response_obj is not None:
|
||||
tokens_used = response_obj.get("usage", {}).get("total_tokens", 0)
|
||||
else:
|
||||
|
@ -94,19 +167,47 @@ class PrometheusLogger:
|
|||
user_api_key = hash_token(user_api_key)
|
||||
|
||||
self.litellm_requests_metric.labels(
|
||||
end_user_id, user_api_key, model, user_api_team, user_id
|
||||
end_user_id,
|
||||
user_api_key,
|
||||
model,
|
||||
user_api_team,
|
||||
user_api_team_alias,
|
||||
user_id,
|
||||
).inc()
|
||||
self.litellm_spend_metric.labels(
|
||||
end_user_id, user_api_key, model, user_api_team, user_id
|
||||
end_user_id,
|
||||
user_api_key,
|
||||
model,
|
||||
user_api_team,
|
||||
user_api_team_alias,
|
||||
user_id,
|
||||
).inc(response_cost)
|
||||
self.litellm_tokens_metric.labels(
|
||||
end_user_id, user_api_key, model, user_api_team, user_id
|
||||
end_user_id,
|
||||
user_api_key,
|
||||
model,
|
||||
user_api_team,
|
||||
user_api_team_alias,
|
||||
user_id,
|
||||
).inc(tokens_used)
|
||||
|
||||
self.litellm_remaining_team_budget_metric.labels(
|
||||
user_api_team, user_api_team_alias
|
||||
).set(_remaining_team_budget)
|
||||
|
||||
self.litellm_remaining_api_key_budget_metric.labels(
|
||||
user_api_key, user_api_key_alias
|
||||
).set(_remaining_api_key_budget)
|
||||
|
||||
### FAILURE INCREMENT ###
|
||||
if "exception" in kwargs:
|
||||
self.litellm_llm_api_failed_requests_metric.labels(
|
||||
end_user_id, user_api_key, model, user_api_team, user_id
|
||||
end_user_id,
|
||||
user_api_key,
|
||||
model,
|
||||
user_api_team,
|
||||
user_api_team_alias,
|
||||
user_id,
|
||||
).inc()
|
||||
except Exception as e:
|
||||
verbose_logger.error(
|
||||
|
@ -114,3 +215,15 @@ class PrometheusLogger:
|
|||
)
|
||||
verbose_logger.debug(traceback.format_exc())
|
||||
pass
|
||||
|
||||
|
||||
def safe_get_remaining_budget(
|
||||
max_budget: Optional[float], spend: Optional[float]
|
||||
) -> float:
|
||||
if max_budget is None:
|
||||
return float("inf")
|
||||
|
||||
if spend is None:
|
||||
return max_budget
|
||||
|
||||
return max_budget - spend
|
||||
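# Quick sanity checks for the helper above (pure function, so these run as-is once
# safe_get_remaining_budget is defined/imported):
assert safe_get_remaining_budget(max_budget=None, spend=12.5) == float("inf")
assert safe_get_remaining_budget(max_budget=100.0, spend=None) == 100.0
assert safe_get_remaining_budget(max_budget=100.0, spend=40.0) == 60.0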
|
|
|
@ -330,6 +330,7 @@ class SlackAlerting(CustomLogger):
|
|||
messages = "Message not logged. litellm.redact_messages_in_exceptions=True"
|
||||
request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
|
||||
slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
|
||||
alerting_metadata: dict = {}
|
||||
if time_difference_float > self.alerting_threshold:
|
||||
# add deployment latencies to alert
|
||||
if (
|
||||
|
@ -337,7 +338,7 @@ class SlackAlerting(CustomLogger):
|
|||
and "litellm_params" in kwargs
|
||||
and "metadata" in kwargs["litellm_params"]
|
||||
):
|
||||
_metadata = kwargs["litellm_params"]["metadata"]
|
||||
_metadata: dict = kwargs["litellm_params"]["metadata"]
|
||||
request_info = litellm.utils._add_key_name_and_team_to_alert(
|
||||
request_info=request_info, metadata=_metadata
|
||||
)
|
||||
|
@ -349,10 +350,14 @@ class SlackAlerting(CustomLogger):
|
|||
request_info += (
|
||||
f"\nAvailable Deployment Latencies\n{_deployment_latency_map}"
|
||||
)
|
||||
|
||||
if "alerting_metadata" in _metadata:
|
||||
alerting_metadata = _metadata["alerting_metadata"]
|
||||
await self.send_alert(
|
||||
message=slow_message + request_info,
|
||||
level="Low",
|
||||
alert_type="llm_too_slow",
|
||||
alerting_metadata=alerting_metadata,
|
||||
)
|
||||
|
||||
async def async_update_daily_reports(
|
||||
|
@ -540,7 +545,12 @@ class SlackAlerting(CustomLogger):
|
|||
message += f"\n\nNext Run is at: `{time.time() + self.alerting_args.daily_report_frequency}`s"
|
||||
|
||||
# send alert
|
||||
await self.send_alert(message=message, level="Low", alert_type="daily_reports")
|
||||
await self.send_alert(
|
||||
message=message,
|
||||
level="Low",
|
||||
alert_type="daily_reports",
|
||||
alerting_metadata={},
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
|
@ -582,6 +592,7 @@ class SlackAlerting(CustomLogger):
|
|||
await asyncio.sleep(
|
||||
self.alerting_threshold
|
||||
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
|
||||
alerting_metadata: dict = {}
|
||||
if (
|
||||
request_data is not None
|
||||
and request_data.get("litellm_status", "") != "success"
|
||||
|
@ -606,7 +617,7 @@ class SlackAlerting(CustomLogger):
|
|||
):
|
||||
# In hanging requests, sometimes the deployment has not yet been passed into `request_data`
|
||||
# in that case we fallback to the api base set in the request metadata
|
||||
_metadata = request_data["metadata"]
|
||||
_metadata: dict = request_data["metadata"]
|
||||
_api_base = _metadata.get("api_base", "")
|
||||
|
||||
request_info = litellm.utils._add_key_name_and_team_to_alert(
|
||||
|
@ -615,6 +626,9 @@ class SlackAlerting(CustomLogger):
|
|||
|
||||
if _api_base is None:
|
||||
_api_base = ""
|
||||
|
||||
if "alerting_metadata" in _metadata:
|
||||
alerting_metadata = _metadata["alerting_metadata"]
|
||||
request_info += f"\nAPI Base: `{_api_base}`"
|
||||
# only alert hanging responses if they have not been marked as success
|
||||
alerting_message = (
|
||||
|
@ -640,6 +654,7 @@ class SlackAlerting(CustomLogger):
|
|||
message=alerting_message + request_info,
|
||||
level="Medium",
|
||||
alert_type="llm_requests_hanging",
|
||||
alerting_metadata=alerting_metadata,
|
||||
)
|
||||
|
||||
async def failed_tracking_alert(self, error_message: str):
|
||||
|
@ -650,7 +665,10 @@ class SlackAlerting(CustomLogger):
|
|||
result = await _cache.async_get_cache(key=_cache_key)
|
||||
if result is None:
|
||||
await self.send_alert(
|
||||
message=message, level="High", alert_type="budget_alerts"
|
||||
message=message,
|
||||
level="High",
|
||||
alert_type="budget_alerts",
|
||||
alerting_metadata={},
|
||||
)
|
||||
await _cache.async_set_cache(
|
||||
key=_cache_key,
|
||||
|
@ -680,7 +698,7 @@ class SlackAlerting(CustomLogger):
|
|||
return
|
||||
if "budget_alerts" not in self.alert_types:
|
||||
return
|
||||
_id: str = "default_id" # used for caching
|
||||
_id: Optional[str] = "default_id" # used for caching
|
||||
user_info_json = user_info.model_dump(exclude_none=True)
|
||||
for k, v in user_info_json.items():
|
||||
user_info_str = "\n{}: {}\n".format(k, v)
|
||||
|
@ -751,6 +769,7 @@ class SlackAlerting(CustomLogger):
|
|||
level="High",
|
||||
alert_type="budget_alerts",
|
||||
user_info=webhook_event,
|
||||
alerting_metadata={},
|
||||
)
|
||||
await _cache.async_set_cache(
|
||||
key=_cache_key,
|
||||
|
@ -769,7 +788,13 @@ class SlackAlerting(CustomLogger):
|
|||
response_cost: Optional[float],
|
||||
max_budget: Optional[float],
|
||||
):
|
||||
if end_user_id is not None and token is not None and response_cost is not None:
|
||||
if (
|
||||
self.alerting is not None
|
||||
and "webhook" in self.alerting
|
||||
and end_user_id is not None
|
||||
and token is not None
|
||||
and response_cost is not None
|
||||
):
|
||||
# log customer spend
|
||||
event = WebhookEvent(
|
||||
spend=response_cost,
|
||||
|
@ -941,7 +966,10 @@ class SlackAlerting(CustomLogger):
|
|||
)
|
||||
# send minor alert
|
||||
await self.send_alert(
|
||||
message=msg, level="Medium", alert_type="outage_alerts"
|
||||
message=msg,
|
||||
level="Medium",
|
||||
alert_type="outage_alerts",
|
||||
alerting_metadata={},
|
||||
)
|
||||
# set to true
|
||||
outage_value["minor_alert_sent"] = True
|
||||
|
@ -963,7 +991,12 @@ class SlackAlerting(CustomLogger):
|
|||
)
|
||||
|
||||
# send minor alert
|
||||
await self.send_alert(message=msg, level="High", alert_type="outage_alerts")
|
||||
await self.send_alert(
|
||||
message=msg,
|
||||
level="High",
|
||||
alert_type="outage_alerts",
|
||||
alerting_metadata={},
|
||||
)
|
||||
# set to true
|
||||
outage_value["major_alert_sent"] = True
|
||||
|
||||
|
@ -1062,7 +1095,10 @@ class SlackAlerting(CustomLogger):
|
|||
)
|
||||
# send minor alert
|
||||
await self.send_alert(
|
||||
message=msg, level="Medium", alert_type="outage_alerts"
|
||||
message=msg,
|
||||
level="Medium",
|
||||
alert_type="outage_alerts",
|
||||
alerting_metadata={},
|
||||
)
|
||||
# set to true
|
||||
outage_value["minor_alert_sent"] = True
|
||||
|
@ -1081,7 +1117,10 @@ class SlackAlerting(CustomLogger):
|
|||
)
|
||||
# send minor alert
|
||||
await self.send_alert(
|
||||
message=msg, level="High", alert_type="outage_alerts"
|
||||
message=msg,
|
||||
level="High",
|
||||
alert_type="outage_alerts",
|
||||
alerting_metadata={},
|
||||
)
|
||||
# set to true
|
||||
outage_value["major_alert_sent"] = True
|
||||
|
@ -1143,7 +1182,10 @@ Model Info:
|
|||
"""
|
||||
|
||||
alert_val = self.send_alert(
|
||||
message=message, level="Low", alert_type="new_model_added"
|
||||
message=message,
|
||||
level="Low",
|
||||
alert_type="new_model_added",
|
||||
alerting_metadata={},
|
||||
)
|
||||
|
||||
if alert_val is not None and asyncio.iscoroutine(alert_val):
|
||||
|
@ -1159,6 +1201,9 @@ Model Info:
|
|||
Currently only implemented for budget alerts
|
||||
|
||||
Returns -> True if sent, False if not.
|
||||
|
||||
Raises Exception
|
||||
- if WEBHOOK_URL is not set
|
||||
"""
|
||||
|
||||
webhook_url = os.getenv("WEBHOOK_URL", None)
|
||||
|
@ -1297,7 +1342,9 @@ Model Info:
|
|||
verbose_proxy_logger.error("Error sending email alert %s", str(e))
|
||||
return False
|
||||
|
||||
async def send_email_alert_using_smtp(self, webhook_event: WebhookEvent) -> bool:
|
||||
async def send_email_alert_using_smtp(
|
||||
self, webhook_event: WebhookEvent, alert_type: str
|
||||
) -> bool:
|
||||
"""
|
||||
Sends structured Email alert to an SMTP server
|
||||
|
||||
|
@ -1306,7 +1353,6 @@ Model Info:
|
|||
Returns -> True if sent, False if not.
|
||||
"""
|
||||
from litellm.proxy.utils import send_email
|
||||
|
||||
from litellm.proxy.proxy_server import premium_user, prisma_client
|
||||
|
||||
email_logo_url = os.getenv(
|
||||
|
@ -1360,6 +1406,10 @@ Model Info:
|
|||
subject=email_event["subject"],
|
||||
html=email_event["html"],
|
||||
)
|
||||
if webhook_event.event_group == "team":
|
||||
from litellm.integrations.email_alerting import send_team_budget_alert
|
||||
|
||||
await send_team_budget_alert(webhook_event=webhook_event)
|
||||
|
||||
return False
|
||||
|
||||
|
@ -1368,6 +1418,7 @@ Model Info:
|
|||
message: str,
|
||||
level: Literal["Low", "Medium", "High"],
|
||||
alert_type: Literal[AlertType],
|
||||
alerting_metadata: dict,
|
||||
user_info: Optional[WebhookEvent] = None,
|
||||
**kwargs,
|
||||
):
|
||||
|
@ -1401,7 +1452,9 @@ Model Info:
|
|||
and user_info is not None
|
||||
):
|
||||
# only send budget alerts over Email
|
||||
await self.send_email_alert_using_smtp(webhook_event=user_info)
|
||||
await self.send_email_alert_using_smtp(
|
||||
webhook_event=user_info, alert_type=alert_type
|
||||
)
|
||||
|
||||
if "slack" not in self.alerting:
|
||||
return
|
||||
|
@ -1425,6 +1478,9 @@ Model Info:
|
|||
if kwargs:
|
||||
for key, value in kwargs.items():
|
||||
formatted_message += f"\n\n{key}: `{value}`\n\n"
|
||||
if alerting_metadata:
|
||||
for key, value in alerting_metadata.items():
|
||||
formatted_message += f"\n\n*Alerting Metadata*: \n{key}: `{value}`\n\n"
|
||||
if _proxy_base_url is not None:
|
||||
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
|
||||
|
||||
|
@ -1440,7 +1496,7 @@ Model Info:
|
|||
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
|
||||
|
||||
if slack_webhook_url is None:
|
||||
raise Exception("Missing SLACK_WEBHOOK_URL from environment")
|
||||
raise ValueError("Missing SLACK_WEBHOOK_URL from environment")
|
||||
payload = {"text": formatted_message}
|
||||
headers = {"Content-type": "application/json"}
|
||||
|
||||
|
@ -1453,7 +1509,7 @@ Model Info:
|
|||
pass
|
||||
else:
|
||||
verbose_proxy_logger.debug(
|
||||
"Error sending slack alert. Error=", response.text
|
||||
"Error sending slack alert. Error={}".format(response.text)
|
||||
)
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
|
@ -1622,6 +1678,7 @@ Model Info:
|
|||
message=_weekly_spend_message,
|
||||
level="Low",
|
||||
alert_type="spend_reports",
|
||||
alerting_metadata={},
|
||||
)
|
||||
except Exception as e:
|
||||
verbose_proxy_logger.error("Error sending weekly spend report", e)
|
||||
|
@ -1673,6 +1730,7 @@ Model Info:
|
|||
message=_spend_message,
|
||||
level="Low",
|
||||
alert_type="spend_reports",
|
||||
alerting_metadata={},
|
||||
)
|
||||
except Exception as e:
|
||||
verbose_proxy_logger.error("Error sending weekly spend report", e)
|
||||
|
|
41 litellm/litellm_core_utils/core_helpers.py Normal file
|
@ -0,0 +1,41 @@
|
|||
# What is this?
|
||||
## Helper utilities for the model response objects
|
||||
|
||||
|
||||
def map_finish_reason(
|
||||
finish_reason: str,
|
||||
): # openai supports 5 stop sequences - 'stop', 'length', 'function_call', 'content_filter', 'null'
|
||||
# anthropic mapping
|
||||
if finish_reason == "stop_sequence":
|
||||
return "stop"
|
||||
# cohere mapping - https://docs.cohere.com/reference/generate
|
||||
elif finish_reason == "COMPLETE":
|
||||
return "stop"
|
||||
elif finish_reason == "MAX_TOKENS": # cohere + vertex ai
|
||||
return "length"
|
||||
elif finish_reason == "ERROR_TOXIC":
|
||||
return "content_filter"
|
||||
elif (
|
||||
finish_reason == "ERROR"
|
||||
): # openai currently doesn't support an 'error' finish reason
|
||||
return "stop"
|
||||
# huggingface mapping https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference/generate_stream
|
||||
elif finish_reason == "eos_token" or finish_reason == "stop_sequence":
|
||||
return "stop"
|
||||
elif (
|
||||
finish_reason == "FINISH_REASON_UNSPECIFIED" or finish_reason == "STOP"
|
||||
): # vertex ai - got from running `print(dir(response_obj.candidates[0].finish_reason))`: ['FINISH_REASON_UNSPECIFIED', 'MAX_TOKENS', 'OTHER', 'RECITATION', 'SAFETY', 'STOP',]
|
||||
return "stop"
|
||||
elif finish_reason == "SAFETY": # vertex ai
|
||||
return "content_filter"
|
||||
elif finish_reason == "STOP": # vertex ai
|
||||
return "stop"
|
||||
elif finish_reason == "end_turn" or finish_reason == "stop_sequence": # anthropic
|
||||
return "stop"
|
||||
elif finish_reason == "max_tokens": # anthropic
|
||||
return "length"
|
||||
elif finish_reason == "tool_use": # anthropic
|
||||
return "tool_calls"
|
||||
elif finish_reason == "content_filtered":
|
||||
return "content_filter"
|
||||
return finish_reason
|
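# Spot-checks of the mapping above (these mirror the existing branches, no new behavior):
assert map_finish_reason("stop_sequence") == "stop"            # anthropic
assert map_finish_reason("MAX_TOKENS") == "length"             # cohere / vertex ai
assert map_finish_reason("tool_use") == "tool_calls"           # anthropic
assert map_finish_reason("some_other_reason") == "some_other_reason"  # passthrough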
1878 litellm/litellm_core_utils/litellm_logging.py Normal file
File diff suppressed because it is too large
210 litellm/litellm_core_utils/llm_cost_calc/google.py Normal file
|
@ -0,0 +1,210 @@
|
|||
# What is this?
|
||||
## Cost calculation for Google AI Studio / Vertex AI models
|
||||
import traceback
|
||||
from typing import List, Literal, Optional, Tuple
|
||||
|
||||
import litellm
|
||||
from litellm import verbose_logger
|
||||
|
||||
"""
|
||||
Gemini pricing covers:
|
||||
- token
|
||||
- image
|
||||
- audio
|
||||
- video
|
||||
"""
|
||||
|
||||
"""
|
||||
Vertex AI -> character based pricing
|
||||
|
||||
Google AI Studio -> token based pricing
|
||||
"""
|
||||
|
||||
models_without_dynamic_pricing = ["gemini-1.0-pro", "gemini-pro"]
|
||||
|
||||
|
||||
def _is_above_128k(tokens: float) -> bool:
|
||||
if tokens > 128000:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def cost_per_character(
|
||||
model: str,
|
||||
custom_llm_provider: str,
|
||||
prompt_tokens: float,
|
||||
completion_tokens: float,
|
||||
prompt_characters: float,
|
||||
completion_characters: float,
|
||||
) -> Tuple[float, float]:
|
||||
"""
|
||||
Calculates the cost per character for a given VertexAI model, input messages, and response object.
|
||||
|
||||
Input:
|
||||
- model: str, the model name without provider prefix
|
||||
- custom_llm_provider: str, "vertex_ai-*"
|
||||
- prompt_characters: float, the number of input characters
|
||||
- completion_characters: float, the number of output characters
|
||||
|
||||
Returns:
|
||||
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
|
||||
|
||||
Raises:
|
||||
Exception if model requires >128k pricing, but model cost not mapped
|
||||
"""
|
||||
model_info = litellm.get_model_info(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
|
||||
## GET MODEL INFO
|
||||
model_info = litellm.get_model_info(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
|
||||
## CALCULATE INPUT COST
|
||||
try:
|
||||
if (
|
||||
_is_above_128k(tokens=prompt_characters * 4) # 1 token = 4 char
|
||||
and model not in models_without_dynamic_pricing
|
||||
):
|
||||
## check if character pricing, else default to token pricing
|
||||
assert (
|
||||
"input_cost_per_character_above_128k_tokens" in model_info
|
||||
and model_info["input_cost_per_character_above_128k_tokens"] is not None
|
||||
), "model info for model={} does not have 'input_cost_per_character_above_128k_tokens'-pricing for > 128k tokens\nmodel_info={}".format(
|
||||
model, model_info
|
||||
)
|
||||
prompt_cost = (
|
||||
prompt_characters
|
||||
* model_info["input_cost_per_character_above_128k_tokens"]
|
||||
)
|
||||
else:
|
||||
assert (
|
||||
"input_cost_per_character" in model_info
|
||||
and model_info["input_cost_per_character"] is not None
|
||||
), "model info for model={} does not have 'input_cost_per_character'-pricing\nmodel_info={}".format(
|
||||
model, model_info
|
||||
)
|
||||
prompt_cost = prompt_characters * model_info["input_cost_per_character"]
|
||||
except Exception as e:
|
||||
verbose_logger.error(
|
||||
"litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Exception occured - {}\n{}\n\
|
||||
Defaulting to (cost_per_token * 4) calculation for prompt_cost".format(
|
||||
str(e), traceback.format_exc()
|
||||
)
|
||||
)
|
||||
initial_prompt_cost, _ = cost_per_token(
|
||||
model=model,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
)
|
||||
|
||||
prompt_cost = initial_prompt_cost * 4
|
||||
|
||||
## CALCULATE OUTPUT COST
|
||||
try:
|
||||
if (
|
||||
_is_above_128k(tokens=completion_characters * 4) # 1 token = 4 char
|
||||
and model not in models_without_dynamic_pricing
|
||||
):
|
||||
assert (
|
||||
"output_cost_per_character_above_128k_tokens" in model_info
|
||||
and model_info["output_cost_per_character_above_128k_tokens"]
|
||||
is not None
|
||||
), "model info for model={} does not have 'output_cost_per_character_above_128k_tokens' pricing\nmodel_info={}".format(
|
||||
model, model_info
|
||||
)
|
||||
completion_cost = (
|
||||
completion_tokens
|
||||
* model_info["output_cost_per_character_above_128k_tokens"]
|
||||
)
|
||||
else:
|
||||
assert (
|
||||
"output_cost_per_character" in model_info
|
||||
and model_info["output_cost_per_character"] is not None
|
||||
), "model info for model={} does not have 'output_cost_per_character'-pricing\nmodel_info={}".format(
|
||||
model, model_info
|
||||
)
|
||||
completion_cost = (
|
||||
completion_tokens * model_info["output_cost_per_character"]
|
||||
)
|
||||
except Exception as e:
|
||||
verbose_logger.error(
|
||||
"litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Exception occured - {}\n{}\n\
|
||||
Defaulting to (cost_per_token * 4) calculation for completion_cost".format(
|
||||
str(e), traceback.format_exc()
|
||||
)
|
||||
)
|
||||
_, initial_completion_cost = cost_per_token(
|
||||
model=model,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
)
|
||||
|
||||
completion_cost = initial_completion_cost * 4
|
||||
return prompt_cost, completion_cost
|
||||
|
||||
|
||||
def cost_per_token(
|
||||
model: str,
|
||||
custom_llm_provider: str,
|
||||
prompt_tokens: float,
|
||||
completion_tokens: float,
|
||||
) -> Tuple[float, float]:
|
||||
"""
|
||||
Calculates the cost per token for a given model, prompt tokens, and completion tokens.
|
||||
|
||||
Input:
|
||||
- model: str, the model name without provider prefix
|
||||
- custom_llm_provider: str, either "vertex_ai-*" or "gemini"
|
||||
- prompt_tokens: float, the number of input tokens
|
||||
- completion_tokens: float, the number of output tokens
|
||||
|
||||
Returns:
|
||||
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
|
||||
|
||||
Raises:
|
||||
Exception if model requires >128k pricing, but model cost not mapped
|
||||
"""
|
||||
## GET MODEL INFO
|
||||
model_info = litellm.get_model_info(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
|
||||
## CALCULATE INPUT COST
|
||||
if (
|
||||
_is_above_128k(tokens=prompt_tokens)
|
||||
and model not in models_without_dynamic_pricing
|
||||
):
|
||||
assert (
|
||||
"input_cost_per_token_above_128k_tokens" in model_info
|
||||
and model_info["input_cost_per_token_above_128k_tokens"] is not None
|
||||
), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
|
||||
model, model_info
|
||||
)
|
||||
prompt_cost = (
|
||||
prompt_tokens * model_info["input_cost_per_token_above_128k_tokens"]
|
||||
)
|
||||
else:
|
||||
prompt_cost = prompt_tokens * model_info["input_cost_per_token"]
|
||||
|
||||
## CALCULATE OUTPUT COST
|
||||
if (
|
||||
_is_above_128k(tokens=completion_tokens)
|
||||
and model not in models_without_dynamic_pricing
|
||||
):
|
||||
assert (
|
||||
"output_cost_per_token_above_128k_tokens" in model_info
|
||||
and model_info["output_cost_per_token_above_128k_tokens"] is not None
|
||||
), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
|
||||
model, model_info
|
||||
)
|
||||
completion_cost = (
|
||||
completion_tokens * model_info["output_cost_per_token_above_128k_tokens"]
|
||||
)
|
||||
else:
|
||||
completion_cost = completion_tokens * model_info["output_cost_per_token"]
|
||||
|
||||
return prompt_cost, completion_cost
|
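# Worked example of the token-based branch above, with made-up rates so the arithmetic
# stays visible (real rates come from litellm.get_model_info()):
example_rates = {"input_cost_per_token": 1.25e-07, "output_cost_per_token": 3.75e-07}
example_prompt_cost = 10_000 * example_rates["input_cost_per_token"]      # 0.00125 USD
example_completion_cost = 2_000 * example_rates["output_cost_per_token"]  # 0.00075 USD
# Above 128k tokens the *_above_128k_tokens rates are used instead, and missing
# pricing keys raise an AssertionError as shown in the function body.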
28 litellm/litellm_core_utils/llm_request_utils.py Normal file
|
@ -0,0 +1,28 @@
|
|||
from typing import Dict, Optional
|
||||
|
||||
|
||||
def _ensure_extra_body_is_safe(extra_body: Optional[Dict]) -> Optional[Dict]:
|
||||
"""
|
||||
Ensure that the extra_body sent in the request is safe, otherwise users will see this error
|
||||
|
||||
"Object of type TextPromptClient is not JSON serializable
|
||||
|
||||
|
||||
Relevant Issue: https://github.com/BerriAI/litellm/issues/4140
|
||||
"""
|
||||
if extra_body is None:
|
||||
return None
|
||||
|
||||
if not isinstance(extra_body, dict):
|
||||
return extra_body
|
||||
|
||||
if "metadata" in extra_body and isinstance(extra_body["metadata"], dict):
|
||||
if "prompt" in extra_body["metadata"]:
|
||||
_prompt = extra_body["metadata"].get("prompt")
|
||||
|
||||
# users can send Langfuse TextPromptClient objects, so we need to convert them to dicts
|
||||
# Langfuse TextPromptClients have .__dict__ attribute
|
||||
if _prompt is not None and hasattr(_prompt, "__dict__"):
|
||||
extra_body["metadata"]["prompt"] = _prompt.__dict__
|
||||
|
||||
return extra_body
|
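# Usage sketch: an object with a __dict__ (e.g. a Langfuse TextPromptClient) nested in
# metadata["prompt"] gets converted to a plain dict so the request body stays
# JSON-serializable. The stand-in class below is illustrative.
class _ExamplePromptClient:
    def __init__(self):
        self.name = "summarizer"
        self.version = 1

example_extra_body = {"metadata": {"prompt": _ExamplePromptClient()}}
example_safe_body = _ensure_extra_body_is_safe(extra_body=example_extra_body)
# example_safe_body["metadata"]["prompt"] == {"name": "summarizer", "version": 1}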
71 litellm/litellm_core_utils/redact_messages.py Normal file
|
@ -0,0 +1,71 @@
|
|||
# +-----------------------------------------------+
|
||||
# | |
|
||||
# | Give Feedback / Get Help |
|
||||
# | https://github.com/BerriAI/litellm/issues/new |
|
||||
# | |
|
||||
# +-----------------------------------------------+
|
||||
#
|
||||
# Thank you users! We ❤️ you! - Krrish & Ishaan
|
||||
|
||||
import copy
|
||||
from typing import TYPE_CHECKING, Any
|
||||
import litellm
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.litellm_core_utils.litellm_logging import (
|
||||
Logging as _LiteLLMLoggingObject,
|
||||
)
|
||||
|
||||
LiteLLMLoggingObject = _LiteLLMLoggingObject
|
||||
else:
|
||||
LiteLLMLoggingObject = Any
|
||||
|
||||
|
||||
def redact_message_input_output_from_logging(
|
||||
litellm_logging_obj: LiteLLMLoggingObject, result
|
||||
):
|
||||
"""
|
||||
Removes messages, prompts, input, response from logging. This modifies the data in-place
|
||||
only redacts when litellm.turn_off_message_logging == True
|
||||
"""
|
||||
# check if user opted out of logging message/response to callbacks
|
||||
if litellm.turn_off_message_logging is not True:
|
||||
return result
|
||||
|
||||
# remove messages, prompts, input, response from logging
|
||||
litellm_logging_obj.model_call_details["messages"] = [
|
||||
{"role": "user", "content": "redacted-by-litellm"}
|
||||
]
|
||||
litellm_logging_obj.model_call_details["prompt"] = ""
|
||||
litellm_logging_obj.model_call_details["input"] = ""
|
||||
|
||||
# response cleaning
|
||||
# ChatCompletion Responses
|
||||
if (
|
||||
litellm_logging_obj.stream is True
|
||||
and "complete_streaming_response" in litellm_logging_obj.model_call_details
|
||||
):
|
||||
_streaming_response = litellm_logging_obj.model_call_details[
|
||||
"complete_streaming_response"
|
||||
]
|
||||
for choice in _streaming_response.choices:
|
||||
if isinstance(choice, litellm.Choices):
|
||||
choice.message.content = "redacted-by-litellm"
|
||||
elif isinstance(choice, litellm.utils.StreamingChoices):
|
||||
choice.delta.content = "redacted-by-litellm"
|
||||
else:
|
||||
if result is not None:
|
||||
if isinstance(result, litellm.ModelResponse):
|
||||
# only deep copy litellm.ModelResponse
|
||||
_result = copy.deepcopy(result)
|
||||
if hasattr(_result, "choices") and _result.choices is not None:
|
||||
for choice in _result.choices:
|
||||
if isinstance(choice, litellm.Choices):
|
||||
choice.message.content = "redacted-by-litellm"
|
||||
elif isinstance(choice, litellm.utils.StreamingChoices):
|
||||
choice.delta.content = "redacted-by-litellm"
|
||||
|
||||
return _result
|
||||
|
||||
# by default return result
|
||||
return result
|
|
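A rough usage sketch of the opt-out flag this helper checks; the model name is a placeholder and the call assumes credentials are already configured:

import litellm

# opt out of logging message/response content to callbacks; only metadata
# (model, usage, cost, etc.) reaches the logging integrations
litellm.turn_off_message_logging = True

response = litellm.completion(
    model="gpt-3.5-turbo",  # placeholder model
    messages=[{"role": "user", "content": "sensitive text"}],
)
# callbacks now see "redacted-by-litellm" in place of the message content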
@@ -5,10 +5,16 @@ import requests, copy  # type: ignore
import time
from functools import partial
from typing import Callable, Optional, List, Union
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
import litellm.litellm_core_utils
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
_get_async_httpx_client,
_get_httpx_client,
)
from .base import BaseLLM
import httpx  # type: ignore
from litellm.types.llms.anthropic import AnthropicMessagesToolChoice

@@ -171,7 +177,7 @@ async def make_call(
logging_obj,
):
if client is None:
client = AsyncHTTPHandler()  # Create a new client if none provided
client = _get_async_httpx_client()  # Create a new client if none provided

response = await client.post(api_base, headers=headers, data=data, stream=True)

@@ -201,7 +207,7 @@ class AnthropicChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],

@@ -316,7 +322,7 @@ class AnthropicChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],

@@ -463,9 +469,7 @@ class AnthropicChatCompletion(BaseLLM):
logger_fn=None,
headers={},
) -> Union[ModelResponse, CustomStreamWrapper]:
async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
async_handler = _get_async_httpx_client()
response = await async_handler.post(api_base, headers=headers, json=data)
if stream and _is_function_call:
return self.process_streaming_response(
@@ -1,42 +1,56 @@
from typing import Optional, Union, Any, Literal, Coroutine, Iterable
from typing_extensions import overload
import types, requests
from .base import BaseLLM
from litellm.utils import (
ModelResponse,
Choices,
Message,
CustomStreamWrapper,
convert_to_model_response_object,
TranscriptionResponse,
get_secret,
UnsupportedParamsError,
)
from typing import Callable, Optional, BinaryIO, List
from litellm import OpenAIConfig
import litellm, json
import httpx  # type: ignore
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
from openai import AzureOpenAI, AsyncAzureOpenAI
import uuid
import asyncio
import json
import os
import types
import uuid
from typing import (
Any,
BinaryIO,
Callable,
Coroutine,
Iterable,
List,
Literal,
Optional,
Union,
)

import httpx  # type: ignore
import requests
from openai import AsyncAzureOpenAI, AzureOpenAI
from typing_extensions import overload

import litellm
from litellm import OpenAIConfig
from litellm.caching import DualCache
from litellm.utils import (
Choices,
CustomStreamWrapper,
Message,
ModelResponse,
TranscriptionResponse,
UnsupportedParamsError,
convert_to_model_response_object,
get_secret,
)

from ..types.llms.openai import (
AsyncCursorPage,
AssistantToolParam,
SyncCursorPage,
Assistant,
MessageData,
OpenAIMessage,
OpenAICreateThreadParamsMessage,
Thread,
AssistantToolParam,
Run,
AssistantEventHandler,
AssistantStreamManager,
AssistantToolParam,
AsyncAssistantEventHandler,
AsyncAssistantStreamManager,
AssistantStreamManager,
AsyncCursorPage,
MessageData,
OpenAICreateThreadParamsMessage,
OpenAIMessage,
Run,
SyncCursorPage,
Thread,
)
from litellm.caching import DualCache
from .base import BaseLLM
from .custom_httpx.azure_dall_e_2 import AsyncCustomHTTPTransport, CustomHTTPTransport

azure_ad_cache = DualCache()


@@ -313,7 +327,9 @@ def select_azure_base_url_or_endpoint(azure_client_params: dict):
def get_azure_ad_token_from_oidc(azure_ad_token: str):
azure_client_id = os.getenv("AZURE_CLIENT_ID", None)
azure_tenant_id = os.getenv("AZURE_TENANT_ID", None)
azure_authority_host = os.getenv("AZURE_AUTHORITY_HOST", "https://login.microsoftonline.com")
azure_authority_host = os.getenv(
"AZURE_AUTHORITY_HOST", "https://login.microsoftonline.com"
)

if azure_client_id is None or azure_tenant_id is None:
raise AzureOpenAIError(

@@ -329,12 +345,14 @@ def get_azure_ad_token_from_oidc(azure_ad_token: str):
message="OIDC token could not be retrieved from secret manager.",
)

azure_ad_token_cache_key = json.dumps({
"azure_client_id": azure_client_id,
"azure_tenant_id": azure_tenant_id,
"azure_authority_host": azure_authority_host,
"oidc_token": oidc_token,
})
azure_ad_token_cache_key = json.dumps(
{
"azure_client_id": azure_client_id,
"azure_tenant_id": azure_tenant_id,
"azure_authority_host": azure_authority_host,
"oidc_token": oidc_token,
}
)

azure_ad_token_access_token = azure_ad_cache.get_cache(azure_ad_token_cache_key)
if azure_ad_token_access_token is not None:

@@ -371,7 +389,11 @@ def get_azure_ad_token_from_oidc(azure_ad_token: str):
status_code=422, message="Azure AD Token expires_in not returned"
)

azure_ad_cache.set_cache(key=azure_ad_token_cache_key, value=azure_ad_token_access_token, ttl=azure_ad_token_expires_in)
azure_ad_cache.set_cache(
key=azure_ad_token_cache_key,
value=azure_ad_token_access_token,
ttl=azure_ad_token_expires_in,
)

return azure_ad_token_access_token


@@ -645,6 +667,8 @@ class AzureChatCompletion(BaseLLM):
except AzureOpenAIError as e:
exception_mapping_worked = True
raise e
except asyncio.CancelledError as e:
raise AzureOpenAIError(status_code=500, message=str(e))
except Exception as e:
if hasattr(e, "status_code"):
raise e
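A rough sketch of the token-caching pattern the hunk above introduces; the key fields and TTL mirror the diff, while the id/token values are placeholders:

import json
from litellm.caching import DualCache

token_cache = DualCache()

# key the cached Entra ID access token on everything that affects its value
cache_key = json.dumps(
    {
        "azure_client_id": "placeholder-client-id",
        "azure_tenant_id": "placeholder-tenant-id",
        "azure_authority_host": "https://login.microsoftonline.com",
        "oidc_token": "placeholder-oidc-token",
    }
)

access_token = token_cache.get_cache(cache_key)
if access_token is None:
    access_token = "placeholder-access-token"  # stands in for the real token exchange
    # expire the cached entry together with the token itself
    token_cache.set_cache(key=cache_key, value=access_token, ttl=3599)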
@@ -2,7 +2,7 @@
import litellm
import httpx, requests
from typing import Optional, Union
from litellm.utils import Logging
from litellm.litellm_core_utils.litellm_logging import Logging


class BaseLLM:

@@ -27,6 +27,25 @@ class BaseLLM:
"""
return model_response

def process_text_completion_response(
self,
model: str,
response: Union[requests.Response, httpx.Response],
model_response: litellm.utils.TextCompletionResponse,
stream: bool,
logging_obj: Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
messages: list,
print_verbose,
encoding,
) -> Union[litellm.utils.TextCompletionResponse, litellm.utils.CustomStreamWrapper]:
"""
Helper function to process the response across sync + async completion calls
"""
return model_response

def create_client_session(self):
if litellm.client_session:
_client_session = litellm.client_session
@@ -1,25 +1,27 @@
import json, copy, types
import copy
import json
import os
import time
import types
import uuid
from enum import Enum
import time, uuid
from typing import Callable, Optional, Any, Union, List
from typing import Any, Callable, List, Optional, Union

import httpx

import litellm
from litellm.utils import (
ModelResponse,
get_secret,
Usage,
ImageResponse,
map_finish_reason,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.types.utils import ImageResponse, ModelResponse, Usage
from litellm.utils import get_secret

from .prompt_templates.factory import (
prompt_factory,
custom_prompt,
construct_tool_use_system_prompt,
contains_tag,
custom_prompt,
extract_between_tags,
parse_xml_params,
contains_tag,
prompt_factory,
)
import httpx


class BedrockError(Exception):

@@ -633,7 +635,11 @@ def init_bedrock_client(
config = boto3.session.Config()

### CHECK STS ###
if aws_web_identity_token is not None and aws_role_name is not None and aws_session_name is not None:
if (
aws_web_identity_token is not None
and aws_role_name is not None
and aws_session_name is not None
):
oidc_token = get_secret(aws_web_identity_token)

if oidc_token is None:

@@ -642,9 +648,7 @@ def init_bedrock_client(
status_code=401,
)

sts_client = boto3.client(
"sts"
)
sts_client = boto3.client("sts")

# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sts/client/assume_role_with_web_identity.html

@@ -726,38 +730,31 @@ def init_bedrock_client(

def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
# handle anthropic prompts and amazon titan prompts
if provider == "anthropic" or provider == "amazon":
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
chat_template_provider = ["anthropic", "amazon", "mistral", "meta"]
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
if provider in chat_template_provider:
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
elif provider == "mistral":
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
elif provider == "meta":
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
else:
prompt = ""
for message in messages:
if "role" in message:
if message["role"] == "user":
prompt += f"{message['content']}"
else:
prompt = ""
for message in messages:
if "role" in message:
if message["role"] == "user":
prompt += f"{message['content']}"
else:
prompt += f"{message['content']}"
else:
prompt += f"{message['content']}"
else:
prompt += f"{message['content']}"
return prompt
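For reference, a rough sketch of the custom_prompt_dict shape the registered-prompt branch above expects; the model name and role markers are made up for illustration:

custom_prompt_dict = {
    "my-bedrock-model": {  # hypothetical model name
        "roles": {
            "user": {"pre_message": "[INST] ", "post_message": " [/INST]"},
            "assistant": {"pre_message": "", "post_message": ""},
        },
        "initial_prompt_value": "",
        "final_prompt_value": "",
    }
}
# convert_messages_to_prompt(model="my-bedrock-model", messages=..., provider="anthropic",
#                            custom_prompt_dict=custom_prompt_dict) then uses this template
# instead of the provider's default chat template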
@ -22,13 +22,12 @@ from typing import (
|
|||
from litellm.utils import (
|
||||
ModelResponse,
|
||||
Usage,
|
||||
map_finish_reason,
|
||||
CustomStreamWrapper,
|
||||
Message,
|
||||
Choices,
|
||||
get_secret,
|
||||
Logging,
|
||||
)
|
||||
from litellm.litellm_core_utils.core_helpers import map_finish_reason
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging
|
||||
from litellm.types.utils import Message, Choices
|
||||
import litellm, uuid
|
||||
from .prompt_templates.factory import (
|
||||
prompt_factory,
|
||||
|
@ -41,7 +40,12 @@ from .prompt_templates.factory import (
|
|||
_bedrock_converse_messages_pt,
|
||||
_bedrock_tools_pt,
|
||||
)
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
AsyncHTTPHandler,
|
||||
HTTPHandler,
|
||||
_get_async_httpx_client,
|
||||
_get_httpx_client,
|
||||
)
|
||||
from .base import BaseLLM
|
||||
import httpx # type: ignore
|
||||
from .bedrock import BedrockError, convert_messages_to_prompt, ModelResponseIterator
|
||||
|
@ -57,6 +61,7 @@ from litellm.caching import DualCache
|
|||
|
||||
iam_cache = DualCache()
|
||||
|
||||
|
||||
class AmazonCohereChatConfig:
|
||||
"""
|
||||
Reference - https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-cohere-command-r-plus.html
|
||||
|
@ -167,7 +172,7 @@ async def make_call(
|
|||
logging_obj,
|
||||
):
|
||||
if client is None:
|
||||
client = AsyncHTTPHandler() # Create a new client if none provided
|
||||
client = _get_async_httpx_client() # Create a new client if none provided
|
||||
|
||||
response = await client.post(api_base, headers=headers, data=data, stream=True)
|
||||
|
||||
|
@ -198,7 +203,7 @@ def make_sync_call(
|
|||
logging_obj,
|
||||
):
|
||||
if client is None:
|
||||
client = HTTPHandler() # Create a new client if none provided
|
||||
client = _get_httpx_client() # Create a new client if none provided
|
||||
|
||||
response = client.post(api_base, headers=headers, data=data, stream=True)
|
||||
|
||||
|
@ -327,13 +332,19 @@ class BedrockLLM(BaseLLM):
|
|||
) = params_to_check
|
||||
|
||||
### CHECK STS ###
|
||||
if aws_web_identity_token is not None and aws_role_name is not None and aws_session_name is not None:
|
||||
iam_creds_cache_key = json.dumps({
|
||||
"aws_web_identity_token": aws_web_identity_token,
|
||||
"aws_role_name": aws_role_name,
|
||||
"aws_session_name": aws_session_name,
|
||||
"aws_region_name": aws_region_name,
|
||||
})
|
||||
if (
|
||||
aws_web_identity_token is not None
|
||||
and aws_role_name is not None
|
||||
and aws_session_name is not None
|
||||
):
|
||||
iam_creds_cache_key = json.dumps(
|
||||
{
|
||||
"aws_web_identity_token": aws_web_identity_token,
|
||||
"aws_role_name": aws_role_name,
|
||||
"aws_session_name": aws_session_name,
|
||||
"aws_region_name": aws_region_name,
|
||||
}
|
||||
)
|
||||
|
||||
iam_creds_dict = iam_cache.get_cache(iam_creds_cache_key)
|
||||
if iam_creds_dict is None:
|
||||
|
@ -348,7 +359,7 @@ class BedrockLLM(BaseLLM):
|
|||
sts_client = boto3.client(
|
||||
"sts",
|
||||
region_name=aws_region_name,
|
||||
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com"
|
||||
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com",
|
||||
)
|
||||
|
||||
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
|
||||
|
@ -362,12 +373,18 @@ class BedrockLLM(BaseLLM):
|
|||
|
||||
iam_creds_dict = {
|
||||
"aws_access_key_id": sts_response["Credentials"]["AccessKeyId"],
|
||||
"aws_secret_access_key": sts_response["Credentials"]["SecretAccessKey"],
|
||||
"aws_secret_access_key": sts_response["Credentials"][
|
||||
"SecretAccessKey"
|
||||
],
|
||||
"aws_session_token": sts_response["Credentials"]["SessionToken"],
|
||||
"region_name": aws_region_name,
|
||||
}
|
||||
|
||||
iam_cache.set_cache(key=iam_creds_cache_key, value=json.dumps(iam_creds_dict), ttl=3600 - 60)
|
||||
iam_cache.set_cache(
|
||||
key=iam_creds_cache_key,
|
||||
value=json.dumps(iam_creds_dict),
|
||||
ttl=3600 - 60,
|
||||
)
|
||||
|
||||
session = boto3.Session(**iam_creds_dict)
|
||||
|
||||
|
@ -976,7 +993,7 @@ class BedrockLLM(BaseLLM):
|
|||
if isinstance(timeout, float) or isinstance(timeout, int):
|
||||
timeout = httpx.Timeout(timeout)
|
||||
_params["timeout"] = timeout
|
||||
self.client = HTTPHandler(**_params) # type: ignore
|
||||
self.client = _get_httpx_client(_params) # type: ignore
|
||||
else:
|
||||
self.client = client
|
||||
if (stream is not None and stream == True) and provider != "ai21":
|
||||
|
@ -1058,7 +1075,7 @@ class BedrockLLM(BaseLLM):
|
|||
if isinstance(timeout, float) or isinstance(timeout, int):
|
||||
timeout = httpx.Timeout(timeout)
|
||||
_params["timeout"] = timeout
|
||||
client = AsyncHTTPHandler(**_params) # type: ignore
|
||||
client = _get_async_httpx_client(_params) # type: ignore
|
||||
else:
|
||||
client = client # type: ignore
|
||||
|
||||
|
@ -1433,13 +1450,19 @@ class BedrockConverseLLM(BaseLLM):
|
|||
) = params_to_check
|
||||
|
||||
### CHECK STS ###
|
||||
if aws_web_identity_token is not None and aws_role_name is not None and aws_session_name is not None:
|
||||
iam_creds_cache_key = json.dumps({
|
||||
"aws_web_identity_token": aws_web_identity_token,
|
||||
"aws_role_name": aws_role_name,
|
||||
"aws_session_name": aws_session_name,
|
||||
"aws_region_name": aws_region_name,
|
||||
})
|
||||
if (
|
||||
aws_web_identity_token is not None
|
||||
and aws_role_name is not None
|
||||
and aws_session_name is not None
|
||||
):
|
||||
iam_creds_cache_key = json.dumps(
|
||||
{
|
||||
"aws_web_identity_token": aws_web_identity_token,
|
||||
"aws_role_name": aws_role_name,
|
||||
"aws_session_name": aws_session_name,
|
||||
"aws_region_name": aws_region_name,
|
||||
}
|
||||
)
|
||||
|
||||
iam_creds_dict = iam_cache.get_cache(iam_creds_cache_key)
|
||||
if iam_creds_dict is None:
|
||||
|
@ -1454,7 +1477,7 @@ class BedrockConverseLLM(BaseLLM):
|
|||
sts_client = boto3.client(
|
||||
"sts",
|
||||
region_name=aws_region_name,
|
||||
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com"
|
||||
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com",
|
||||
)
|
||||
|
||||
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
|
||||
|
@ -1468,12 +1491,18 @@ class BedrockConverseLLM(BaseLLM):
|
|||
|
||||
iam_creds_dict = {
|
||||
"aws_access_key_id": sts_response["Credentials"]["AccessKeyId"],
|
||||
"aws_secret_access_key": sts_response["Credentials"]["SecretAccessKey"],
|
||||
"aws_secret_access_key": sts_response["Credentials"][
|
||||
"SecretAccessKey"
|
||||
],
|
||||
"aws_session_token": sts_response["Credentials"]["SessionToken"],
|
||||
"region_name": aws_region_name,
|
||||
}
|
||||
|
||||
iam_cache.set_cache(key=iam_creds_cache_key, value=json.dumps(iam_creds_dict), ttl=3600 - 60)
|
||||
iam_cache.set_cache(
|
||||
key=iam_creds_cache_key,
|
||||
value=json.dumps(iam_creds_dict),
|
||||
ttl=3600 - 60,
|
||||
)
|
||||
|
||||
session = boto3.Session(**iam_creds_dict)
|
||||
|
||||
|
@ -1575,7 +1604,7 @@ class BedrockConverseLLM(BaseLLM):
|
|||
if isinstance(timeout, float) or isinstance(timeout, int):
|
||||
timeout = httpx.Timeout(timeout)
|
||||
_params["timeout"] = timeout
|
||||
client = AsyncHTTPHandler(**_params) # type: ignore
|
||||
client = _get_async_httpx_client(_params) # type: ignore
|
||||
else:
|
||||
client = client # type: ignore
|
||||
|
||||
|
@ -1847,7 +1876,7 @@ class BedrockConverseLLM(BaseLLM):
|
|||
if isinstance(timeout, float) or isinstance(timeout, int):
|
||||
timeout = httpx.Timeout(timeout)
|
||||
_params["timeout"] = timeout
|
||||
client = HTTPHandler(**_params) # type: ignore
|
||||
client = _get_httpx_client(_params) # type: ignore
|
||||
else:
|
||||
client = client
|
||||
try:
|
||||
|
|
|
@@ -12,6 +12,15 @@ class AsyncHTTPHandler:
timeout: Optional[Union[float, httpx.Timeout]] = None,
concurrent_limit=1000,
):
self.timeout = timeout
self.client = self.create_client(
timeout=timeout, concurrent_limit=concurrent_limit
)

def create_client(
self, timeout: Optional[Union[float, httpx.Timeout]], concurrent_limit: int
) -> httpx.AsyncClient:

async_proxy_mounts = None
# Check if the HTTP_PROXY and HTTPS_PROXY environment variables are set and use them accordingly.
http_proxy = os.getenv("HTTP_PROXY", None)

@@ -39,7 +48,8 @@ class AsyncHTTPHandler:
if timeout is None:
timeout = _DEFAULT_TIMEOUT
# Create a client with a connection pool
self.client = httpx.AsyncClient(

return httpx.AsyncClient(
timeout=timeout,
limits=httpx.Limits(
max_connections=concurrent_limit,

@@ -83,11 +93,48 @@ class AsyncHTTPHandler:
response = await self.client.send(req, stream=stream)
response.raise_for_status()
return response
except httpx.RemoteProtocolError:
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=self.timeout, concurrent_limit=1)
try:
return await self.single_connection_post_request(
url=url,
client=new_client,
data=data,
json=json,
params=params,
headers=headers,
stream=stream,
)
finally:
await new_client.aclose()
except httpx.HTTPStatusError as e:
raise e
except Exception as e:
raise e

async def single_connection_post_request(
self,
url: str,
client: httpx.AsyncClient,
data: Optional[Union[dict, str]] = None,  # type: ignore
json: Optional[dict] = None,
params: Optional[dict] = None,
headers: Optional[dict] = None,
stream: bool = False,
):
"""
Making POST request for a single connection client.

Used for retrying connection client errors.
"""
req = client.build_request(
"POST", url, data=data, json=json, params=params, headers=headers  # type: ignore
)
response = await client.send(req, stream=stream)
response.raise_for_status()
return response

def __del__(self) -> None:
try:
asyncio.get_running_loop().create_task(self.close())

@@ -172,3 +219,60 @@ class HTTPHandler:
self.close()
except Exception:
pass


def _get_async_httpx_client(params: Optional[dict] = None) -> AsyncHTTPHandler:
"""
Retrieves the async HTTP client from the cache
If not present, creates a new client

Caches the new client and returns it.
"""
_params_key_name = ""
if params is not None:
for key, value in params.items():
try:
_params_key_name += f"{key}_{value}"
except Exception:
pass

_cache_key_name = "async_httpx_client" + _params_key_name
if _cache_key_name in litellm.in_memory_llm_clients_cache:
return litellm.in_memory_llm_clients_cache[_cache_key_name]

if params is not None:
_new_client = AsyncHTTPHandler(**params)
else:
_new_client = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
litellm.in_memory_llm_clients_cache[_cache_key_name] = _new_client
return _new_client


def _get_httpx_client(params: Optional[dict] = None) -> HTTPHandler:
"""
Retrieves the HTTP client from the cache
If not present, creates a new client

Caches the new client and returns it.
"""
_params_key_name = ""
if params is not None:
for key, value in params.items():
try:
_params_key_name += f"{key}_{value}"
except Exception:
pass

_cache_key_name = "httpx_client" + _params_key_name
if _cache_key_name in litellm.in_memory_llm_clients_cache:
return litellm.in_memory_llm_clients_cache[_cache_key_name]

if params is not None:
_new_client = HTTPHandler(**params)
else:
_new_client = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))

litellm.in_memory_llm_clients_cache[_cache_key_name] = _new_client
return _new_client
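A minimal sketch of the client reuse these helpers add; calling the factory twice with the same (here: default) params returns the same cached handler instead of opening a new connection pool each time:

from litellm.llms.custom_httpx.http_handler import _get_async_httpx_client

client_a = _get_async_httpx_client()
client_b = _get_async_httpx_client()
assert client_a is client_b  # same AsyncHTTPHandler from litellm.in_memory_llm_clients_cache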
@@ -10,10 +10,10 @@ from typing import Callable, Optional, List, Union, Tuple, Literal
from litellm.utils import (
ModelResponse,
Usage,
map_finish_reason,
CustomStreamWrapper,
EmbeddingResponse,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler

@@ -289,7 +289,7 @@ class DatabricksChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
@@ -1,14 +1,22 @@
import types
import traceback
####################################
######### DEPRECATED FILE ##########
####################################
# logic moved to `vertex_httpx.py` #

import copy
import time
import traceback
import types
from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm

import httpx
from .prompt_templates.factory import prompt_factory, custom_prompt, get_system_prompt
from packaging.version import Version

import litellm
from litellm import verbose_logger
from litellm.utils import Choices, Message, ModelResponse, Usage

from .prompt_templates.factory import custom_prompt, get_system_prompt, prompt_factory


class GeminiError(Exception):

@@ -186,8 +194,8 @@ def completion(
if _system_instruction and len(system_prompt) > 0:
_params["system_instruction"] = system_prompt
_model = genai.GenerativeModel(**_params)
if stream == True:
if acompletion == True:
if stream is True:
if acompletion is True:

async def async_streaming():
try:
@ -1,33 +1,41 @@
|
|||
import hashlib
|
||||
import json
|
||||
import time
|
||||
import traceback
|
||||
import types
|
||||
from typing import (
|
||||
Optional,
|
||||
Union,
|
||||
Any,
|
||||
BinaryIO,
|
||||
Literal,
|
||||
Callable,
|
||||
Coroutine,
|
||||
Iterable,
|
||||
Literal,
|
||||
Optional,
|
||||
Union,
|
||||
)
|
||||
import hashlib
|
||||
from typing_extensions import override, overload
|
||||
from pydantic import BaseModel
|
||||
import types, time, json, traceback
|
||||
|
||||
import httpx
|
||||
from .base import BaseLLM
|
||||
from litellm.utils import (
|
||||
ModelResponse,
|
||||
Choices,
|
||||
Message,
|
||||
CustomStreamWrapper,
|
||||
convert_to_model_response_object,
|
||||
Usage,
|
||||
TranscriptionResponse,
|
||||
TextCompletionResponse,
|
||||
)
|
||||
from typing import Callable, Optional, Coroutine
|
||||
import litellm
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
from openai import OpenAI, AsyncOpenAI
|
||||
from ..types.llms.openai import *
|
||||
import openai
|
||||
from openai import AsyncOpenAI, OpenAI
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import overload, override
|
||||
|
||||
import litellm
|
||||
from litellm.types.utils import ProviderField
|
||||
from litellm.utils import (
|
||||
Choices,
|
||||
CustomStreamWrapper,
|
||||
Message,
|
||||
ModelResponse,
|
||||
TextCompletionResponse,
|
||||
TranscriptionResponse,
|
||||
Usage,
|
||||
convert_to_model_response_object,
|
||||
)
|
||||
|
||||
from ..types.llms.openai import *
|
||||
from .base import BaseLLM
|
||||
from .prompt_templates.factory import custom_prompt, prompt_factory
|
||||
|
||||
|
||||
class OpenAIError(Exception):
|
||||
|
@ -207,6 +215,25 @@ class MistralEmbeddingConfig:
|
|||
return optional_params
|
||||
|
||||
|
||||
class AzureAIStudioConfig:
|
||||
def get_required_params(self) -> List[ProviderField]:
|
||||
"""For a given provider, return it's required fields with a description"""
|
||||
return [
|
||||
ProviderField(
|
||||
field_name="api_key",
|
||||
field_type="string",
|
||||
field_description="Your Azure AI Studio API Key.",
|
||||
field_value="zEJ...",
|
||||
),
|
||||
ProviderField(
|
||||
field_name="api_base",
|
||||
field_type="string",
|
||||
field_description="Your Azure AI Studio API Base.",
|
||||
field_value="https://Mistral-serverless.",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class DeepInfraConfig:
|
||||
"""
|
||||
Reference: https://deepinfra.com/docs/advanced/openai_api
|
||||
|
@ -286,8 +313,12 @@ class DeepInfraConfig:
|
|||
]
|
||||
|
||||
def map_openai_params(
|
||||
self, non_default_params: dict, optional_params: dict, model: str
|
||||
):
|
||||
self,
|
||||
non_default_params: dict,
|
||||
optional_params: dict,
|
||||
model: str,
|
||||
drop_params: bool,
|
||||
) -> dict:
|
||||
supported_openai_params = self.get_supported_openai_params()
|
||||
for param, value in non_default_params.items():
|
||||
if (
|
||||
|
@ -296,8 +327,23 @@ class DeepInfraConfig:
|
|||
and model == "mistralai/Mistral-7B-Instruct-v0.1"
|
||||
): # this model does no support temperature == 0
|
||||
value = 0.0001 # close to 0
|
||||
if param == "tool_choice":
|
||||
if (
|
||||
value != "auto" and value != "none"
|
||||
): # https://deepinfra.com/docs/advanced/function_calling
|
||||
## UNSUPPORTED TOOL CHOICE VALUE
|
||||
if litellm.drop_params is True or drop_params is True:
|
||||
value = None
|
||||
else:
|
||||
raise litellm.utils.UnsupportedParamsError(
|
||||
message="Deepinfra doesn't support tool_choice={}. To drop unsupported openai params from the call, set `litellm.drop_params = True`".format(
|
||||
value
|
||||
),
|
||||
status_code=400,
|
||||
)
|
||||
if param in supported_openai_params:
|
||||
optional_params[param] = value
|
||||
if value is not None:
|
||||
optional_params[param] = value
|
||||
return optional_params
|
||||
|
||||
|
||||
|
@ -1530,6 +1576,7 @@ class OpenAITextCompletion(BaseLLM):
|
|||
response = openai_client.completions.create(**data) # type: ignore
|
||||
|
||||
response_json = response.model_dump()
|
||||
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt,
|
||||
|
|
|
@@ -12,11 +12,11 @@ from typing import Callable, Optional, List, Literal, Union
from litellm.utils import (
ModelResponse,
Usage,
map_finish_reason,
CustomStreamWrapper,
Message,
Choices,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler

@@ -198,7 +198,7 @@ class PredibaseChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
@ -1,24 +1,30 @@
|
|||
import json
|
||||
import re
|
||||
import traceback
|
||||
import uuid
|
||||
import xml.etree.ElementTree as ET
|
||||
from enum import Enum
|
||||
import requests, traceback
|
||||
import json, re, xml.etree.ElementTree as ET
|
||||
from jinja2 import Template, exceptions, meta, BaseLoader
|
||||
from jinja2.sandbox import ImmutableSandboxedEnvironment
|
||||
from typing import Any, List, Mapping, MutableMapping, Optional, Sequence, Tuple
|
||||
|
||||
import requests
|
||||
from jinja2 import BaseLoader, Template, exceptions, meta
|
||||
from jinja2.sandbox import ImmutableSandboxedEnvironment
|
||||
|
||||
import litellm
|
||||
import litellm.types
|
||||
from litellm.types.completion import (
|
||||
ChatCompletionUserMessageParam,
|
||||
ChatCompletionSystemMessageParam,
|
||||
ChatCompletionMessageParam,
|
||||
ChatCompletionFunctionMessageParam,
|
||||
ChatCompletionMessageToolCallParam,
|
||||
ChatCompletionToolMessageParam,
|
||||
)
|
||||
import litellm.types.llms
|
||||
from litellm.types.llms.anthropic import *
|
||||
import uuid
|
||||
from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
|
||||
import litellm.types.llms.vertex_ai
|
||||
from litellm.types.completion import (
|
||||
ChatCompletionFunctionMessageParam,
|
||||
ChatCompletionMessageParam,
|
||||
ChatCompletionMessageToolCallParam,
|
||||
ChatCompletionSystemMessageParam,
|
||||
ChatCompletionToolMessageParam,
|
||||
ChatCompletionUserMessageParam,
|
||||
)
|
||||
from litellm.types.llms.anthropic import *
|
||||
from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
|
||||
from litellm.types.utils import GenericImageParsingChunk
|
||||
|
||||
|
||||
def default_pt(messages):
|
||||
|
@ -622,9 +628,10 @@ def construct_tool_use_system_prompt(
|
|||
|
||||
|
||||
def convert_url_to_base64(url):
|
||||
import requests
|
||||
import base64
|
||||
|
||||
import requests
|
||||
|
||||
for _ in range(3):
|
||||
try:
|
||||
response = requests.get(url)
|
||||
|
@ -654,7 +661,7 @@ def convert_url_to_base64(url):
|
|||
raise Exception(f"Error: Unable to fetch image from URL. url={url}")
|
||||
|
||||
|
||||
def convert_to_anthropic_image_obj(openai_image_url: str):
|
||||
def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsingChunk:
|
||||
"""
|
||||
Input:
|
||||
"image_url": "data:image/jpeg;base64,{base64_image}",
|
||||
|
@ -675,11 +682,11 @@ def convert_to_anthropic_image_obj(openai_image_url: str):
|
|||
# Infer image format from the URL
|
||||
image_format = openai_image_url.split("data:image/")[1].split(";base64,")[0]
|
||||
|
||||
return {
|
||||
"type": "base64",
|
||||
"media_type": f"image/{image_format}",
|
||||
"data": base64_data,
|
||||
}
|
||||
return GenericImageParsingChunk(
|
||||
type="base64",
|
||||
media_type=f"image/{image_format}",
|
||||
data=base64_data,
|
||||
)
|
||||
except Exception as e:
|
||||
if "Error: Unable to fetch image from URL" in str(e):
|
||||
raise e
|
||||
|
@ -1606,19 +1613,23 @@ def azure_text_pt(messages: list):
|
|||
|
||||
###### AMAZON BEDROCK #######
|
||||
|
||||
from litellm.types.llms.bedrock import ContentBlock as BedrockContentBlock
|
||||
from litellm.types.llms.bedrock import ImageBlock as BedrockImageBlock
|
||||
from litellm.types.llms.bedrock import ImageSourceBlock as BedrockImageSourceBlock
|
||||
from litellm.types.llms.bedrock import ToolBlock as BedrockToolBlock
|
||||
from litellm.types.llms.bedrock import (
|
||||
ToolResultContentBlock as BedrockToolResultContentBlock,
|
||||
ToolResultBlock as BedrockToolResultBlock,
|
||||
ToolConfigBlock as BedrockToolConfigBlock,
|
||||
ToolUseBlock as BedrockToolUseBlock,
|
||||
ImageSourceBlock as BedrockImageSourceBlock,
|
||||
ImageBlock as BedrockImageBlock,
|
||||
ContentBlock as BedrockContentBlock,
|
||||
ToolInputSchemaBlock as BedrockToolInputSchemaBlock,
|
||||
ToolSpecBlock as BedrockToolSpecBlock,
|
||||
ToolBlock as BedrockToolBlock,
|
||||
ToolChoiceValuesBlock as BedrockToolChoiceValuesBlock,
|
||||
)
|
||||
from litellm.types.llms.bedrock import ToolConfigBlock as BedrockToolConfigBlock
|
||||
from litellm.types.llms.bedrock import (
|
||||
ToolInputSchemaBlock as BedrockToolInputSchemaBlock,
|
||||
)
|
||||
from litellm.types.llms.bedrock import ToolResultBlock as BedrockToolResultBlock
|
||||
from litellm.types.llms.bedrock import (
|
||||
ToolResultContentBlock as BedrockToolResultContentBlock,
|
||||
)
|
||||
from litellm.types.llms.bedrock import ToolSpecBlock as BedrockToolSpecBlock
|
||||
from litellm.types.llms.bedrock import ToolUseBlock as BedrockToolUseBlock
|
||||
|
||||
|
||||
def get_image_details(image_url) -> Tuple[str, str]:
|
||||
|
@ -1655,7 +1666,8 @@ def get_image_details(image_url) -> Tuple[str, str]:
|
|||
def _process_bedrock_converse_image_block(image_url: str) -> BedrockImageBlock:
|
||||
if "base64" in image_url:
|
||||
# Case 1: Images with base64 encoding
|
||||
import base64, re
|
||||
import base64
|
||||
import re
|
||||
|
||||
# base 64 is passed as data:image/jpeg;base64,<base-64-encoded-image>
|
||||
image_metadata, img_without_base_64 = image_url.split(",")
|
||||
|
|
532 litellm/llms/text_completion_codestral.py Normal file
|
@ -0,0 +1,532 @@
|
|||
# What is this?
|
||||
## Controller file for TextCompletionCodestral Integration - https://codestral.com/
|
||||
|
||||
from functools import partial
|
||||
import os, types
|
||||
import traceback
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests, copy # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional, List, Literal, Union
|
||||
from litellm.utils import (
|
||||
TextCompletionResponse,
|
||||
Usage,
|
||||
CustomStreamWrapper,
|
||||
Message,
|
||||
Choices,
|
||||
)
|
||||
from litellm.litellm_core_utils.core_helpers import map_finish_reason
|
||||
from litellm.types.llms.databricks import GenericStreamingChunk
|
||||
import litellm
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
|
||||
from .base import BaseLLM
|
||||
import httpx # type: ignore
|
||||
|
||||
|
||||
class TextCompletionCodestralError(Exception):
|
||||
def __init__(
|
||||
self,
|
||||
status_code,
|
||||
message,
|
||||
request: Optional[httpx.Request] = None,
|
||||
response: Optional[httpx.Response] = None,
|
||||
):
|
||||
self.status_code = status_code
|
||||
self.message = message
|
||||
if request is not None:
|
||||
self.request = request
|
||||
else:
|
||||
self.request = httpx.Request(
|
||||
method="POST",
|
||||
url="https://docs.codestral.com/user-guide/inference/rest_api",
|
||||
)
|
||||
if response is not None:
|
||||
self.response = response
|
||||
else:
|
||||
self.response = httpx.Response(
|
||||
status_code=status_code, request=self.request
|
||||
)
|
||||
super().__init__(
|
||||
self.message
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
|
||||
async def make_call(
|
||||
client: AsyncHTTPHandler,
|
||||
api_base: str,
|
||||
headers: dict,
|
||||
data: str,
|
||||
model: str,
|
||||
messages: list,
|
||||
logging_obj,
|
||||
):
|
||||
response = await client.post(api_base, headers=headers, data=data, stream=True)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise TextCompletionCodestralError(
|
||||
status_code=response.status_code, message=response.text
|
||||
)
|
||||
|
||||
completion_stream = response.aiter_lines()
|
||||
# LOGGING
|
||||
logging_obj.post_call(
|
||||
input=messages,
|
||||
api_key="",
|
||||
original_response=completion_stream, # Pass the completion stream for logging
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
|
||||
return completion_stream
|
||||
|
||||
|
||||
class MistralTextCompletionConfig:
|
||||
"""
|
||||
Reference: https://docs.mistral.ai/api/#operation/createFIMCompletion
|
||||
"""
|
||||
|
||||
suffix: Optional[str] = None
|
||||
temperature: Optional[int] = None
|
||||
top_p: Optional[float] = None
|
||||
max_tokens: Optional[int] = None
|
||||
min_tokens: Optional[int] = None
|
||||
stream: Optional[bool] = None
|
||||
random_seed: Optional[int] = None
|
||||
stop: Optional[str] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
suffix: Optional[str] = None,
|
||||
temperature: Optional[int] = None,
|
||||
top_p: Optional[float] = None,
|
||||
max_tokens: Optional[int] = None,
|
||||
min_tokens: Optional[int] = None,
|
||||
stream: Optional[bool] = None,
|
||||
random_seed: Optional[int] = None,
|
||||
stop: Optional[str] = None,
|
||||
) -> None:
|
||||
locals_ = locals().copy()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
||||
@classmethod
|
||||
def get_config(cls):
|
||||
return {
|
||||
k: v
|
||||
for k, v in cls.__dict__.items()
|
||||
if not k.startswith("__")
|
||||
and not isinstance(
|
||||
v,
|
||||
(
|
||||
types.FunctionType,
|
||||
types.BuiltinFunctionType,
|
||||
classmethod,
|
||||
staticmethod,
|
||||
),
|
||||
)
|
||||
and v is not None
|
||||
}
|
||||
|
||||
def get_supported_openai_params(self):
|
||||
return [
|
||||
"suffix",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"max_tokens",
|
||||
"stream",
|
||||
"seed",
|
||||
"stop",
|
||||
]
|
||||
|
||||
def map_openai_params(self, non_default_params: dict, optional_params: dict):
|
||||
for param, value in non_default_params.items():
|
||||
if param == "suffix":
|
||||
optional_params["suffix"] = value
|
||||
if param == "temperature":
|
||||
optional_params["temperature"] = value
|
||||
if param == "top_p":
|
||||
optional_params["top_p"] = value
|
||||
if param == "max_tokens":
|
||||
optional_params["max_tokens"] = value
|
||||
if param == "stream" and value == True:
|
||||
optional_params["stream"] = value
|
||||
if param == "stop":
|
||||
optional_params["stop"] = value
|
||||
if param == "seed":
|
||||
optional_params["random_seed"] = value
|
||||
if param == "min_tokens":
|
||||
optional_params["min_tokens"] = value
|
||||
|
||||
return optional_params
|
||||
|
||||
def _chunk_parser(self, chunk_data: str) -> GenericStreamingChunk:
|
||||
text = ""
|
||||
is_finished = False
|
||||
finish_reason = None
|
||||
logprobs = None
|
||||
|
||||
chunk_data = chunk_data.replace("data:", "")
|
||||
chunk_data = chunk_data.strip()
|
||||
if len(chunk_data) == 0 or chunk_data == "[DONE]":
|
||||
return {
|
||||
"text": "",
|
||||
"is_finished": is_finished,
|
||||
"finish_reason": finish_reason,
|
||||
}
|
||||
chunk_data_dict = json.loads(chunk_data)
|
||||
original_chunk = litellm.ModelResponse(**chunk_data_dict, stream=True)
|
||||
_choices = chunk_data_dict.get("choices", []) or []
|
||||
_choice = _choices[0]
|
||||
text = _choice.get("delta", {}).get("content", "")
|
||||
|
||||
if _choice.get("finish_reason") is not None:
|
||||
is_finished = True
|
||||
finish_reason = _choice.get("finish_reason")
|
||||
logprobs = _choice.get("logprobs")
|
||||
|
||||
return GenericStreamingChunk(
|
||||
text=text,
|
||||
original_chunk=original_chunk,
|
||||
is_finished=is_finished,
|
||||
finish_reason=finish_reason,
|
||||
logprobs=logprobs,
|
||||
)
|
||||
|
||||
|
||||
class CodestralTextCompletion(BaseLLM):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def _validate_environment(
|
||||
self,
|
||||
api_key: Optional[str],
|
||||
user_headers: dict,
|
||||
) -> dict:
|
||||
if api_key is None:
|
||||
raise ValueError(
|
||||
"Missing CODESTRAL_API_Key - Please add CODESTRAL_API_Key to your environment variables"
|
||||
)
|
||||
headers = {
|
||||
"content-type": "application/json",
|
||||
"Authorization": "Bearer {}".format(api_key),
|
||||
}
|
||||
if user_headers is not None and isinstance(user_headers, dict):
|
||||
headers = {**headers, **user_headers}
|
||||
return headers
|
||||
|
||||
def output_parser(self, generated_text: str):
|
||||
"""
|
||||
Parse the output text to remove any special characters. In our current approach we just check for ChatML tokens.
|
||||
|
||||
Initial issue that prompted this - https://github.com/BerriAI/litellm/issues/763
|
||||
"""
|
||||
chat_template_tokens = [
|
||||
"<|assistant|>",
|
||||
"<|system|>",
|
||||
"<|user|>",
|
||||
"<s>",
|
||||
"</s>",
|
||||
]
|
||||
for token in chat_template_tokens:
|
||||
if generated_text.strip().startswith(token):
|
||||
generated_text = generated_text.replace(token, "", 1)
|
||||
if generated_text.endswith(token):
|
||||
generated_text = generated_text[::-1].replace(token[::-1], "", 1)[::-1]
|
||||
return generated_text
|
||||
|
||||
def process_text_completion_response(
|
||||
self,
|
||||
model: str,
|
||||
response: Union[requests.Response, httpx.Response],
|
||||
model_response: TextCompletionResponse,
|
||||
stream: bool,
|
||||
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
|
||||
optional_params: dict,
|
||||
api_key: str,
|
||||
data: Union[dict, str],
|
||||
messages: list,
|
||||
print_verbose,
|
||||
encoding,
|
||||
) -> TextCompletionResponse:
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=messages,
|
||||
api_key=api_key,
|
||||
original_response=response.text,
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
print_verbose(f"codestral api: raw model_response: {response.text}")
|
||||
## RESPONSE OBJECT
|
||||
if response.status_code != 200:
|
||||
raise TextCompletionCodestralError(
|
||||
message=str(response.text),
|
||||
status_code=response.status_code,
|
||||
)
|
||||
try:
|
||||
completion_response = response.json()
|
||||
except:
|
||||
raise TextCompletionCodestralError(message=response.text, status_code=422)
|
||||
|
||||
_original_choices = completion_response.get("choices", [])
|
||||
_choices: List[litellm.utils.TextChoices] = []
|
||||
for choice in _original_choices:
|
||||
# This is what 1 choice looks like from codestral API
|
||||
# {
|
||||
# "index": 0,
|
||||
# "message": {
|
||||
# "role": "assistant",
|
||||
# "content": "\n assert is_odd(1)\n assert",
|
||||
# "tool_calls": null
|
||||
# },
|
||||
# "finish_reason": "length",
|
||||
# "logprobs": null
|
||||
# }
|
||||
_finish_reason = None
|
||||
_index = 0
|
||||
_text = None
|
||||
_logprobs = None
|
||||
|
||||
_choice_message = choice.get("message", {})
|
||||
_choice = litellm.utils.TextChoices(
|
||||
finish_reason=choice.get("finish_reason"),
|
||||
index=choice.get("index"),
|
||||
text=_choice_message.get("content"),
|
||||
logprobs=choice.get("logprobs"),
|
||||
)
|
||||
|
||||
_choices.append(_choice)
|
||||
|
||||
_response = litellm.TextCompletionResponse(
|
||||
id=completion_response.get("id"),
|
||||
choices=_choices,
|
||||
created=completion_response.get("created"),
|
||||
model=completion_response.get("model"),
|
||||
usage=completion_response.get("usage"),
|
||||
stream=False,
|
||||
object=completion_response.get("object"),
|
||||
)
|
||||
return _response
|
||||
|
||||
def completion(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: TextCompletionResponse,
|
||||
print_verbose: Callable,
|
||||
encoding,
|
||||
api_key: str,
|
||||
logging_obj,
|
||||
optional_params: dict,
|
||||
timeout: Union[float, httpx.Timeout],
|
||||
acompletion=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers: dict = {},
|
||||
) -> Union[TextCompletionResponse, CustomStreamWrapper]:
|
||||
headers = self._validate_environment(api_key, headers)
|
||||
|
||||
completion_url = api_base or "https://codestral.mistral.ai/v1/fim/completions"
|
||||
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
prompt = custom_prompt(
|
||||
role_dict=model_prompt_details["roles"],
|
||||
initial_prompt_value=model_prompt_details["initial_prompt_value"],
|
||||
final_prompt_value=model_prompt_details["final_prompt_value"],
|
||||
messages=messages,
|
||||
)
|
||||
else:
|
||||
prompt = prompt_factory(model=model, messages=messages)
|
||||
|
||||
## Load Config
|
||||
config = litellm.MistralTextCompletionConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if (
|
||||
k not in optional_params
|
||||
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
stream = optional_params.pop("stream", False)
|
||||
|
||||
data = {
|
||||
"prompt": prompt,
|
||||
**optional_params,
|
||||
}
|
||||
input_text = prompt
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
additional_args={
|
||||
"complete_input_dict": data,
|
||||
"headers": headers,
|
||||
"api_base": completion_url,
|
||||
"acompletion": acompletion,
|
||||
},
|
||||
)
|
||||
## COMPLETION CALL
|
||||
if acompletion is True:
|
||||
### ASYNC STREAMING
|
||||
if stream is True:
|
||||
return self.async_streaming(
|
||||
model=model,
|
||||
messages=messages,
|
||||
data=data,
|
||||
api_base=completion_url,
|
||||
model_response=model_response,
|
||||
print_verbose=print_verbose,
|
||||
encoding=encoding,
|
||||
api_key=api_key,
|
||||
logging_obj=logging_obj,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
logger_fn=logger_fn,
|
||||
headers=headers,
|
||||
timeout=timeout,
|
||||
) # type: ignore
|
||||
else:
|
||||
### ASYNC COMPLETION
|
||||
return self.async_completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
data=data,
|
||||
api_base=completion_url,
|
||||
model_response=model_response,
|
||||
print_verbose=print_verbose,
|
||||
encoding=encoding,
|
||||
api_key=api_key,
|
||||
logging_obj=logging_obj,
|
||||
optional_params=optional_params,
|
||||
stream=False,
|
||||
litellm_params=litellm_params,
|
||||
logger_fn=logger_fn,
|
||||
headers=headers,
|
||||
timeout=timeout,
|
||||
) # type: ignore
|
||||
|
||||
### SYNC STREAMING
|
||||
if stream is True:
|
||||
response = requests.post(
|
||||
completion_url,
|
||||
headers=headers,
|
||||
data=json.dumps(data),
|
||||
stream=stream,
|
||||
)
|
||||
_response = CustomStreamWrapper(
|
||||
response.iter_lines(),
|
||||
model,
|
||||
custom_llm_provider="codestral",
|
||||
logging_obj=logging_obj,
|
||||
)
|
||||
return _response
|
||||
### SYNC COMPLETION
|
||||
else:
|
||||
response = requests.post(
|
||||
url=completion_url,
|
||||
headers=headers,
|
||||
data=json.dumps(data),
|
||||
)
|
||||
return self.process_text_completion_response(
|
||||
model=model,
|
||||
response=response,
|
||||
model_response=model_response,
|
||||
stream=optional_params.get("stream", False),
|
||||
logging_obj=logging_obj, # type: ignore
|
||||
optional_params=optional_params,
|
||||
api_key=api_key,
|
||||
data=data,
|
||||
messages=messages,
|
||||
print_verbose=print_verbose,
|
||||
encoding=encoding,
|
||||
)
|
||||
|
||||
async def async_completion(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
model_response: TextCompletionResponse,
|
||||
print_verbose: Callable,
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
stream,
|
||||
data: dict,
|
||||
optional_params: dict,
|
||||
timeout: Union[float, httpx.Timeout],
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
) -> TextCompletionResponse:
|
||||
|
||||
async_handler = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=timeout))
|
||||
try:
|
||||
response = await async_handler.post(
|
||||
api_base, headers=headers, data=json.dumps(data)
|
||||
)
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise TextCompletionCodestralError(
|
||||
status_code=e.response.status_code,
|
||||
message="HTTPStatusError - {}".format(e.response.text),
|
||||
)
|
||||
except Exception as e:
|
||||
raise TextCompletionCodestralError(
|
||||
status_code=500, message="{}\n{}".format(str(e), traceback.format_exc())
|
||||
)
|
||||
return self.process_text_completion_response(
|
||||
model=model,
|
||||
response=response,
|
||||
model_response=model_response,
|
||||
stream=stream,
|
||||
logging_obj=logging_obj,
|
||||
api_key=api_key,
|
||||
data=data,
|
||||
messages=messages,
|
||||
print_verbose=print_verbose,
|
||||
optional_params=optional_params,
|
||||
encoding=encoding,
|
||||
)
|
||||
|
||||
async def async_streaming(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
model_response: TextCompletionResponse,
|
||||
print_verbose: Callable,
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
data: dict,
|
||||
timeout: Union[float, httpx.Timeout],
|
||||
optional_params=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
) -> CustomStreamWrapper:
|
||||
data["stream"] = True
|
||||
|
||||
streamwrapper = CustomStreamWrapper(
|
||||
completion_stream=None,
|
||||
make_call=partial(
|
||||
make_call,
|
||||
api_base=api_base,
|
||||
headers=headers,
|
||||
data=json.dumps(data),
|
||||
model=model,
|
||||
messages=messages,
|
||||
logging_obj=logging_obj,
|
||||
),
|
||||
model=model,
|
||||
custom_llm_provider="text-completion-codestral",
|
||||
logging_obj=logging_obj,
|
||||
)
|
||||
return streamwrapper
|
||||
|
||||
def embedding(self, *args, **kwargs):
|
||||
pass
|
|
@@ -4,7 +4,6 @@ from enum import Enum
import requests, copy  # type: ignore
import time
from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
@ -1,17 +1,22 @@
import os, types
import inspect
import json
from enum import Enum
import requests  # type: ignore
import os
import time
from typing import Callable, Optional, Union, List, Literal, Any
import types
import uuid
from enum import Enum
from typing import Any, Callable, List, Literal, Optional, Union

import httpx  # type: ignore
import requests  # type: ignore
from pydantic import BaseModel
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
import litellm, uuid
import httpx, inspect  # type: ignore
from litellm.types.llms.vertex_ai import *

import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.prompt_templates.factory import (
    convert_to_gemini_tool_call_result,
    convert_to_anthropic_image_obj,
    convert_to_gemini_tool_call_invoke,
    convert_to_gemini_tool_call_result,
)
from litellm.types.files import (
    get_file_mime_type_for_file_type,

@ -19,6 +24,8 @@ from litellm.types.files import (
    is_gemini_1_5_accepted_file_type,
    is_video_file_type,
)
from litellm.types.llms.vertex_ai import *
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage


class VertexAIError(Exception):

@ -273,28 +280,6 @@ def _get_image_bytes_from_url(image_url: str) -> bytes:
        raise Exception(f"An exception occurs with this image - {str(e)}")


def _load_image_from_url(image_url: str):
    """
    Loads an image from a URL.

    Args:
        image_url (str): The URL of the image.

    Returns:
        Image: The loaded image.
    """
    from vertexai.preview.generative_models import (
        GenerativeModel,
        Part,
        GenerationConfig,
        Image,
    )

    image_bytes = _get_image_bytes_from_url(image_url)

    return Image.from_bytes(data=image_bytes)


def _convert_gemini_role(role: str) -> Literal["user", "model"]:
    if role == "user":
        return "user"

@ -322,28 +307,9 @@ def _process_gemini_image(image_url: str) -> PartType:
            return PartType(file_data=file_data)

        # Direct links
        elif "https:/" in image_url:
            image = _load_image_from_url(image_url)
            _blob = BlobType(data=image.data, mime_type=image._mime_type)
            return PartType(inline_data=_blob)

        # Base64 encoding
        elif "base64" in image_url:
            import base64, re

            # base 64 is passed as data:image/jpeg;base64,<base-64-encoded-image>
            image_metadata, img_without_base_64 = image_url.split(",")

            # read mime_type from img_without_base_64=data:image/jpeg;base64
            # Extract MIME type using regular expression
            mime_type_match = re.match(r"data:(.*?);base64", image_metadata)

            if mime_type_match:
                mime_type = mime_type_match.group(1)
            else:
                mime_type = "image/jpeg"
            decoded_img = base64.b64decode(img_without_base_64)
            _blob = BlobType(data=decoded_img, mime_type=mime_type)
        elif "https:/" in image_url or "base64" in image_url:
            image = convert_to_anthropic_image_obj(image_url)
            _blob = BlobType(data=image["data"], mime_type=image["media_type"])
            return PartType(inline_data=_blob)
        raise Exception("Invalid image received - {}".format(image_url))
    except Exception as e:

@ -371,7 +337,7 @@ def _gemini_convert_messages_with_history(messages: list) -> List[ContentType]:
            _parts: List[PartType] = []
            for element in messages[msg_i]["content"]:
                if isinstance(element, dict):
                    if element["type"] == "text":
                    if element["type"] == "text" and len(element["text"]) > 0:
                        _part = PartType(text=element["text"])
                        _parts.append(_part)
                    elif element["type"] == "image_url":

@ -379,7 +345,10 @@ def _gemini_convert_messages_with_history(messages: list) -> List[ContentType]:
                        _part = _process_gemini_image(image_url=image_url)
                        _parts.append(_part)  # type: ignore
            user_content.extend(_parts)
        else:
        elif (
            isinstance(messages[msg_i]["content"], str)
            and len(messages[msg_i]["content"]) > 0
        ):
            _part = PartType(text=messages[msg_i]["content"])
            user_content.append(_part)

@ -479,23 +448,25 @@ def completion(
            message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""",
        )
    try:
        import google.auth  # type: ignore
        import proto  # type: ignore
        from google.cloud import aiplatform  # type: ignore
        from google.cloud.aiplatform_v1beta1.types import (
            content as gapic_content_types,  # type: ignore
        )
        from google.protobuf import json_format  # type: ignore
        from google.protobuf.struct_pb2 import Value  # type: ignore
        from vertexai.language_models import CodeGenerationModel, TextGenerationModel
        from vertexai.preview.generative_models import (
            GenerationConfig,
            GenerativeModel,
            Part,
        )
        from vertexai.preview.language_models import (
            ChatModel,
            CodeChatModel,
            InputOutputTextPair,
        )
        from vertexai.language_models import TextGenerationModel, CodeGenerationModel
        from vertexai.preview.generative_models import (
            GenerativeModel,
            Part,
            GenerationConfig,
        )
        from google.cloud import aiplatform  # type: ignore
        from google.protobuf import json_format  # type: ignore
        from google.protobuf.struct_pb2 import Value  # type: ignore
        from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types  # type: ignore
        import google.auth  # type: ignore
        import proto  # type: ignore

        ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
        print_verbose(

@ -617,7 +588,7 @@ def completion(
    llm_model = None

    # NOTE: async prediction and streaming under "private" mode isn't supported by aiplatform right now
    if acompletion == True:
    if acompletion is True:
        data = {
            "llm_model": llm_model,
            "mode": mode,

@ -649,7 +620,7 @@ def completion(
            tools = optional_params.pop("tools", None)
            content = _gemini_convert_messages_with_history(messages=messages)
            stream = optional_params.pop("stream", False)
            if stream == True:
            if stream is True:
                request_str += f"response = llm_model.generate_content({content}, generation_config=GenerationConfig(**{optional_params}), safety_settings={safety_settings}, stream={stream})\n"
                logging_obj.pre_call(
                    input=prompt,

@ -1411,8 +1382,8 @@ def embedding(
            message="vertexai import failed please run `pip install google-cloud-aiplatform`",
        )

    from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput
    import google.auth  # type: ignore
    from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

    ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
    try:

@ -6,7 +6,8 @@ from enum import Enum
import requests, copy  # type: ignore
import time, uuid
from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from .prompt_templates.factory import (

@ -237,7 +238,10 @@ def completion(
    if vertex_credentials is not None and isinstance(vertex_credentials, str):
        import google.oauth2.service_account

        json_obj = json.loads(vertex_credentials)
        try:
            json_obj = json.loads(vertex_credentials)
        except json.JSONDecodeError:
            json_obj = json.load(open(vertex_credentials))

        creds = (
            google.oauth2.service_account.Credentials.from_service_account_info(

File diff suppressed because it is too large
435 litellm/main.py
@ -7,107 +7,132 @@
|
|||
#
|
||||
# Thank you ! We ❤️ you! - Krrish & Ishaan
|
||||
|
||||
import os, openai, sys, json, inspect, uuid, datetime, threading
|
||||
from typing import Any, Literal, Union, BinaryIO
|
||||
from typing_extensions import overload
|
||||
from functools import partial
|
||||
|
||||
import dotenv, traceback, random, asyncio, time, contextvars
|
||||
import asyncio
|
||||
import contextvars
|
||||
import datetime
|
||||
import inspect
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
import uuid
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from copy import deepcopy
|
||||
from functools import partial
|
||||
from typing import (
|
||||
Any,
|
||||
BinaryIO,
|
||||
Callable,
|
||||
Dict,
|
||||
List,
|
||||
Literal,
|
||||
Mapping,
|
||||
Optional,
|
||||
Union,
|
||||
)
|
||||
|
||||
import dotenv
|
||||
import httpx
|
||||
import openai
|
||||
import tiktoken
|
||||
from typing_extensions import overload
|
||||
|
||||
import litellm
|
||||
from ._logging import verbose_logger
|
||||
from litellm import ( # type: ignore
|
||||
Logging,
|
||||
client,
|
||||
exception_type,
|
||||
get_optional_params,
|
||||
get_litellm_params,
|
||||
Logging,
|
||||
get_optional_params,
|
||||
)
|
||||
from litellm.utils import (
|
||||
get_secret,
|
||||
CustomStreamWrapper,
|
||||
read_config_args,
|
||||
completion_with_fallbacks,
|
||||
get_llm_provider,
|
||||
get_api_key,
|
||||
mock_completion_streaming_obj,
|
||||
Usage,
|
||||
async_mock_completion_streaming_obj,
|
||||
completion_with_fallbacks,
|
||||
convert_to_model_response_object,
|
||||
token_counter,
|
||||
create_pretrained_tokenizer,
|
||||
create_tokenizer,
|
||||
Usage,
|
||||
get_api_key,
|
||||
get_llm_provider,
|
||||
get_optional_params_embeddings,
|
||||
get_optional_params_image_gen,
|
||||
get_secret,
|
||||
mock_completion_streaming_obj,
|
||||
read_config_args,
|
||||
supports_httpx_timeout,
|
||||
token_counter,
|
||||
)
|
||||
|
||||
from ._logging import verbose_logger
|
||||
from .caching import disable_cache, enable_cache, update_cache
|
||||
from .llms import (
|
||||
anthropic_text,
|
||||
together_ai,
|
||||
ai21,
|
||||
sagemaker,
|
||||
bedrock,
|
||||
triton,
|
||||
huggingface_restapi,
|
||||
replicate,
|
||||
aleph_alpha,
|
||||
nlp_cloud,
|
||||
anthropic_text,
|
||||
baseten,
|
||||
vllm,
|
||||
ollama,
|
||||
ollama_chat,
|
||||
cloudflare,
|
||||
bedrock,
|
||||
clarifai,
|
||||
cloudflare,
|
||||
cohere,
|
||||
cohere_chat,
|
||||
petals,
|
||||
gemini,
|
||||
huggingface_restapi,
|
||||
maritalk,
|
||||
nlp_cloud,
|
||||
ollama,
|
||||
ollama_chat,
|
||||
oobabooga,
|
||||
openrouter,
|
||||
palm,
|
||||
gemini,
|
||||
petals,
|
||||
replicate,
|
||||
sagemaker,
|
||||
together_ai,
|
||||
triton,
|
||||
vertex_ai,
|
||||
vertex_ai_anthropic,
|
||||
maritalk,
|
||||
vllm,
|
||||
watsonx,
|
||||
)
|
||||
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
|
||||
from .llms.azure import AzureChatCompletion
|
||||
from .llms.databricks import DatabricksChatCompletion
|
||||
from .llms.azure_text import AzureTextCompletion
|
||||
from .llms.anthropic import AnthropicChatCompletion
|
||||
from .llms.anthropic_text import AnthropicTextCompletion
|
||||
from .llms.azure import AzureChatCompletion
|
||||
from .llms.azure_text import AzureTextCompletion
|
||||
from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM
|
||||
from .llms.databricks import DatabricksChatCompletion
|
||||
from .llms.huggingface_restapi import Huggingface
|
||||
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
|
||||
from .llms.predibase import PredibaseChatCompletion
|
||||
from .llms.bedrock_httpx import BedrockLLM, BedrockConverseLLM
|
||||
from .llms.vertex_httpx import VertexLLM
|
||||
from .llms.triton import TritonChatCompletion
|
||||
from .llms.prompt_templates.factory import (
|
||||
prompt_factory,
|
||||
custom_prompt,
|
||||
function_call_prompt,
|
||||
map_system_message_pt,
|
||||
prompt_factory,
|
||||
)
|
||||
import tiktoken
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Callable, List, Optional, Dict, Union, Mapping
|
||||
from .caching import enable_cache, disable_cache, update_cache
|
||||
from .llms.text_completion_codestral import CodestralTextCompletion
|
||||
from .llms.triton import TritonChatCompletion
|
||||
from .llms.vertex_httpx import VertexLLM
|
||||
from .types.llms.openai import HttpxBinaryResponseContent
|
||||
from .types.utils import ChatCompletionMessageToolCall
|
||||
|
||||
encoding = tiktoken.get_encoding("cl100k_base")
|
||||
from litellm.utils import (
|
||||
get_secret,
|
||||
Choices,
|
||||
CustomStreamWrapper,
|
||||
TextCompletionStreamWrapper,
|
||||
ModelResponse,
|
||||
TextCompletionResponse,
|
||||
TextChoices,
|
||||
EmbeddingResponse,
|
||||
ImageResponse,
|
||||
read_config_args,
|
||||
Choices,
|
||||
Message,
|
||||
ModelResponse,
|
||||
TextChoices,
|
||||
TextCompletionResponse,
|
||||
TextCompletionStreamWrapper,
|
||||
TranscriptionResponse,
|
||||
get_secret,
|
||||
read_config_args,
|
||||
)
|
||||
|
||||
####### ENVIRONMENT VARIABLES ###################
|
||||
|
@ -120,6 +145,7 @@ azure_chat_completions = AzureChatCompletion()
azure_text_completions = AzureTextCompletion()
huggingface = Huggingface()
predibase_chat_completions = PredibaseChatCompletion()
codestral_text_completions = CodestralTextCompletion()
triton_chat_completions = TritonChatCompletion()
bedrock_chat_completion = BedrockLLM()
bedrock_converse_chat_completion = BedrockConverseLLM()

@ -322,6 +348,8 @@ async def acompletion(
            or custom_llm_provider == "deepinfra"
            or custom_llm_provider == "perplexity"
            or custom_llm_provider == "groq"
            or custom_llm_provider == "codestral"
            or custom_llm_provider == "text-completion-codestral"
            or custom_llm_provider == "deepseek"
            or custom_llm_provider == "text-completion-openai"
            or custom_llm_provider == "huggingface"

@ -329,6 +357,7 @@ async def acompletion(
            or custom_llm_provider == "ollama_chat"
            or custom_llm_provider == "replicate"
            or custom_llm_provider == "vertex_ai"
            or custom_llm_provider == "vertex_ai_beta"
            or custom_llm_provider == "gemini"
            or custom_llm_provider == "sagemaker"
            or custom_llm_provider == "anthropic"

@ -350,9 +379,10 @@ async def acompletion(
        else:
            response = init_response  # type: ignore

        if custom_llm_provider == "text-completion-openai" and isinstance(
            response, TextCompletionResponse
        ):
        if (
            custom_llm_provider == "text-completion-openai"
            or custom_llm_provider == "text-completion-codestral"
        ) and isinstance(response, TextCompletionResponse):
            response = litellm.OpenAITextCompletionConfig().convert_to_chat_model_response_object(
                response_object=response,
                model_response_object=litellm.ModelResponse(),

@ -367,7 +397,9 @@ async def acompletion(
        return response
    except Exception as e:
        verbose_logger.error(
            "litellm.acompletion(): Exception occured - {}".format(str(e))
            "litellm.acompletion(): Exception occured - {}\n{}".format(
                str(e), traceback.format_exc()
            )
        )
        verbose_logger.debug(traceback.format_exc())
        custom_llm_provider = custom_llm_provider or "openai"

@ -397,7 +429,9 @@ def mock_completion(
    messages: List,
    stream: Optional[bool] = False,
    mock_response: Union[str, Exception] = "This is a mock request",
    mock_tool_calls: Optional[List] = None,
    logging=None,
    custom_llm_provider=None,
    **kwargs,
):
    """

@ -435,7 +469,7 @@ def mock_completion(
            raise litellm.APIError(
                status_code=getattr(mock_response, "status_code", 500),  # type: ignore
                message=getattr(mock_response, "text", str(mock_response)),
                llm_provider=getattr(mock_response, "llm_provider", "openai"),  # type: ignore
                llm_provider=getattr(mock_response, "llm_provider", custom_llm_provider or "openai"),  # type: ignore
                model=model,  # type: ignore
                request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
            )

@ -464,6 +498,12 @@ def mock_completion(
        model_response["created"] = int(time.time())
        model_response["model"] = model

        if mock_tool_calls:
            model_response["choices"][0]["message"]["tool_calls"] = [
                ChatCompletionMessageToolCall(**tool_call)
                for tool_call in mock_tool_calls
            ]

        setattr(
            model_response,
            "usage",

@ -577,6 +617,7 @@ def completion(
    args = locals()
    api_base = kwargs.get("api_base", None)
    mock_response = kwargs.get("mock_response", None)
    mock_tool_calls = kwargs.get("mock_tool_calls", None)
    force_timeout = kwargs.get("force_timeout", 600)  ## deprecated
    logger_fn = kwargs.get("logger_fn", None)
    verbose = kwargs.get("verbose", False)

@ -895,15 +936,17 @@ def completion(
            litellm_params=litellm_params,
            custom_llm_provider=custom_llm_provider,
        )
        if mock_response:
        if mock_response or mock_tool_calls:
            return mock_completion(
                model,
                messages,
                stream=stream,
                mock_response=mock_response,
                mock_tool_calls=mock_tool_calls,
                logging=logging,
                acompletion=acompletion,
                mock_delay=kwargs.get("mock_delay", None),
                custom_llm_provider=custom_llm_provider,
            )
        if custom_llm_provider == "azure":
            # azure configs

@ -1035,91 +1078,6 @@ def completion(
|
|||
"api_base": api_base,
|
||||
},
|
||||
)
|
||||
elif (
|
||||
model in litellm.open_ai_chat_completion_models
|
||||
or custom_llm_provider == "custom_openai"
|
||||
or custom_llm_provider == "deepinfra"
|
||||
or custom_llm_provider == "perplexity"
|
||||
or custom_llm_provider == "groq"
|
||||
or custom_llm_provider == "deepseek"
|
||||
or custom_llm_provider == "anyscale"
|
||||
or custom_llm_provider == "mistral"
|
||||
or custom_llm_provider == "openai"
|
||||
or custom_llm_provider == "together_ai"
|
||||
or custom_llm_provider in litellm.openai_compatible_providers
|
||||
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
|
||||
): # allow user to make an openai call with a custom base
|
||||
# note: if a user sets a custom base - we should ensure this works
|
||||
# allow for the setting of dynamic and stateful api-bases
|
||||
api_base = (
|
||||
api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there
|
||||
or litellm.api_base
|
||||
or get_secret("OPENAI_API_BASE")
|
||||
or "https://api.openai.com/v1"
|
||||
)
|
||||
openai.organization = (
|
||||
organization
|
||||
or litellm.organization
|
||||
or get_secret("OPENAI_ORGANIZATION")
|
||||
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
|
||||
)
|
||||
# set API KEY
|
||||
api_key = (
|
||||
api_key
|
||||
or litellm.api_key # for deepinfra/perplexity/anyscale/friendliai we check in get_llm_provider and pass in the api key from there
|
||||
or litellm.openai_key
|
||||
or get_secret("OPENAI_API_KEY")
|
||||
)
|
||||
|
||||
headers = headers or litellm.headers
|
||||
|
||||
## LOAD CONFIG - if set
|
||||
config = litellm.OpenAIConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if (
|
||||
k not in optional_params
|
||||
): # completion(top_k=3) > openai_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
## COMPLETION CALL
|
||||
try:
|
||||
response = openai_chat_completions.completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
headers=headers,
|
||||
model_response=model_response,
|
||||
print_verbose=print_verbose,
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
acompletion=acompletion,
|
||||
logging_obj=logging,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
logger_fn=logger_fn,
|
||||
timeout=timeout, # type: ignore
|
||||
custom_prompt_dict=custom_prompt_dict,
|
||||
client=client, # pass AsyncOpenAI, OpenAI client
|
||||
organization=organization,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
)
|
||||
except Exception as e:
|
||||
## LOGGING - log the original exception returned
|
||||
logging.post_call(
|
||||
input=messages,
|
||||
api_key=api_key,
|
||||
original_response=str(e),
|
||||
additional_args={"headers": headers},
|
||||
)
|
||||
raise e
|
||||
|
||||
if optional_params.get("stream", False):
|
||||
## LOGGING
|
||||
logging.post_call(
|
||||
input=messages,
|
||||
api_key=api_key,
|
||||
original_response=response,
|
||||
additional_args={"headers": headers},
|
||||
)
|
||||
elif (
|
||||
custom_llm_provider == "text-completion-openai"
|
||||
or "ft:babbage-002" in model
|
||||
|
@ -1203,6 +1161,93 @@ def completion(
|
|||
additional_args={"headers": headers},
|
||||
)
|
||||
response = _response
|
||||
|
||||
elif (
|
||||
model in litellm.open_ai_chat_completion_models
|
||||
or custom_llm_provider == "custom_openai"
|
||||
or custom_llm_provider == "deepinfra"
|
||||
or custom_llm_provider == "perplexity"
|
||||
or custom_llm_provider == "groq"
|
||||
or custom_llm_provider == "codestral"
|
||||
or custom_llm_provider == "deepseek"
|
||||
or custom_llm_provider == "anyscale"
|
||||
or custom_llm_provider == "mistral"
|
||||
or custom_llm_provider == "openai"
|
||||
or custom_llm_provider == "together_ai"
|
||||
or custom_llm_provider in litellm.openai_compatible_providers
|
||||
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
|
||||
): # allow user to make an openai call with a custom base
|
||||
# note: if a user sets a custom base - we should ensure this works
|
||||
# allow for the setting of dynamic and stateful api-bases
|
||||
api_base = (
|
||||
api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there
|
||||
or litellm.api_base
|
||||
or get_secret("OPENAI_API_BASE")
|
||||
or "https://api.openai.com/v1"
|
||||
)
|
||||
openai.organization = (
|
||||
organization
|
||||
or litellm.organization
|
||||
or get_secret("OPENAI_ORGANIZATION")
|
||||
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
|
||||
)
|
||||
# set API KEY
|
||||
api_key = (
|
||||
api_key
|
||||
or litellm.api_key # for deepinfra/perplexity/anyscale/friendliai we check in get_llm_provider and pass in the api key from there
|
||||
or litellm.openai_key
|
||||
or get_secret("OPENAI_API_KEY")
|
||||
)
|
||||
|
||||
headers = headers or litellm.headers
|
||||
|
||||
## LOAD CONFIG - if set
|
||||
config = litellm.OpenAIConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if (
|
||||
k not in optional_params
|
||||
): # completion(top_k=3) > openai_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
## COMPLETION CALL
|
||||
try:
|
||||
response = openai_chat_completions.completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
headers=headers,
|
||||
model_response=model_response,
|
||||
print_verbose=print_verbose,
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
acompletion=acompletion,
|
||||
logging_obj=logging,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
logger_fn=logger_fn,
|
||||
timeout=timeout, # type: ignore
|
||||
custom_prompt_dict=custom_prompt_dict,
|
||||
client=client, # pass AsyncOpenAI, OpenAI client
|
||||
organization=organization,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
)
|
||||
except Exception as e:
|
||||
## LOGGING - log the original exception returned
|
||||
logging.post_call(
|
||||
input=messages,
|
||||
api_key=api_key,
|
||||
original_response=str(e),
|
||||
additional_args={"headers": headers},
|
||||
)
|
||||
raise e
|
||||
|
||||
if optional_params.get("stream", False):
|
||||
## LOGGING
|
||||
logging.post_call(
|
||||
input=messages,
|
||||
api_key=api_key,
|
||||
original_response=response,
|
||||
additional_args={"headers": headers},
|
||||
)
|
||||
elif (
|
||||
"replicate" in model
|
||||
or custom_llm_provider == "replicate"
|
||||
|
@ -1840,7 +1885,25 @@ def completion(
            )
            return response
        response = model_response
    elif custom_llm_provider == "gemini":
    elif custom_llm_provider == "vertex_ai_beta" or custom_llm_provider == "gemini":
        vertex_ai_project = (
            optional_params.pop("vertex_project", None)
            or optional_params.pop("vertex_ai_project", None)
            or litellm.vertex_project
            or get_secret("VERTEXAI_PROJECT")
        )
        vertex_ai_location = (
            optional_params.pop("vertex_location", None)
            or optional_params.pop("vertex_ai_location", None)
            or litellm.vertex_location
            or get_secret("VERTEXAI_LOCATION")
        )
        vertex_credentials = (
            optional_params.pop("vertex_credentials", None)
            or optional_params.pop("vertex_ai_credentials", None)
            or get_secret("VERTEXAI_CREDENTIALS")
        )

        gemini_api_key = (
            api_key
            or get_secret("GEMINI_API_KEY")

@ -1848,34 +1911,28 @@ def completion(
            or litellm.api_key
        )

        # palm does not support streaming as yet :(
        model_response = gemini.completion(
        new_params = deepcopy(optional_params)
        response = vertex_chat_completion.completion(  # type: ignore
            model=model,
            messages=messages,
            model_response=model_response,
            print_verbose=print_verbose,
            optional_params=optional_params,
            optional_params=new_params,
            litellm_params=litellm_params,
            logger_fn=logger_fn,
            encoding=encoding,
            api_key=gemini_api_key,
            vertex_location=vertex_ai_location,
            vertex_project=vertex_ai_project,
            vertex_credentials=vertex_credentials,
            gemini_api_key=gemini_api_key,
            logging_obj=logging,
            acompletion=acompletion,
            custom_prompt_dict=custom_prompt_dict,
            timeout=timeout,
            custom_llm_provider=custom_llm_provider,
            client=client,
            api_base=api_base,
        )
        if (
            "stream" in optional_params
            and optional_params["stream"] == True
            and acompletion == False
        ):
            response = CustomStreamWrapper(
                iter(model_response),
                model,
                custom_llm_provider="gemini",
                logging_obj=logging,
            )
            return response
        response = model_response

    elif custom_llm_provider == "vertex_ai":
        vertex_ai_project = (
            optional_params.pop("vertex_project", None)

@ -1894,6 +1951,7 @@ def completion(
            or optional_params.pop("vertex_ai_credentials", None)
            or get_secret("VERTEXAI_CREDENTIALS")
        )

        new_params = deepcopy(optional_params)
        if "claude-3" in model:
            model_response = vertex_ai_anthropic.completion(

@ -1982,6 +2040,46 @@ def completion(
                timeout=timeout,
            )

            if (
                "stream" in optional_params
                and optional_params["stream"] is True
                and acompletion is False
            ):
                return _model_response
            response = _model_response
    elif custom_llm_provider == "text-completion-codestral":

        api_base = (
            api_base
            or optional_params.pop("api_base", None)
            or optional_params.pop("base_url", None)
            or litellm.api_base
            or "https://codestral.mistral.ai/v1/fim/completions"
        )

        api_key = api_key or litellm.api_key or get_secret("CODESTRAL_API_KEY")

        text_completion_model_response = litellm.TextCompletionResponse(
            stream=stream
        )

        _model_response = codestral_text_completions.completion(  # type: ignore
            model=model,
            messages=messages,
            model_response=text_completion_model_response,
            print_verbose=print_verbose,
            optional_params=optional_params,
            litellm_params=litellm_params,
            logger_fn=logger_fn,
            encoding=encoding,
            logging_obj=logging,
            acompletion=acompletion,
            api_base=api_base,
            custom_prompt_dict=custom_prompt_dict,
            api_key=api_key,
            timeout=timeout,
        )

        if (
            "stream" in optional_params
            and optional_params["stream"] is True

@ -3371,7 +3469,9 @@ def embedding(

###### Text Completion ################
@client
async def atext_completion(*args, **kwargs):
async def atext_completion(
    *args, **kwargs
) -> Union[TextCompletionResponse, TextCompletionStreamWrapper]:
    """
    Implemented to handle async streaming for the text completion endpoint
    """

@ -3403,6 +3503,7 @@ async def atext_completion(*args, **kwargs):
            or custom_llm_provider == "deepinfra"
            or custom_llm_provider == "perplexity"
            or custom_llm_provider == "groq"
            or custom_llm_provider == "text-completion-codestral"
            or custom_llm_provider == "deepseek"
            or custom_llm_provider == "fireworks_ai"
            or custom_llm_provider == "text-completion-openai"

@ -3664,6 +3765,7 @@ def text_completion(
            custom_llm_provider == "openai"
            or custom_llm_provider == "azure"
            or custom_llm_provider == "azure_text"
            or custom_llm_provider == "text-completion-codestral"
            or custom_llm_provider == "text-completion-openai"
        )
        and isinstance(prompt, list)

@ -3680,6 +3782,12 @@ def text_completion(
        )

    kwargs.pop("prompt", None)

    if model is not None and model.startswith(
        "openai/"
    ):  # for openai compatible endpoints - e.g. vllm, call the native /v1/completions endpoint for text completion calls
        model = model.replace("openai/", "text-completion-openai/")

    kwargs["text_completion"] = True
    response = completion(
        model=model,

@ -3842,6 +3950,7 @@ def image_generation(
    proxy_server_request = kwargs.get("proxy_server_request", None)
    model_info = kwargs.get("model_info", None)
    metadata = kwargs.get("metadata", {})
    client = kwargs.get("client", None)

    model_response = litellm.utils.ImageResponse()
    if model is not None or custom_llm_provider is not None:

@ -3980,6 +4089,7 @@ def image_generation(
            model_response=model_response,
            api_version=api_version,
            aimg_generation=aimg_generation,
            client=client,
        )
    elif custom_llm_provider == "openai":
        model_response = openai_chat_completions.image_generation(

@ -3992,6 +4102,7 @@ def image_generation(
            optional_params=optional_params,
            model_response=model_response,
            aimg_generation=aimg_generation,
            client=client,
        )
    elif custom_llm_provider == "bedrock":
        if model is None:

@ -234,6 +234,30 @@
|
|||
"litellm_provider": "openai",
|
||||
"mode": "chat"
|
||||
},
|
||||
"ft:gpt-4-0613": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 8192,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00003,
|
||||
"output_cost_per_token": 0.00006,
|
||||
"litellm_provider": "openai",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"source": "OpenAI needs to add pricing for this ft model, will be updated when added by OpenAI. Defaulting to base model pricing"
|
||||
},
|
||||
"ft:gpt-4o-2024-05-13": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.000005,
|
||||
"output_cost_per_token": 0.000015,
|
||||
"litellm_provider": "openai",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_parallel_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"source": "OpenAI needs to add pricing for this ft model, will be updated when added by OpenAI. Defaulting to base model pricing"
|
||||
},
|
||||
"ft:davinci-002": {
|
||||
"max_tokens": 16384,
|
||||
"max_input_tokens": 16384,
|
||||
|
@ -499,7 +523,7 @@
|
|||
"max_tokens": 4096,
|
||||
"max_input_tokens": 16384,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.0000015,
|
||||
"input_cost_per_token": 0.000001,
|
||||
"output_cost_per_token": 0.000002,
|
||||
"litellm_provider": "azure",
|
||||
"mode": "chat",
|
||||
|
@ -841,7 +865,7 @@
|
|||
},
|
||||
"deepseek-coder": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 16000,
|
||||
"max_input_tokens": 32000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00000014,
|
||||
"output_cost_per_token": 0.00000028,
|
||||
|
@ -862,7 +886,7 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 8192,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.00000010,
|
||||
"input_cost_per_token": 0.00000005,
|
||||
"output_cost_per_token": 0.00000010,
|
||||
"litellm_provider": "groq",
|
||||
"mode": "chat",
|
||||
|
@ -872,8 +896,8 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 8192,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.00000064,
|
||||
"output_cost_per_token": 0.00000080,
|
||||
"input_cost_per_token": 0.00000059,
|
||||
"output_cost_per_token": 0.00000079,
|
||||
"litellm_provider": "groq",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true
|
||||
|
@ -991,6 +1015,18 @@
|
|||
"supports_vision": true,
|
||||
"tool_use_system_prompt_tokens": 159
|
||||
},
|
||||
"claude-3-5-sonnet-20240620": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 200000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.000003,
|
||||
"output_cost_per_token": 0.000015,
|
||||
"litellm_provider": "anthropic",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"tool_use_system_prompt_tokens": 159
|
||||
},
|
||||
"text-bison": {
|
||||
"max_tokens": 1024,
|
||||
"max_input_tokens": 8192,
|
||||
|
@ -1155,30 +1191,42 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 32760,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.00000025,
|
||||
"output_cost_per_token": 0.0000005,
|
||||
"input_cost_per_image": 0.0025,
|
||||
"input_cost_per_video_per_second": 0.002,
|
||||
"input_cost_per_token": 0.0000005,
|
||||
"input_cost_per_character": 0.000000125,
|
||||
"output_cost_per_token": 0.0000015,
|
||||
"output_cost_per_character": 0.000000375,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing"
|
||||
},
|
||||
"gemini-1.0-pro": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 32760,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.00000025,
|
||||
"output_cost_per_token": 0.0000005,
|
||||
"input_cost_per_image": 0.0025,
|
||||
"input_cost_per_video_per_second": 0.002,
|
||||
"input_cost_per_token": 0.0000005,
|
||||
"input_cost_per_character": 0.000000125,
|
||||
"output_cost_per_token": 0.0000015,
|
||||
"output_cost_per_character": 0.000000375,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
|
||||
},
|
||||
"gemini-1.0-pro-001": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 32760,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.00000025,
|
||||
"output_cost_per_token": 0.0000005,
|
||||
"input_cost_per_image": 0.0025,
|
||||
"input_cost_per_video_per_second": 0.002,
|
||||
"input_cost_per_token": 0.0000005,
|
||||
"input_cost_per_character": 0.000000125,
|
||||
"output_cost_per_token": 0.0000015,
|
||||
"output_cost_per_character": 0.000000375,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
|
@ -1188,8 +1236,12 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 32760,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.00000025,
|
||||
"output_cost_per_token": 0.0000005,
|
||||
"input_cost_per_image": 0.0025,
|
||||
"input_cost_per_video_per_second": 0.002,
|
||||
"input_cost_per_token": 0.0000005,
|
||||
"input_cost_per_character": 0.000000125,
|
||||
"output_cost_per_token": 0.0000015,
|
||||
"output_cost_per_character": 0.000000375,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
|
@ -1199,14 +1251,157 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.000000625,
|
||||
"output_cost_per_token": 0.000001875,
|
||||
"input_cost_per_image": 0.001315,
|
||||
"input_cost_per_audio_per_second": 0.000125,
|
||||
"input_cost_per_video_per_second": 0.001315,
|
||||
"input_cost_per_token": 0.000005,
|
||||
"input_cost_per_character": 0.00000125,
|
||||
"input_cost_per_token_above_128k_tokens": 0.00001,
|
||||
"input_cost_per_character_above_128k_tokens": 0.0000025,
|
||||
"output_cost_per_token": 0.000015,
|
||||
"output_cost_per_character": 0.00000375,
|
||||
"output_cost_per_token_above_128k_tokens": 0.00003,
|
||||
"output_cost_per_character_above_128k_tokens": 0.0000075,
|
||||
"output_cost_per_image": 0.00263,
|
||||
"output_cost_per_video_per_second": 0.00263,
|
||||
"output_cost_per_audio_per_second": 0.00025,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini-1.5-pro-001": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_image": 0.001315,
|
||||
"input_cost_per_audio_per_second": 0.000125,
|
||||
"input_cost_per_video_per_second": 0.001315,
|
||||
"input_cost_per_token": 0.000005,
|
||||
"input_cost_per_character": 0.00000125,
|
||||
"input_cost_per_token_above_128k_tokens": 0.00001,
|
||||
"input_cost_per_character_above_128k_tokens": 0.0000025,
|
||||
"output_cost_per_token": 0.000015,
|
||||
"output_cost_per_character": 0.00000375,
|
||||
"output_cost_per_token_above_128k_tokens": 0.00003,
|
||||
"output_cost_per_character_above_128k_tokens": 0.0000075,
|
||||
"output_cost_per_image": 0.00263,
|
||||
"output_cost_per_video_per_second": 0.00263,
|
||||
"output_cost_per_audio_per_second": 0.00025,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini-1.5-pro-preview-0514": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_image": 0.001315,
|
||||
"input_cost_per_audio_per_second": 0.000125,
|
||||
"input_cost_per_video_per_second": 0.001315,
|
||||
"input_cost_per_token": 0.000005,
|
||||
"input_cost_per_character": 0.00000125,
|
||||
"input_cost_per_token_above_128k_tokens": 0.00001,
|
||||
"input_cost_per_character_above_128k_tokens": 0.0000025,
|
||||
"output_cost_per_token": 0.000015,
|
||||
"output_cost_per_character": 0.00000375,
|
||||
"output_cost_per_token_above_128k_tokens": 0.00003,
|
||||
"output_cost_per_character_above_128k_tokens": 0.0000075,
|
||||
"output_cost_per_image": 0.00263,
|
||||
"output_cost_per_video_per_second": 0.00263,
|
||||
"output_cost_per_audio_per_second": 0.00025,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini-1.5-pro-preview-0215": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_image": 0.001315,
|
||||
"input_cost_per_audio_per_second": 0.000125,
|
||||
"input_cost_per_video_per_second": 0.001315,
|
||||
"input_cost_per_token": 0.000005,
|
||||
"input_cost_per_character": 0.00000125,
|
||||
"input_cost_per_token_above_128k_tokens": 0.00001,
|
||||
"input_cost_per_character_above_128k_tokens": 0.0000025,
|
||||
"output_cost_per_token": 0.000015,
|
||||
"output_cost_per_character": 0.00000375,
|
||||
"output_cost_per_token_above_128k_tokens": 0.00003,
|
||||
"output_cost_per_character_above_128k_tokens": 0.0000075,
|
||||
"output_cost_per_image": 0.00263,
|
||||
"output_cost_per_video_per_second": 0.00263,
|
||||
"output_cost_per_audio_per_second": 0.00025,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini-1.5-pro-preview-0409": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_image": 0.001315,
|
||||
"input_cost_per_audio_per_second": 0.000125,
|
||||
"input_cost_per_video_per_second": 0.001315,
|
||||
"input_cost_per_token": 0.000005,
|
||||
"input_cost_per_character": 0.00000125,
|
||||
"input_cost_per_token_above_128k_tokens": 0.00001,
|
||||
"input_cost_per_character_above_128k_tokens": 0.0000025,
|
||||
"output_cost_per_token": 0.000015,
|
||||
"output_cost_per_character": 0.00000375,
|
||||
"output_cost_per_token_above_128k_tokens": 0.00003,
|
||||
"output_cost_per_character_above_128k_tokens": 0.0000075,
|
||||
"output_cost_per_image": 0.00263,
|
||||
"output_cost_per_video_per_second": 0.00263,
|
||||
"output_cost_per_audio_per_second": 0.00025,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini-1.5-flash": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"max_images_per_prompt": 3000,
|
||||
"max_videos_per_prompt": 10,
|
||||
"max_video_length": 1,
|
||||
"max_audio_length_hours": 8.4,
|
||||
"max_audio_per_prompt": 1,
|
||||
"max_pdf_size_mb": 30,
|
||||
"input_cost_per_image": 0.0001315,
|
||||
"input_cost_per_video_per_second": 0.0001315,
|
||||
"input_cost_per_audio_per_second": 0.000125,
|
||||
"input_cost_per_token": 0.0000005,
|
||||
"input_cost_per_character": 0.000000125,
|
||||
"input_cost_per_token_above_128k_tokens": 0.000001,
|
||||
"input_cost_per_character_above_128k_tokens": 0.00000025,
|
||||
"output_cost_per_token": 0.0000015,
|
||||
"output_cost_per_character": 0.000000375,
|
||||
"output_cost_per_token_above_128k_tokens": 0.000003,
|
||||
"output_cost_per_character_above_128k_tokens": 0.00000075,
|
||||
"output_cost_per_image": 0.000263,
|
||||
"output_cost_per_video_per_second": 0.000263,
|
||||
"output_cost_per_audio_per_second": 0.00025,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini-1.5-flash-001": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
|
@ -1217,10 +1412,23 @@
|
|||
"max_audio_length_hours": 8.4,
|
||||
"max_audio_per_prompt": 1,
|
||||
"max_pdf_size_mb": 30,
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"input_cost_per_image": 0.0001315,
|
||||
"input_cost_per_video_per_second": 0.0001315,
|
||||
"input_cost_per_audio_per_second": 0.000125,
|
||||
"input_cost_per_token": 0.0000005,
|
||||
"input_cost_per_character": 0.000000125,
|
||||
"input_cost_per_token_above_128k_tokens": 0.000001,
|
||||
"input_cost_per_character_above_128k_tokens": 0.00000025,
|
||||
"output_cost_per_token": 0.0000015,
|
||||
"output_cost_per_character": 0.000000375,
|
||||
"output_cost_per_token_above_128k_tokens": 0.000003,
|
||||
"output_cost_per_character_above_128k_tokens": 0.00000075,
|
||||
"output_cost_per_image": 0.000263,
|
||||
"output_cost_per_video_per_second": 0.000263,
|
||||
"output_cost_per_audio_per_second": 0.00025,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
|
@ -1235,62 +1443,27 @@
|
|||
"max_audio_length_hours": 8.4,
|
||||
"max_audio_per_prompt": 1,
|
||||
"max_pdf_size_mb": 30,
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"input_cost_per_image": 0.0001315,
|
||||
"input_cost_per_video_per_second": 0.0001315,
|
||||
"input_cost_per_audio_per_second": 0.000125,
|
||||
"input_cost_per_token": 0.0000005,
|
||||
"input_cost_per_character": 0.000000125,
|
||||
"input_cost_per_token_above_128k_tokens": 0.000001,
|
||||
"input_cost_per_character_above_128k_tokens": 0.00000025,
|
||||
"output_cost_per_token": 0.0000015,
|
||||
"output_cost_per_character": 0.000000375,
|
||||
"output_cost_per_token_above_128k_tokens": 0.000003,
|
||||
"output_cost_per_character_above_128k_tokens": 0.00000075,
|
||||
"output_cost_per_image": 0.000263,
|
||||
"output_cost_per_video_per_second": 0.000263,
|
||||
"output_cost_per_audio_per_second": 0.00025,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini-1.5-pro-001": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.000000625,
|
||||
"output_cost_per_token": 0.000001875,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini-1.5-pro-preview-0514": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.000000625,
|
||||
"output_cost_per_token": 0.000001875,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini-1.5-pro-preview-0215": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.000000625,
|
||||
"output_cost_per_token": 0.000001875,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini-1.5-pro-preview-0409": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.000000625,
|
||||
"output_cost_per_token": 0.000001875,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini-experimental": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
|
@ -1359,6 +1532,17 @@
|
|||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
},
|
||||
"vertex_ai/claude-3-5-sonnet@20240620": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 200000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.000003,
|
||||
"output_cost_per_token": 0.000015,
|
||||
"litellm_provider": "vertex_ai-anthropic_models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
},
|
||||
"vertex_ai/claude-3-haiku@20240307": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 200000,
|
||||
|
@ -1538,6 +1722,27 @@
|
|||
"mode": "completion",
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini/gemini-1.5-flash": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"max_images_per_prompt": 3000,
|
||||
"max_videos_per_prompt": 10,
|
||||
"max_video_length": 1,
|
||||
"max_audio_length_hours": 8.4,
|
||||
"max_audio_per_prompt": 1,
|
||||
"max_pdf_size_mb": 30,
|
||||
"input_cost_per_token": 0.00000035,
|
||||
"input_cost_per_token_above_128k_tokens": 0.0000007,
|
||||
"output_cost_per_token": 0.00000105,
|
||||
"output_cost_per_token_above_128k_tokens": 0.0000021,
|
||||
"litellm_provider": "gemini",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
},
|
||||
"gemini/gemini-1.5-flash-latest": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
|
@ -1547,11 +1752,14 @@
|
|||
"max_video_length": 1,
|
||||
"max_audio_length_hours": 8.4,
|
||||
"max_audio_per_prompt": 1,
|
||||
"max_pdf_size_mb": 30,
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"max_pdf_size_mb": 30,
|
||||
"input_cost_per_token": 0.00000035,
|
||||
"input_cost_per_token_above_128k_tokens": 0.0000007,
|
||||
"output_cost_per_token": 0.00000105,
|
||||
"output_cost_per_token_above_128k_tokens": 0.0000021,
|
||||
"litellm_provider": "gemini",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
|
@ -1560,8 +1768,10 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 32760,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.0,
|
||||
"output_cost_per_token": 0.0,
|
||||
"input_cost_per_token": 0.00000035,
|
||||
"input_cost_per_token_above_128k_tokens": 0.0000007,
|
||||
"output_cost_per_token": 0.00000105,
|
||||
"output_cost_per_token_above_128k_tokens": 0.0000021,
|
||||
"litellm_provider": "gemini",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
|
@ -1571,10 +1781,13 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"input_cost_per_token": 0.00000035,
|
||||
"input_cost_per_token_above_128k_tokens": 0.0000007,
|
||||
"output_cost_per_token": 0.00000105,
|
||||
"output_cost_per_token_above_128k_tokens": 0.0000021,
|
||||
"litellm_provider": "gemini",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true,
|
||||
|
@ -1584,10 +1797,13 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1048576,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"input_cost_per_token": 0.00000035,
|
||||
"input_cost_per_token_above_128k_tokens": 0.0000007,
|
||||
"output_cost_per_token": 0.00000105,
|
||||
"output_cost_per_token_above_128k_tokens": 0.0000021,
|
||||
"litellm_provider": "gemini",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true,
|
||||
|
@ -1597,8 +1813,10 @@
|
|||
"max_tokens": 2048,
|
||||
"max_input_tokens": 30720,
|
||||
"max_output_tokens": 2048,
|
||||
"input_cost_per_token": 0.0,
|
||||
"output_cost_per_token": 0.0,
|
||||
"input_cost_per_token": 0.00000035,
|
||||
"input_cost_per_token_above_128k_tokens": 0.0000007,
|
||||
"output_cost_per_token": 0.00000105,
|
||||
"output_cost_per_token_above_128k_tokens": 0.0000021,
|
||||
"litellm_provider": "gemini",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
|
@ -1796,6 +2014,15 @@
|
|||
"litellm_provider": "replicate",
|
||||
"mode": "chat"
|
||||
},
|
||||
"openrouter/deepseek/deepseek-coder": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 32000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00000014,
|
||||
"output_cost_per_token": 0.00000028,
|
||||
"litellm_provider": "openrouter",
|
||||
"mode": "chat"
|
||||
},
|
||||
"openrouter/microsoft/wizardlm-2-8x22b:nitro": {
|
||||
"max_tokens": 65536,
|
||||
"input_cost_per_token": 0.000001,
|
||||
|
@ -2349,6 +2576,17 @@
|
|||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
},
|
||||
"anthropic.claude-3-5-sonnet-20240620-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 200000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.000003,
|
||||
"output_cost_per_token": 0.000015,
|
||||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true
|
||||
},
|
||||
"anthropic.claude-3-haiku-20240307-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 200000,
|
||||
|
@ -3377,6 +3615,24 @@
|
|||
"litellm_provider": "deepinfra",
|
||||
"mode": "chat"
|
||||
},
|
||||
"deepinfra/meta-llama/Meta-Llama-3-8B-Instruct": {
|
||||
"max_tokens": 8191,
|
||||
"max_input_tokens": 8191,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00000008,
|
||||
"output_cost_per_token": 0.00000008,
|
||||
"litellm_provider": "deepinfra",
|
||||
"mode": "chat"
|
||||
},
|
||||
"deepinfra/meta-llama/Meta-Llama-3-70B-Instruct": {
|
||||
"max_tokens": 8191,
|
||||
"max_input_tokens": 8191,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00000059,
|
||||
"output_cost_per_token": 0.00000079,
|
||||
"litellm_provider": "deepinfra",
|
||||
"mode": "chat"
|
||||
},
|
||||
"deepinfra/01-ai/Yi-34B-200K": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 200000,
|
||||
|
|
File diff suppressed because one or more lines are too long
@ -1 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);
Some files were not shown because too many files have changed in this diff.