Merge branch 'BerriAI:main' into main

This commit is contained in:
samyxdev 2024-01-30 16:53:19 +01:00 committed by GitHub
commit fe9b511e45
135 changed files with 17797 additions and 1333 deletions


@@ -42,6 +42,7 @@ jobs:
pip install "anyio==3.7.1"
pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3"
pip install "apscheduler==3.10.4"
pip install "PyGithub==1.59.1"
- save_cache:
paths:
@@ -97,6 +98,43 @@ jobs:
command: |
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
- run:
name: Install Python 3.9
command: |
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda init bash
source ~/.bashrc
conda create -n myenv python=3.9 -y
conda activate myenv
python --version
- run:
name: Install Dependencies
command: |
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
pip install openai
python -m pip install --upgrade pip
python -m pip install -r .circleci/requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install mypy
pip install "google-generativeai>=0.3.2"
pip install "google-cloud-aiplatform>=1.38.0"
pip install "boto3>=1.28.57"
pip install langchain
pip install "langfuse>=2.0.0"
pip install numpydoc
pip install prisma
pip install "httpx==0.24.1"
pip install "gunicorn==21.2.0"
pip install "anyio==3.7.1"
pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
# Run pytest and generate JUnit XML report
- run:
name: Build Docker image
command: docker build -t my-app:latest -f Dockerfile.database .
@@ -106,15 +144,20 @@ jobs:
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
--e AZURE_API_KEY=$AZURE_FRANCE_API_KEY \
+-e AZURE_API_KEY=$AZURE_API_KEY \
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e AWS_REGION_NAME=$AWS_REGION_NAME \
--name my-app \
-v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
---num_workers 8
+--num_workers 8 \
--run_gunicorn \
--debug
- run: - run:
name: Install curl and dockerize name: Install curl and dockerize
command: | command: |
@@ -125,63 +168,22 @@ jobs:
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
- run:
name: Start outputting logs
-command: |
-while true; do
-docker logs my-app
-sleep 10
-done
+command: docker logs -f my-app
background: true
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 1m
- run:
-name: Test the application
+name: Run tests
command: |
-mkdir -p /tmp/responses
-for i in {1..10}; do
-status_file="/tmp/responses/status_${i}.txt"
-response_file="/tmp/responses/response_${i}.json"
-(curl --location --request POST 'http://0.0.0.0:4000/key/generate' \
---header 'Authorization: Bearer sk-1234' \
---header 'Content-Type: application/json' \
---data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}' \
---silent --output "${response_file}" --write-out '%{http_code}' > "${status_file}") &
-# Capture PIDs of background processes
-pids[${i}]=$!
-done
-# Wait for all background processes to finish
-for pid in ${pids[*]}; do
-wait $pid
-done
-# Check all responses and status codes
-fail=false
-for i in {1..10}; do
-status=$(cat "/tmp/responses/status_${i}.txt")
-# Here, we need to set the correct response file path for each iteration
-response_file="/tmp/responses/response_${i}.json" # This was missing in the provided script
-response=$(cat "${response_file}")
-echo "Response ${i} (Status code: ${status}):"
-echo "${response}" # Use echo here to print the contents
-echo # Additional newline for readability
-if [ "$status" -ne 200 ]; then
-echo "A request did not return a 200 status code: $status"
-fail=true
-fi
-done
-# If any request did not return status code 200, fail the job
-if [ "$fail" = true ]; then
-exit 1
-fi
-echo "All requests returned a 200 status code."
+pwd
+ls
+python -m pytest -vv tests/ -x --junitxml=test-results/junit.xml --durations=5
+no_output_timeout: 120m
+# Store test results
+- store_test_results:
+path: test-results
publish_to_pypi:
docker:


@@ -41,6 +41,7 @@ jobs:
push: true
file: Dockerfile.database
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
build-and-push-image:
runs-on: ubuntu-latest
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
@@ -74,7 +75,9 @@ jobs:
push: true
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-latest # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
labels: ${{ steps.meta.outputs.labels }}
+platform: local, linux/amd64,linux/arm64,linux/arm64/v8
-build-and-push-image-alpine:
+build-and-push-image-ui:
runs-on: ubuntu-latest
permissions:
contents: read
@@ -90,20 +93,21 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
-- name: Extract metadata (tags, labels) for Alpine Dockerfile
+- name: Extract metadata (tags, labels) for UI Dockerfile
-id: meta-alpine
+id: meta-ui
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with:
-images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-alpine
+images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-ui
-- name: Build and push Alpine Docker image
+- name: Build and push UI Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with:
-context: .
+context: ui/
-file: Dockerfile.alpine
+file: ui/Dockerfile
push: true
-tags: ${{ steps.meta-alpine.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-alpine.outputs.tags }}-latest
+tags: ${{ steps.meta-ui.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-ui.outputs.tags }}-latest
-labels: ${{ steps.meta-alpine.outputs.labels }}
+labels: ${{ steps.meta-ui.outputs.labels }}
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-image-database:
runs-on: ubuntu-latest
permissions:

.gitignore vendored

@@ -35,3 +35,8 @@ hosted_config.yaml
litellm/proxy/tests/node_modules
litellm/proxy/tests/package.json
litellm/proxy/tests/package-lock.json
ui/litellm-dashboard/.next
ui/litellm-dashboard/node_modules
ui/litellm-dashboard/next-env.d.ts
ui/litellm-dashboard/package.json
ui/litellm-dashboard/package-lock.json


@@ -52,4 +52,4 @@ RUN chmod +x entrypoint.sh
EXPOSE 4000/tcp
ENTRYPOINT ["litellm"]
-CMD ["--port", "4000"]
+CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug", "--run_gunicorn"]


@@ -56,4 +56,4 @@ EXPOSE 4000/tcp
# # Set your entrypoint and command
ENTRYPOINT ["litellm"]
-CMD ["--port", "4000"]
+CMD ["--port", "4000", "--run_gunicorn"]


@@ -0,0 +1,34 @@
import os
from openai import OpenAI
from dotenv import load_dotenv
import httpx
import concurrent.futures
load_dotenv()
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
)
def create_chat_completion():
return client.chat.completions.create(
messages=[
{
"role": "user",
"content": "Say this is a test. Respond in 20 lines",
}
],
model="gpt-3.5-turbo",
)
with concurrent.futures.ThreadPoolExecutor() as executor:
# Use a very short timeout so the call times out
future = executor.submit(create_chat_completion)
try:
chat_completion = future.result(timeout=0.00001)
print(chat_completion)
except concurrent.futures.TimeoutError:
print("Operation timed out.")


@@ -0,0 +1,61 @@
# Notes - on how to do sagemaker streaming using boto3
import json
import boto3
import sys, os
import traceback
from dotenv import load_dotenv
load_dotenv()
import os, io
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
import litellm
import io
import json
class TokenIterator:
def __init__(self, stream):
self.byte_iterator = iter(stream)
self.buffer = io.BytesIO()
self.read_pos = 0
def __iter__(self):
return self
def __next__(self):
while True:
self.buffer.seek(self.read_pos)
line = self.buffer.readline()
if line and line[-1] == ord("\n"):
self.read_pos += len(line) + 1
full_line = line[:-1].decode("utf-8")
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
return line_data["token"]["text"]
chunk = next(self.byte_iterator)
self.buffer.seek(0, io.SEEK_END)
self.buffer.write(chunk["PayloadPart"]["Bytes"])
payload = {
"inputs": "How do I build a website?",
"parameters": {"max_new_tokens": 256},
"stream": True,
}
import boto3
client = boto3.client("sagemaker-runtime", region_name="us-west-2")
response = client.invoke_endpoint_with_response_stream(
EndpointName="berri-benchmarking-Llama-2-70b-chat-hf-4",
Body=json.dumps(payload),
ContentType="application/json",
)
# for token in TokenIterator(response["Body"]):
# print(token)
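For orientation, here is a sketch of the litellm-level call these notes are working toward, using the `sagemaker/` provider prefix with streaming; treat the exact behaviour as an assumption rather than something this file verifies (endpoint name and prompt reused from the payload above):

```python
import litellm

# hedged sketch: the same endpoint, called through litellm instead of raw boto3
response = litellm.completion(
    model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
    messages=[{"role": "user", "content": "How do I build a website?"}],
    max_tokens=256,
    stream=True,
)
for chunk in response:
    print(chunk)  # OpenAI-style streaming chunks
```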


@@ -1,12 +0,0 @@
version: "3.9"
services:
litellm:
image: ghcr.io/berriai/litellm:main
ports:
- "8000:8000" # Map the container port to the host, change the host port if necessary
volumes:
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
# You can change the port or number of workers as per your requirements or pass any new supported CLI augument. Make sure the port passed here matches with the container port defined above in `ports` value
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
# ...rest of your docker-compose config if any

docker-compose.yml (new file)

@@ -0,0 +1,15 @@
version: "3.9"
services:
litellm:
image: ghcr.io/berriai/litellm:main-latest
volumes:
- ./proxy_server_config.yaml:/app/proxy_server_config.yaml # mount your litellm config.yaml
ports:
- "4000:4000"
environment:
- AZURE_API_KEY=sk-123
litellm-ui:
image: ghcr.io/berriai/litellm-ui:main-latest


@@ -204,6 +204,7 @@ def __init__(
s3_bucket_name: Optional[str] = None,
s3_region_name: Optional[str] = None,
s3_api_version: Optional[str] = None,
s3_path: Optional[str] = None, # if you wish to save to a specific path
s3_use_ssl: Optional[bool] = True,
s3_verify: Optional[Union[bool, str]] = None,
s3_endpoint_url: Optional[str] = None,


@@ -13,8 +13,8 @@ response = embedding(model='text-embedding-ada-002', input=["good morning from l
- `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`
-- `input`: *array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
+- `input`: *string or array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
-```
+```python
input=["good morning from litellm"]
```
@@ -22,7 +22,11 @@ input=["good morning from litellm"]
- `user`: *string (optional)* A unique identifier representing your end-user,
-- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
+- `dimensions`: *integer (Optional)* The number of dimensions the resulting output embeddings should have. Only supported in OpenAI/Azure text-embedding-3 and later models.
- `encoding_format`: *string (Optional)* The format to return the embeddings in. Can be either `"float"` or `"base64"`. Defaults to `encoding_format="float"`
- `timeout`: *integer (Optional)* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
- `api_base`: *string (optional)* - The api endpoint you want to call the model with
@@ -66,11 +70,18 @@ input=["good morning from litellm"]
from litellm import embedding
import os
os.environ['OPENAI_API_KEY'] = ""
-response = embedding('text-embedding-ada-002', input=["good morning from litellm"])
+response = embedding(
model="text-embedding-3-small",
input=["good morning from litellm", "this is another item"],
metadata={"anything": "good day"},
dimensions=5 # Only supported in text-embedding-3 and later models.
)
```
| Model Name | Function Call | Required OS Variables |
|----------------------|---------------------------------------------|--------------------------------------|
| text-embedding-3-small | `embedding('text-embedding-3-small', input)` | `os.environ['OPENAI_API_KEY']` |
| text-embedding-3-large | `embedding('text-embedding-3-large', input)` | `os.environ['OPENAI_API_KEY']` |
| text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` |
## Azure OpenAI Embedding Models


@@ -28,6 +28,8 @@ import litellm
import os
os.environ["LANGSMITH_API_KEY"] = ""
os.environ["LANGSMITH_PROJECT"] = "" # defaults to litellm-completion
os.environ["LANGSMITH_DEFAULT_RUN_NAME"] = "" # defaults to LLMRun
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
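For orientation, a minimal sketch of how these variables are typically consumed; the `langsmith` entry in `litellm.success_callback` is litellm's documented integration hook, but the snippet itself is illustrative and not part of this diff:

```python
import os
import litellm

os.environ["LANGSMITH_API_KEY"] = ""  # plus the optional project / run-name vars above
os.environ["OPENAI_API_KEY"] = ""

# send successful calls to LangSmith via the callback list
litellm.success_callback = ["langsmith"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi from litellm"}],
)
```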


@@ -6,7 +6,7 @@
# Gemini-Pro
## Sample Usage
```python
-import litellm
+from litellm import completion
import os
os.environ['GEMINI_API_KEY'] = ""


@@ -34,6 +34,7 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` |


@@ -5,7 +5,7 @@
## Sample Usage
```python
-import litellm
+from litellm import completion
import os
os.environ['PALM_API_KEY'] = ""
@@ -17,7 +17,7 @@ response = completion(
## Sample Usage - Streaming
```python
-import litellm
+from litellm import completion
import os
os.environ['PALM_API_KEY'] = ""


@@ -17,7 +17,28 @@ import litellm
litellm.vertex_project = "hardy-device-38811" # Your Project ID
litellm.vertex_location = "us-central1" # proj location
-response = completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
+response = litellm.completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
```
## OpenAI Proxy Usage
1. Modify the config.yaml
```yaml
litellm_settings:
vertex_project: "hardy-device-38811" # Your Project ID
vertex_location: "us-central1" # proj location
model_list:
- model_name: team1-gemini-pro
litellm_params:
model: gemini-pro
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
```
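A hypothetical test request against the proxy started above, using the OpenAI Python client pointed at the proxy; the base URL, port, and placeholder API key are assumptions, while the model name comes from the config sketch:

```python
import openai

# point the standard OpenAI client at the LiteLLM proxy started above (assumed default port)
client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000")

response = client.chat.completions.create(
    model="team1-gemini-pro",  # the model_name registered in config.yaml
    messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}],
)
print(response)
```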
## Set Vertex Project & Vertex Location


@@ -11,7 +11,7 @@ pip install litellm vllm
```python
import litellm
-response = completion(
+response = litellm.completion(
model="vllm/facebook/opt-125m", # add a vllm prefix so litellm knows the custom_llm_provider==vllm
messages=messages,
temperature=0.2,
@@ -29,7 +29,7 @@ In order to use litellm to call a hosted vllm server add the following to your c
```python
import litellm
-response = completion(
+response = litellm.completion(
model="openai/facebook/opt-125m", # pass the vllm model name
messages=messages,
api_base="https://hosted-vllm-api.co",


@@ -1,6 +1,13 @@
# Slack Alerting
-Get alerts for failed db read/writes, hanging api calls, failed api calls.
+Get alerts for:
- hanging LLM api calls
- failed LLM api calls
- slow LLM api calls
- budget Tracking per key/user:
- When a User/Key crosses their Budget
- When a User/Key is 15% away from crossing their Budget
- failed db read/writes
## Quick Start


@@ -1,4 +1,4 @@
-# Modify Incoming Data
+# Modify / Reject Incoming Requests
Modify data just before making litellm completion calls call on proxy


@@ -483,3 +483,55 @@ general_settings:
max_parallel_requests: 100 # max parallel requests for a user = 100
```
## All settings
```python
{
"environment_variables": {},
"model_list": [
{
"model_name": "string",
"litellm_params": {},
"model_info": {
"id": "string",
"mode": "embedding",
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"max_tokens": 2048,
"base_model": "gpt-4-1106-preview",
"additionalProp1": {}
}
}
],
"litellm_settings": {}, # ALL (https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py)
"general_settings": {
"completion_model": "string",
"key_management_system": "google_kms", # either google_kms or azure_kms
"master_key": "string",
"database_url": "string",
"database_type": "dynamo_db",
"database_args": {
"billing_mode": "PROVISIONED_THROUGHPUT",
"read_capacity_units": 0,
"write_capacity_units": 0,
"ssl_verify": true,
"region_name": "string",
"user_table_name": "LiteLLM_UserTable",
"key_table_name": "LiteLLM_VerificationToken",
"config_table_name": "LiteLLM_Config",
"spend_table_name": "LiteLLM_SpendLogs"
},
"otel": true,
"custom_auth": "string",
"max_parallel_requests": 0,
"infer_model_from_keys": true,
"background_health_checks": true,
"health_check_interval": 300,
"alerting": [
"string"
],
"alerting_threshold": 0
}
}
```


@@ -0,0 +1,115 @@
import Image from '@theme/IdealImage';
# Custom Pricing - Sagemaker, etc.
Use this to register custom pricing for models.
There are two ways to track cost:
- cost per token
- cost per second
By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async). [**Learn More**](../observability/custom_callback.md)
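A minimal sketch of reading that value in a success callback; it mirrors the `CustomLogger` example added to the router docs later in this diff, and the class name here is illustrative:

```python
import litellm
from litellm.integrations.custom_logger import CustomLogger

class CostTracker(CustomLogger):  # illustrative name
    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        # on success (sync + async), litellm attaches the computed cost to kwargs
        print("response_cost=", kwargs.get("response_cost"))

litellm.callbacks = [CostTracker()]
```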
## Quick Start
Register custom pricing for sagemaker completion model.
For cost per second pricing, you **just** need to register `input_cost_per_second`.
```python
# !pip install boto3
import os
from litellm import completion, completion_cost
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
def test_completion_sagemaker():
try:
print("testing sagemaker")
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
input_cost_per_second=0.000420,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
```
### Usage with OpenAI Proxy Server
**Step 1: Add pricing to config.yaml**
```yaml
model_list:
- model_name: sagemaker-completion-model
litellm_params:
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
input_cost_per_second: 0.000420
- model_name: sagemaker-embedding-model
litellm_params:
model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
input_cost_per_second: 0.000420
```
**Step 2: Start proxy**
```bash
litellm --config /path/to/config.yaml
```
**Step 3: View Spend Logs**
<Image img={require('../../img/spend_logs_table.png')} />
## Cost Per Token (e.g. Azure)
```python
# !pip install boto3
import os
from litellm import completion, completion_cost
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
def test_completion_azure_model():
try:
print("testing azure custom pricing")
# azure call
response = completion(
model = "azure/<your_deployment_name>",
messages = [{ "content": "Hello, how are you?","role": "user"}],
input_cost_per_token=0.005,
output_cost_per_token=1,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
test_completion_azure_model()
```
### Usage with OpenAI Proxy Server
```yaml
model_list:
- model_name: azure-model
litellm_params:
model: azure/<your_deployment_name>
api_key: os.environ/AZURE_API_KEY
api_base: os.environ/AZURE_API_BASE
api_version: os.environ/AZURE_API_VERSION
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
```


@@ -0,0 +1,34 @@
# Debugging
Two levels of debugging are supported.
- debug (prints info logs)
- detailed debug (prints debug logs)
## `debug`
**via cli**
```bash
$ litellm --debug
```
**via env**
```python
os.environ["LITELLM_LOG"] = "INFO"
```
## `detailed debug`
**via cli**
```bash
$ litellm --detailed_debug
```
**via env**
```python
os.environ["LITELLM_LOG"] = "DEBUG"
```


@@ -5,8 +5,10 @@ Use this to health check all LLMs defined in your config.yaml
The proxy exposes:
* a /health endpoint which returns the health of the LLM APIs
-* a /test endpoint which makes a ping to the litellm server
+* a /health/readiness endpoint for returning if the proxy is ready to accept requests
* a /health/liveliness endpoint for returning if the proxy is alive
## `/health`
#### Request
Make a GET Request to `/health` on the proxy
```shell
@@ -39,7 +41,7 @@ litellm --health
}
```
-## Background Health Checks
+### Background Health Checks
You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`.
@@ -61,7 +63,7 @@ $ litellm /path/to/config.yaml
curl --location 'http://0.0.0.0:8000/health'
```
-## Embedding Models
+### Embedding Models
We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check
@@ -77,7 +79,7 @@ model_list:
mode: embedding # 👈 ADD THIS
```
-## Text Completion Models
+### Text Completion Models
We need some way to know if the model is a text completion model when running checks, if you have this in your config, specifying mode it makes an embedding health check
@@ -92,3 +94,54 @@ model_list:
model_info:
mode: completion # 👈 ADD THIS
```
## `/health/readiness`
Unprotected endpoint for checking if proxy is ready to accept requests
Example Request:
```bash
curl --location 'http://0.0.0.0:8000/health/readiness'
```
Example Response:
*If proxy connected to a database*
```json
{
"status": "healthy",
"db": "connected",
"litellm_version":"1.19.2",
}
```
*If proxy not connected to a database*
```json
{
"status": "healthy",
"db": "Not connected",
"litellm_version":"1.19.2",
}
```
## `/health/liveliness`
Unprotected endpoint for checking if proxy is alive
Example Request:
```
curl -X 'GET' \
'http://0.0.0.0:8000/health/liveliness' \
-H 'accept: application/json'
```
Example Response:
```json
"I'm alive!"
```


@@ -1,5 +1,4 @@
# Multiple Instances of 1 model
# Load Balancing - Multiple Instances of 1 model
Load balance multiple instances of the same model
The proxy will handle routing requests (using LiteLLM's Router). **Set `rpm` in the config if you want maximize throughput**


@@ -1,8 +1,6 @@
import Image from '@theme/IdealImage';
-# [BETA] Self-serve UI
+# [BETA] Admin UI
Allow your users to create their own keys through a UI
:::info
@@ -10,26 +8,17 @@ This is in beta, so things may change. If you have feedback, [let us know](https
:::
Allow your users to create, view their own keys through a UI
<Image img={require('../../img/admin_ui_2.png')} />
## Quick Start
-Requirements:
+## 1. Changes to your config.yaml
-- Need to a SMTP server connection to send emails (e.g. [Resend](https://resend.com/docs/send-with-smtp))
+Set `allow_user_auth: true` on your config
[**See code**](https://github.com/BerriAI/litellm/blob/61cd800b9ffbb02c286481d2056b65c7fb5447bf/litellm/proxy/proxy_server.py#L1782)
### Step 1. Save SMTP server credentials
```env
export SMTP_HOST="my-smtp-host"
export SMTP_USERNAME="my-smtp-password"
export SMTP_PASSWORD="my-smtp-password"
export SMTP_SENDER_EMAIL="krrish@berri.ai"
```
### Step 2. Enable user auth
In your config.yaml,
```yaml
general_settings:
@@ -37,13 +26,36 @@ general_settings:
allow_user_auth: true
```
-This will enable:
+## 2. Setup Google SSO - Use this to Authenticate Team Members to the UI
-* Users to create keys via `/key/generate` (by default, only admin can create keys)
+- Create an Oauth 2.0 Client
-* The `/user/auth` endpoint to send user's emails with their login credentials (key + user id)
+<Image img={require('../../img/google_oauth2.png')} />
-### Step 3. Connect to UI
+- Navigate to Google `Credentials`
- Create a new Oauth client ID
- Set the `GOOGLE_CLIENT_ID` and `GOOGLE_CLIENT_SECRET` in your Proxy .env
- Set Redirect URL on your Oauth 2.0 Client
- Click on your Oauth 2.0 client on https://console.cloud.google.com/
- Set a redirect url = `<your proxy base url>/google-callback`
```
https://litellm-production-7002.up.railway.app/google-callback
```
<Image img={require('../../img/google_redirect.png')} />
## 3. Required env variables on your Proxy
-You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
+```shell
PROXY_BASE_URL="<your deployed proxy endpoint>" example PROXY_BASE_URL=https://litellm-production-7002.up.railway.app/
# for Google SSO Login
GOOGLE_CLIENT_ID=
GOOGLE_CLIENT_SECRET=
```
## 4. Use UI
👉 Get Started here: https://litellm-dashboard.vercel.app/
<!-- You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
If you self-host, you need to save the UI url in your proxy environment as `LITELLM_HOSTED_UI`.
@@ -63,3 +75,12 @@ Connect your proxy to your UI, by entering:
### Create Keys
<Image img={require('../../img/user_create_key_screen.png')} />
### Spend Per Key
<Image img={require('../../img/spend_per_api_key.png')} />
### Spend Per User
<Image img={require('../../img/spend_per_user.png')} /> -->


@@ -1,4 +1,7 @@
-# 💰 Budgets, Rate Limits per user
+import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 💰 Budgets, Rate Limits
Requirements:
@@ -6,17 +9,74 @@ Requirements:
## Set Budgets
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
-This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
+You can set budgets at 3 levels:
- For the proxy
- For a user
- For a key
<Tabs>
<TabItem value="proxy" label="For Proxy">
Apply a budget across all calls on the proxy
**Step 1. Modify config.yaml**
```yaml
general_settings:
master_key: sk-1234
litellm_settings:
# other litellm settings
max_budget: 0 # (float) sets max budget as $0 USD
budget_duration: 30d # (str) frequency of reset - You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
**Step 2. Start proxy**
```bash
litellm --config /path/to/config.yaml
```
**Step 3. Send test call**
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}'
```
</TabItem>
<TabItem value="per-user" label="For User">
Apply a budget across multiple keys.
LiteLLM exposes a `/user/new` endpoint to create budgets for this.
You can:
- Add budgets to users [**Jump**](#add-budgets-to-users)
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-users)
By default the `max_budget` is set to `null` and is not checked for keys
### **Add budgets to users**
```shell
curl --location 'http://localhost:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
**Sample Response**
@@ -29,18 +89,163 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
}
```
### **Add budget duration to users**
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
curl 'http://0.0.0.0:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_id": "core-infra", # [OPTIONAL]
"max_budget": 10,
"budget_duration": 10s,
}'
```
### Create new keys for existing user
Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai) and:
- **Budget Check**: krrish3@berri.ai's budget (i.e. $10) will be checked for this key
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
```
</TabItem>
<TabItem value="per-key" label="For Key">
Apply a budget on a key.
You can:
- Add budgets to keys [**Jump**](#add-budgets-to-keys)
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-keys)
**Expected Behaviour**
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
- After the key crosses its `max_budget`, requests fail
- If duration set, spend is reset at the end of the duration
By default the `max_budget` is set to `null` and is not checked for keys
### **Add budgets to keys**
```bash
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_id": "core-infra", # [OPTIONAL]
"max_budget": 10,
}'
```
Example Request to `/chat/completions` when key has crossed budget
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer <generated-key>' \
--data ' {
"model": "azure-gpt-3.5",
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
"messages": [
{
"role": "user",
"content": "respond in 50 lines"
}
],
}'
```
Expected Response from `/chat/completions` when key has crossed budget
```shell
{
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
}
```
### **Add budget duration to keys**
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_id": "core-infra", # [OPTIONAL]
"max_budget": 10,
"budget_duration": 10s,
}'
```
</TabItem>
</Tabs>
## Set Rate Limits
-Set max parallel requests a user can make, when you create user keys - `/key/generate`.
+You can set:
- max parallel requests
- tpm limits
- rpm limits
<Tabs>
<TabItem value="per-user" label="Per User">
Use `/user/new`, to persist rate limits across multiple keys.
```shell
curl --location 'http://0.0.0.0:8000/user/new' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
```
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
**Expected Response**
```json
{
"key": "sk-sA7VDkyhlQ7m8Gt77Mbt3Q",
"expires": "2024-01-19T01:21:12.816168",
"user_id": "krrish@berri.ai",
}
```
</TabItem>
<TabItem value="per-key" label="Per Key">
Use `/key/generate`, if you want them for just that key.
```shell
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
---data '{"duration": "20m", "max_parallel_requests": 1}' # 👈 max parallel requests = 1
+--data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
```
**Expected Response**
```json
{
"key": "sk-ulGNRXWtv7M0lFnnsQk0wQ",
"expires": "2024-01-18T20:48:44.297973",
"user_id": "78c2c8fc-c233-43b9-b0c3-eb931da27b84" // 👈 auto-generated
}
```
</TabItem>
</Tabs>
## Grant Access to new model
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.).


@@ -1,4 +1,4 @@
-# Key Management
+# Virtual Keys
Track Spend, Set budgets and create virtual keys for the proxy
Grant other's temporary access to your proxy, with keys that expire after a set duration.
@@ -12,7 +12,7 @@ Grant other's temporary access to your proxy, with keys that expire after a set
:::
-## Quick Start
+## Setup
Requirements:
@@ -58,36 +58,53 @@ litellm --config /path/to/config.yaml
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
---data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}'
+--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
```
- `models`: *list or null (optional)* - Specify the models a token has access too. If null, then token has access to all models on server.
-- `duration`: *str or null (optional)* Specify the length of time the token is valid for. If null, default is set to 1 hour. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
+## /key/generate
-- `metadata`: *dict or null (optional)* Pass metadata for the created token. If null defaults to {}
+### Request
```shell
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"duration": "20m",
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra",
"max_budget": 10,
}'
```
Expected response:
Request Params:
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
- `key_alias`: *Optional[str]* - User defined key alias
- `team_id`: *Optional[str]* - The team id of the user
- `models`: *Optional[list]* - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
### Response
```python
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
"expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
...
}
```
-## Keys that don't expire
+### Upgrade/Downgrade Models
Just set duration to None.
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
```
## Upgrade/Downgrade Models
If a user is expected to use a given model (i.e. gpt3-5), and you want to:
@@ -137,7 +154,7 @@ curl -X POST "https://0.0.0.0:8000/key/generate" \
- **How are routing between diff keys/api bases done?** litellm handles this by shuffling between different models in the model list with the same model_name. [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
-## Grant Access to new model
+### Grant Access to new model
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.)
@@ -165,6 +182,188 @@ curl --location 'http://localhost:8000/key/generate' \
"max_budget": 0,}'
```
## /key/info
### Request
```shell
curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
-H "Authorization: Bearer sk-1234"
```
Request Params:
- key: str - The key you want the info for
### Response
`token` is the hashed key (The DB stores the hashed key for security)
```json
{
"key": "sk-02Wr4IAlN3NvPXvL5JVvDA",
"info": {
"token": "80321a12d03412c527f2bd9db5fabd746abead2e1d50b435a534432fbaca9ef5",
"spend": 0.0,
"expires": "2024-01-18T23:52:09.125000+00:00",
"models": ["azure-gpt-3.5", "azure-embedding-model"],
"aliases": {},
"config": {},
"user_id": "ishaan2@berri.ai",
"team_id": "None",
"max_parallel_requests": null,
"metadata": {}
}
}
```
## /key/update
### Request
```shell
curl 'http://0.0.0.0:8000/key/update' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra"
}'
```
Request Params:
- key: str - The key that needs to be updated.
- models: list or null (optional) - Specify the models a token has access to. If null, then the token has access to all models on the server.
- metadata: dict or null (optional) - Pass metadata for the updated token. If null, defaults to an empty dictionary.
- team_id: str or null (optional) - Specify the team_id for the associated key.
### Response
```json
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"metadata": {
"user": "ishaan@berri.ai"
}
}
```
## /key/delete
### Request
```shell
curl 'http://0.0.0.0:8000/key/delete' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
}'
```
Request Params:
- keys: List[str] - List of keys to delete
### Response
```json
{
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
}
```
## Default /key/generate params
Use this, if you need to control the default `max_budget` or any `key/generate` param per key.
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
Set `litellm_settings:default_key_generate_params`:
```yaml
litellm_settings:
default_key_generate_params:
max_budget: 1.5000
models: ["azure-gpt-3.5"]
duration: # blank means `null`
metadata: {"setting":"default"}
team_id: "core-infra"
```
## Set Budgets - Per Key
Set `max_budget` in (USD $) param in the `key/generate` request. By default the `max_budget` is set to `null` and is not checked for keys
```shell
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra",
"max_budget": 10,
}'
```
#### Expected Behaviour
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
- After the key crosses its `max_budget`, requests fail
Example Request to `/chat/completions` when key has crossed budget
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
--data ' {
"model": "azure-gpt-3.5",
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
"messages": [
{
"role": "user",
"content": "respond in 50 lines"
}
],
}'
```
Expected Response from `/chat/completions` when key has crossed budget
```shell
{
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
}
```
## Set Budgets - Per User
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
```shell
curl --location 'http://localhost:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
**Sample Response**
```shell
{
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
"expires": "2023-12-22T09:53:13.861000Z",
"user_id": "krrish3@berri.ai",
"max_budget": 0.0
}
```
## Tracking Spend
You can get spend for a key by using the `/key/info` endpoint.
@@ -200,32 +399,6 @@ This is automatically updated (in USD) when calls are made to /completions, /cha
```
## Set Budgets
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
```shell
curl --location 'http://localhost:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
**Sample Response**
```shell
{
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
"expires": "2023-12-22T09:53:13.861000Z",
"user_id": "krrish3@berri.ai",
"max_budget": 0.0
}
```
## Custom Auth
You can now override the default api key auth.
@@ -275,6 +448,97 @@ general_settings:
$ litellm --config /path/to/config.yaml
```
## Custom /key/generate
If you need to add custom logic before generating a Proxy API Key (Example Validating `team_id`)
### 1. Write a custom `custom_generate_key_fn`
The input to the custom_generate_key_fn function is a single parameter: `data` [(Type: GenerateKeyRequest)](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/_types.py#L125)
The output of your `custom_generate_key_fn` should be a dictionary with the following structure
```python
{
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
```
- decision (Type: bool): A boolean value indicating whether the key generation is allowed (True) or not (False).
- message (Type: str, Optional): An optional message providing additional information about the decision. This field is included when the decision is False.
```python
async def custom_generate_key_fn(data: GenerateKeyRequest)-> dict:
"""
Asynchronous function for generating a key based on the input data.
Args:
data (GenerateKeyRequest): The input data for key generation.
Returns:
dict: A dictionary containing the decision and an optional message.
{
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
"""
# decide if a key should be generated or not
print("using custom auth function!")
data_json = data.json() # type: ignore
# Unpacking variables
team_id = data_json.get("team_id")
duration = data_json.get("duration")
models = data_json.get("models")
aliases = data_json.get("aliases")
config = data_json.get("config")
spend = data_json.get("spend")
user_id = data_json.get("user_id")
max_parallel_requests = data_json.get("max_parallel_requests")
metadata = data_json.get("metadata")
tpm_limit = data_json.get("tpm_limit")
rpm_limit = data_json.get("rpm_limit")
if team_id is not None and team_id == "litellm-core-infra@gmail.com":
# only team_id="litellm-core-infra@gmail.com" can make keys
return {
"decision": True,
}
else:
print("Failed custom auth")
return {
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
```
### 2. Pass the filepath (relative to the config.yaml)
Pass the filepath to the config.yaml
e.g. if they're both in the same dir - `./config.yaml` and `./custom_auth.py`, this is what it looks like:
```yaml
model_list:
- model_name: "openai-model"
litellm_params:
model: "gpt-3.5-turbo"
litellm_settings:
drop_params: True
set_verbose: True
general_settings:
custom_key_generate: custom_auth.custom_generate_key_fn
```
## [BETA] Dynamo DB ## [BETA] Dynamo DB


@@ -302,6 +302,7 @@ asyncio.run(router_acompletion())
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
**Global Timeouts**
```python
from litellm import Router
@@ -313,6 +314,36 @@ router = Router(model_list=model_list,
print(response)
```
**Timeouts per model**
```python
from litellm import Router
import asyncio
model_list = [{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"timeout": 300 # sets a 5 minute timeout
"stream_timeout": 30 # sets a 30s timeout for streaming calls
}
}]
# init router
router = Router(model_list=model_list, routing_strategy="least-busy")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
return response
asyncio.run(router_acompletion())
```
### Cooldowns
Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute.
@@ -574,6 +605,49 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
print(f"response: {response}")
```
## Custom Callbacks - Track API Key, API Endpoint, Model Used
If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback)
### Usage
```python
import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger
class MyCustomHandler(CustomLogger):
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Success")
print("kwargs=", kwargs)
litellm_params= kwargs.get("litellm_params")
api_key = litellm_params.get("api_key")
api_base = litellm_params.get("api_base")
custom_llm_provider= litellm_params.get("custom_llm_provider")
response_cost = kwargs.get("response_cost")
# print the values
print("api_key=", api_key)
print("api_base=", api_base)
print("custom_llm_provider=", custom_llm_provider)
print("response_cost=", response_cost)
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Failure")
print("kwargs=")
customHandler = MyCustomHandler()
litellm.callbacks = [customHandler]
# Init Router
router = Router(model_list=model_list, routing_strategy="simple-shuffle")
# router completion call
response = router.completion(
model="gpt-3.5-turbo",
messages=[{ "role": "user", "content": "Hi who are you"}]
)
```
## Deploy Router
@@ -602,17 +676,63 @@ def __init__(
num_retries: int = 0,
timeout: Optional[float] = None,
default_litellm_params={}, # default params for Router.chat.completion.create
set_verbose: bool = False,
fallbacks: List = [],
-allowed_fails: Optional[int] = None,
+allowed_fails: Optional[int] = None, # Number of times a deployment can fail before being added to cooldown
cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
context_window_fallbacks: List = [],
model_group_alias: Optional[dict] = {},
-retry_after: int = 0, # min time to wait before retrying a failed request
+retry_after: int = 0, # (min) time to wait before retrying a failed request
routing_strategy: Literal[
"simple-shuffle",
"least-busy",
"usage-based-routing",
"latency-based-routing",
] = "simple-shuffle",
## DEBUGGING ##
set_verbose: bool = False, # set this to True for seeing logs
debug_level: Literal["DEBUG", "INFO"] = "INFO", # set this to "DEBUG" for detailed debugging
):
```
## Debugging Router
### Basic Debugging
Set `Router(set_verbose=True)`
```python
from litellm import Router
router = Router(
model_list=model_list,
set_verbose=True
)
```
### Detailed Debugging
Set `Router(set_verbose=True,debug_level="DEBUG")`
```python
from litellm import Router
router = Router(
model_list=model_list,
set_verbose=True,
debug_level="DEBUG" # defaults to INFO
)
```
### Very Detailed Debugging
Set `litellm.set_verbose=True` and `Router(set_verbose=True,debug_level="DEBUG")`
```python
from litellm import Router
import litellm
litellm.set_verbose = True
router = Router(
    model_list=model_list,
    set_verbose=True,
    debug_level="DEBUG"  # defaults to INFO
)
```

Binary files not shown: six images added (159 KiB, 351 KiB, 297 KiB, 189 KiB, 468 KiB, 249 KiB).

View file

@ -104,24 +104,49 @@ const sidebars = {
items: [ items: [
"proxy/quick_start", "proxy/quick_start",
"proxy/configs", "proxy/configs",
{
type: 'link',
label: '📖 All Endpoints',
href: 'https://litellm-api.up.railway.app/',
},
"proxy/user_keys", "proxy/user_keys",
"proxy/load_balancing",
"proxy/virtual_keys", "proxy/virtual_keys",
"proxy/users", "proxy/users",
"proxy/ui", "proxy/ui",
"proxy/model_management", "proxy/model_management",
"proxy/reliability",
"proxy/caching",
"proxy/logging",
"proxy/health", "proxy/health",
"proxy/call_hooks", "proxy/debugging",
"proxy/rules", {
"proxy/alerting", "type": "category",
"proxy/streaming_logging", "label": "🔥 Load Balancing",
"items": [
"proxy/load_balancing",
"proxy/reliability",
]
},
{
"type": "category",
"label": "Logging, Alerting, Caching",
"items": [
"proxy/logging",
"proxy/alerting",
"proxy/streaming_logging",
"proxy/caching",
]
},
{
"type": "category",
"label": "Admin Controls",
"items": [
"proxy/call_hooks",
"proxy/rules",
]
},
"proxy/deploy", "proxy/deploy",
"proxy/cli", "proxy/cli",
] ]
}, },
"proxy/custom_pricing",
"routing", "routing",
"rules", "rules",
"set_keys", "set_keys",

View file

@ -2,10 +2,14 @@
import threading, requests import threading, requests
from typing import Callable, List, Optional, Dict, Union, Any from typing import Callable, List, Optional, Dict, Union, Any
from litellm.caching import Cache from litellm.caching import Cache
from litellm._logging import set_verbose from litellm._logging import set_verbose, _turn_on_debug
from litellm.proxy._types import KeyManagementSystem from litellm.proxy._types import KeyManagementSystem
import httpx import httpx
#############################################
if set_verbose == True:
_turn_on_debug()
#############################################
input_callback: List[Union[str, Callable]] = [] input_callback: List[Union[str, Callable]] = []
success_callback: List[Union[str, Callable]] = [] success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = [] failure_callback: List[Union[str, Callable]] = []
@ -58,6 +62,9 @@ cache: Optional[
model_alias_map: Dict[str, str] = {} model_alias_map: Dict[str, str] = {}
model_group_alias_map: Dict[str, str] = {} model_group_alias_map: Dict[str, str] = {}
max_budget: float = 0.0 # set the max budget across all providers max_budget: float = 0.0 # set the max budget across all providers
budget_duration: Optional[
str
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
_openai_completion_params = [ _openai_completion_params = [
"functions", "functions",
"function_call", "function_call",
@ -136,6 +143,7 @@ model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/mai
suppress_debug_info = False suppress_debug_info = False
dynamodb_table_name: Optional[str] = None dynamodb_table_name: Optional[str] = None
s3_callback_params: Optional[Dict] = None s3_callback_params: Optional[Dict] = None
default_key_generate_params: Optional[Dict] = None
#### RELIABILITY #### #### RELIABILITY ####
request_timeout: Optional[float] = 6000 request_timeout: Optional[float] = 6000
num_retries: Optional[int] = None # per model endpoint num_retries: Optional[int] = None # per model endpoint
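As a hedged sketch of the new module-level settings above (the keys accepted by `default_key_generate_params` are an assumption, mirroring the proxy key-generate fields later in this diff):

```python
import litellm

litellm.max_budget = 10.0        # max budget (USD) across all providers
litellm.budget_duration = "30d"  # proxy only - resets the budget every 30 days ("30s", "30m", "30h", "30d")
litellm.default_key_generate_params = {
    "max_budget": 5.0,  # assumed keys, mirroring GenerateKeyRequest
    "duration": "30d",
}
```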

View file

@ -7,20 +7,14 @@ handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG) handler.setLevel(logging.DEBUG)
# Create a formatter and set it for the handler # Create a formatter and set it for the handler
formatter = logging.Formatter(
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(message)s",
datefmt="%H:%M:%S",
)
formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
handler.setFormatter(formatter) handler.setFormatter(formatter)
def print_verbose(print_statement):
try:
if set_verbose:
print(print_statement) # noqa
except:
pass
verbose_proxy_logger = logging.getLogger("LiteLLM Proxy") verbose_proxy_logger = logging.getLogger("LiteLLM Proxy")
verbose_router_logger = logging.getLogger("LiteLLM Router") verbose_router_logger = logging.getLogger("LiteLLM Router")
verbose_logger = logging.getLogger("LiteLLM") verbose_logger = logging.getLogger("LiteLLM")
@ -28,3 +22,18 @@ verbose_logger = logging.getLogger("LiteLLM")
# Add the handler to the logger # Add the handler to the logger
verbose_router_logger.addHandler(handler) verbose_router_logger.addHandler(handler)
verbose_proxy_logger.addHandler(handler) verbose_proxy_logger.addHandler(handler)
verbose_logger.addHandler(handler)
def _turn_on_debug():
verbose_logger.setLevel(level=logging.DEBUG) # set package log to debug
verbose_router_logger.setLevel(level=logging.DEBUG) # set router logs to debug
verbose_proxy_logger.setLevel(level=logging.DEBUG) # set proxy logs to debug
def print_verbose(print_statement):
try:
if set_verbose:
print(print_statement) # noqa
except:
pass

View file

@ -1,3 +1,12 @@
# +-----------------------------------------------+
# | |
# | NOT PROXY BUDGET MANAGER |
# | proxy budget manager is in proxy_server.py |
# | |
# +-----------------------------------------------+
#
# Thank you users! We ❤️ you! - Krrish & Ishaan
import os, json, time import os, json, time
import litellm import litellm
from litellm.utils import ModelResponse from litellm.utils import ModelResponse
@ -11,10 +20,12 @@ class BudgetManager:
project_name: str, project_name: str,
client_type: str = "local", client_type: str = "local",
api_base: Optional[str] = None, api_base: Optional[str] = None,
headers: Optional[dict] = None,
): ):
self.client_type = client_type self.client_type = client_type
self.project_name = project_name self.project_name = project_name
self.api_base = api_base or "https://api.litellm.ai" self.api_base = api_base or "https://api.litellm.ai"
self.headers = headers or {"Content-Type": "application/json"}
## load the data or init the initial dictionaries ## load the data or init the initial dictionaries
self.load_data() self.load_data()
@ -43,7 +54,7 @@ class BudgetManager:
url = self.api_base + "/get_budget" url = self.api_base + "/get_budget"
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
data = {"project_name": self.project_name} data = {"project_name": self.project_name}
response = requests.post(url, headers=headers, json=data) response = requests.post(url, headers=self.headers, json=data)
response = response.json() response = response.json()
if response["status"] == "error": if response["status"] == "error":
self.user_dict = ( self.user_dict = (
@ -201,6 +212,6 @@ class BudgetManager:
url = self.api_base + "/set_budget" url = self.api_base + "/set_budget"
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
data = {"project_name": self.project_name, "user_dict": self.user_dict} data = {"project_name": self.project_name, "user_dict": self.user_dict}
response = requests.post(url, headers=headers, json=data) response = requests.post(url, headers=self.headers, json=data)
response = response.json() response = response.json()
return response return response
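A hedged sketch of the new `headers` parameter (the auth header value is an assumption; `client_type="hosted"` and the `api_base` default come from the constructor above):

```python
from litellm import BudgetManager

budget_manager = BudgetManager(
    project_name="my_project",
    client_type="hosted",               # "local" keeps budgets on disk instead
    api_base="https://api.litellm.ai",  # default shown above
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer sk-...",  # assumed auth header for a self-hosted budget service
    },
)
budget_manager.create_budget(total_budget=10, user="user-123")
```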

View file

@ -12,10 +12,12 @@ import time, logging
import json, traceback, ast, hashlib import json, traceback, ast, hashlib
from typing import Optional, Literal, List, Union, Any from typing import Optional, Literal, List, Union, Any
from openai._models import BaseModel as OpenAIObject from openai._models import BaseModel as OpenAIObject
from litellm._logging import verbose_logger
def print_verbose(print_statement): def print_verbose(print_statement):
try: try:
verbose_logger.debug(print_statement)
if litellm.set_verbose: if litellm.set_verbose:
print(print_statement) # noqa print(print_statement) # noqa
except: except:
@ -129,11 +131,13 @@ class S3Cache(BaseCache):
s3_aws_secret_access_key=None, s3_aws_secret_access_key=None,
s3_aws_session_token=None, s3_aws_session_token=None,
s3_config=None, s3_config=None,
s3_path=None,
**kwargs, **kwargs,
): ):
import boto3 import boto3
self.bucket_name = s3_bucket_name self.bucket_name = s3_bucket_name
self.key_prefix = s3_path.rstrip("/") + "/" if s3_path else ""
# Create an S3 client with custom endpoint URL # Create an S3 client with custom endpoint URL
self.s3_client = boto3.client( self.s3_client = boto3.client(
"s3", "s3",
@ -155,6 +159,8 @@ class S3Cache(BaseCache):
ttl = kwargs.get("ttl", None) ttl = kwargs.get("ttl", None)
# Convert value to JSON before storing in S3 # Convert value to JSON before storing in S3
serialized_value = json.dumps(value) serialized_value = json.dumps(value)
key = self.key_prefix + key
if ttl is not None: if ttl is not None:
cache_control = f"immutable, max-age={ttl}, s-maxage={ttl}" cache_control = f"immutable, max-age={ttl}, s-maxage={ttl}"
import datetime import datetime
@ -171,7 +177,7 @@ class S3Cache(BaseCache):
CacheControl=cache_control, CacheControl=cache_control,
ContentType="application/json", ContentType="application/json",
ContentLanguage="en", ContentLanguage="en",
ContentDisposition=f"inline; filename=\"{key}.json\"" ContentDisposition=f'inline; filename="{key}.json"',
) )
else: else:
cache_control = "immutable, max-age=31536000, s-maxage=31536000" cache_control = "immutable, max-age=31536000, s-maxage=31536000"
@ -183,7 +189,7 @@ class S3Cache(BaseCache):
CacheControl=cache_control, CacheControl=cache_control,
ContentType="application/json", ContentType="application/json",
ContentLanguage="en", ContentLanguage="en",
ContentDisposition=f"inline; filename=\"{key}.json\"" ContentDisposition=f'inline; filename="{key}.json"',
) )
except Exception as e: except Exception as e:
# NON blocking - notify users S3 is throwing an exception # NON blocking - notify users S3 is throwing an exception
@ -193,6 +199,8 @@ class S3Cache(BaseCache):
import boto3, botocore import boto3, botocore
try: try:
key = self.key_prefix + key
print_verbose(f"Get S3 Cache: key: {key}") print_verbose(f"Get S3 Cache: key: {key}")
# Download the data from S3 # Download the data from S3
cached_response = self.s3_client.get_object( cached_response = self.s3_client.get_object(
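A hedged sketch of the new `s3_path` option (bucket name and region are placeholders; the kwargs are forwarded to `S3Cache` as shown above):

```python
import litellm
from litellm.caching import Cache

litellm.cache = Cache(
    type="s3",
    s3_bucket_name="my-litellm-cache",  # placeholder bucket
    s3_region_name="us-east-1",
    s3_path="litellm-cache",            # objects are stored under "litellm-cache/<cache-key>"
)
```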

View file

@ -8,6 +8,7 @@ from datetime import datetime
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv
import traceback import traceback
from packaging.version import Version from packaging.version import Version
from litellm._logging import verbose_logger
class LangFuseLogger: class LangFuseLogger:
@ -93,6 +94,7 @@ class LangFuseLogger:
print_verbose( print_verbose(
f"Langfuse Layer Logging - final response object: {response_obj}" f"Langfuse Layer Logging - final response object: {response_obj}"
) )
verbose_logger.info(f"Langfuse Layer Logging - logging success")
except: except:
traceback.print_exc() traceback.print_exc()
print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}") print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}")
@ -181,6 +183,8 @@ class LangFuseLogger:
if supports_tags: if supports_tags:
for key, value in metadata.items(): for key, value in metadata.items():
tags.append(f"{key}:{value}") tags.append(f"{key}:{value}")
if "cache_hit" in kwargs:
tags.append(f"cache_hit:{kwargs['cache_hit']}")
trace_params.update({"tags": tags}) trace_params.update({"tags": tags})
trace = self.Langfuse.trace(**trace_params) trace = self.Langfuse.trace(**trace_params)
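A hedged sketch of how the new `cache_hit` tag surfaces (assumes `LANGFUSE_*` and provider API keys are set in the environment):

```python
import litellm
from litellm.caching import Cache

litellm.cache = Cache()                  # in-memory cache
litellm.success_callback = ["langfuse"]

for _ in range(2):
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
    )
# the second trace should carry the tag "cache_hit:True"
```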

View file

@ -13,19 +13,22 @@ class LangsmithLogger:
# Class variables or attributes # Class variables or attributes
def __init__(self): def __init__(self):
self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY") self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
self.langsmith_project = os.getenv("LANGSMITH_PROJECT", "litellm-completion")
self.langsmith_default_run_name = os.getenv(
"LANGSMITH_DEFAULT_RUN_NAME", "LLMRun"
)
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose): def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
# Method definition # Method definition
# inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb # inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb
metadata = {} metadata = kwargs.get('litellm_params', {}).get("metadata", {}) or {} # if metadata is None
if "litellm_params" in kwargs:
metadata = kwargs["litellm_params"].get("metadata", {})
# set project name and run_name for langsmith logging # set project name and run_name for langsmith logging
# users can pass project_name and run name to litellm.completion() # users can pass project_name and run name to litellm.completion()
# Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"}) # Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"})
# if not set litellm will use default project_name = litellm-completion, run_name = LLMRun # if not set litellm will fall back to the environment variable LANGSMITH_PROJECT, then to the default project_name = litellm-completion, run_name = LLMRun
project_name = metadata.get("project_name", "litellm-completion") project_name = metadata.get("project_name", self.langsmith_project)
run_name = metadata.get("run_name", "LLMRun") run_name = metadata.get("run_name", self.langsmith_default_run_name)
print_verbose( print_verbose(
f"Langsmith Logging - project_name: {project_name}, run_name {run_name}" f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
) )
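A hedged sketch of the new environment-variable fallbacks (assumes `LANGSMITH_API_KEY` and a provider key are set):

```python
import os
import litellm

os.environ["LANGSMITH_PROJECT"] = "my-default-project"  # used when metadata has no project_name
os.environ["LANGSMITH_DEFAULT_RUN_NAME"] = "MyRun"       # used when metadata has no run_name

litellm.success_callback = ["langsmith"]
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"},  # overrides the env vars
)
```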

View file

@ -8,7 +8,7 @@ dotenv.load_dotenv() # Loading env variables using dotenv
import traceback import traceback
import datetime, subprocess, sys import datetime, subprocess, sys
import litellm, uuid import litellm, uuid
from litellm._logging import print_verbose from litellm._logging import print_verbose, verbose_logger
class S3Logger: class S3Logger:
@ -16,6 +16,7 @@ class S3Logger:
def __init__( def __init__(
self, self,
s3_bucket_name=None, s3_bucket_name=None,
s3_path=None,
s3_region_name=None, s3_region_name=None,
s3_api_version=None, s3_api_version=None,
s3_use_ssl=True, s3_use_ssl=True,
@ -30,7 +31,9 @@ class S3Logger:
import boto3 import boto3
try: try:
print_verbose("in init s3 logger") verbose_logger.debug(
f"in init s3 logger - s3_callback_params {litellm.s3_callback_params}"
)
if litellm.s3_callback_params is not None: if litellm.s3_callback_params is not None:
# read in .env variables - example os.environ/AWS_BUCKET_NAME # read in .env variables - example os.environ/AWS_BUCKET_NAME
@ -41,7 +44,7 @@ class S3Logger:
s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name") s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name")
s3_region_name = litellm.s3_callback_params.get("s3_region_name") s3_region_name = litellm.s3_callback_params.get("s3_region_name")
s3_api_version = litellm.s3_callback_params.get("s3_api_version") s3_api_version = litellm.s3_callback_params.get("s3_api_version")
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl") s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl", True)
s3_verify = litellm.s3_callback_params.get("s3_verify") s3_verify = litellm.s3_callback_params.get("s3_verify")
s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url") s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url")
s3_aws_access_key_id = litellm.s3_callback_params.get( s3_aws_access_key_id = litellm.s3_callback_params.get(
@ -57,6 +60,8 @@ class S3Logger:
# done reading litellm.s3_callback_params # done reading litellm.s3_callback_params
self.bucket_name = s3_bucket_name self.bucket_name = s3_bucket_name
self.s3_path = s3_path
verbose_logger.debug(f"s3 logger using endpoint url {s3_endpoint_url}")
# Create an S3 client with custom endpoint URL # Create an S3 client with custom endpoint URL
self.s3_client = boto3.client( self.s3_client = boto3.client(
"s3", "s3",
@ -82,7 +87,9 @@ class S3Logger:
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose): def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
try: try:
print_verbose(f"s3 Logging - Enters logging function for model {kwargs}") verbose_logger.debug(
f"s3 Logging - Enters logging function for model {kwargs}"
)
# construct payload to send to s3 # construct payload to send to s3
# follows the same params as langfuse.py # follows the same params as langfuse.py
@ -122,8 +129,12 @@ class S3Logger:
pass pass
s3_object_key = ( s3_object_key = (
payload["id"] + "-time=" + str(start_time) (self.s3_path.rstrip("/") + "/" if self.s3_path else "")
+ payload["id"]
+ "-time="
+ str(start_time)
) # we need the s3 key to include the time, so we log cache hits too ) # we need the s3 key to include the time, so we log cache hits too
s3_object_key += ".json"
import json import json
@ -146,5 +157,5 @@ class S3Logger:
return response return response
except Exception as e: except Exception as e:
traceback.print_exc() traceback.print_exc()
print_verbose(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}") verbose_logger.debug(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
pass pass
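A hedged sketch of the new `s3_path` prefix for the s3 logger (bucket and region are placeholders; the keys mirror the `litellm.s3_callback_params` reads above):

```python
import litellm

litellm.s3_callback_params = {
    "s3_bucket_name": "my-litellm-logs",  # placeholder bucket
    "s3_region_name": "us-east-1",
    "s3_use_ssl": True,
    "s3_path": "prod-logs",               # objects land at "prod-logs/<id>-time=<start>.json"
}
litellm.success_callback = ["s3"]
```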

View file

@ -78,7 +78,7 @@ class AnthropicConfig:
# makes headers for API call # makes headers for API call
def validate_environment(api_key): def validate_environment(api_key, user_headers):
if api_key is None: if api_key is None:
raise ValueError( raise ValueError(
"Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params" "Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params"
@ -89,6 +89,8 @@ def validate_environment(api_key):
"content-type": "application/json", "content-type": "application/json",
"x-api-key": api_key, "x-api-key": api_key,
} }
if user_headers is not None and isinstance(user_headers, dict):
headers = {**headers, **user_headers}
return headers return headers
@ -105,8 +107,9 @@ def completion(
optional_params=None, optional_params=None,
litellm_params=None, litellm_params=None,
logger_fn=None, logger_fn=None,
headers={},
): ):
headers = validate_environment(api_key) headers = validate_environment(api_key, headers)
if model in custom_prompt_dict: if model in custom_prompt_dict:
# check if the model has a registered custom prompt # check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model] model_prompt_details = custom_prompt_dict[model]
@ -139,7 +142,11 @@ def completion(
logging_obj.pre_call( logging_obj.pre_call(
input=prompt, input=prompt,
api_key=api_key, api_key=api_key,
additional_args={"complete_input_dict": data, "api_base": api_base}, additional_args={
"complete_input_dict": data,
"api_base": api_base,
"headers": headers,
},
) )
## COMPLETION CALL ## COMPLETION CALL
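A hedged sketch of passing extra headers through to Anthropic (the `anthropic-beta` value is only an example; user headers are merged into the defaults built in `validate_environment`):

```python
import litellm

response = litellm.completion(
    model="claude-2",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    headers={"anthropic-beta": "messages-2023-12-15"},  # example extra header, merged with x-api-key etc.
)
```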

View file

@ -629,12 +629,23 @@ class AzureChatCompletion(BaseLLM):
client_session = litellm.aclient_session or httpx.AsyncClient( client_session = litellm.aclient_session or httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(),
) )
openai_aclient = AsyncAzureOpenAI( azure_client = AsyncAzureOpenAI(
http_client=client_session, **azure_client_params http_client=client_session, **azure_client_params
) )
else: else:
openai_aclient = client azure_client = client
response = await openai_aclient.images.generate(**data, timeout=timeout) ## LOGGING
logging_obj.pre_call(
input=data["prompt"],
api_key=azure_client.api_key,
additional_args={
"headers": {"api_key": azure_client.api_key},
"api_base": azure_client._base_url._uri_reference,
"acompletion": True,
"complete_input_dict": data,
},
)
response = await azure_client.images.generate(**data, timeout=timeout)
stringified_response = response.model_dump() stringified_response = response.model_dump()
## LOGGING ## LOGGING
logging_obj.post_call( logging_obj.post_call(
@ -719,7 +730,7 @@ class AzureChatCompletion(BaseLLM):
input=prompt, input=prompt,
api_key=azure_client.api_key, api_key=azure_client.api_key,
additional_args={ additional_args={
"headers": {"Authorization": f"Bearer {azure_client.api_key}"}, "headers": {"api_key": azure_client.api_key},
"api_base": azure_client._base_url._uri_reference, "api_base": azure_client._base_url._uri_reference,
"acompletion": False, "acompletion": False,
"complete_input_dict": data, "complete_input_dict": data,

View file

@ -659,9 +659,16 @@ def completion(
) )
## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here. ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
prompt_tokens = len(encoding.encode(prompt)) prompt_tokens = response_metadata.get(
completion_tokens = len( "x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
encoding.encode(model_response["choices"][0]["message"].get("content", "")) )
completion_tokens = response_metadata.get(
"x-amzn-bedrock-output-token-count",
len(
encoding.encode(
model_response["choices"][0]["message"].get("content", "")
)
),
) )
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
@ -672,6 +679,8 @@ def completion(
total_tokens=prompt_tokens + completion_tokens, total_tokens=prompt_tokens + completion_tokens,
) )
model_response.usage = usage model_response.usage = usage
model_response._hidden_params["region_name"] = client.meta.region_name
print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
return model_response return model_response
except BedrockError as e: except BedrockError as e:
exception_mapping_worked = True exception_mapping_worked = True

View file

@ -43,7 +43,7 @@ class AsyncCustomHTTPTransport(httpx.AsyncHTTPTransport):
request=request, request=request,
) )
time.sleep(int(response.headers.get("retry-after")) or 10) await asyncio.sleep(int(response.headers.get("retry-after") or 10))
response = await super().handle_async_request(request) response = await super().handle_async_request(request)
await response.aread() await response.aread()
@ -95,7 +95,6 @@ class CustomHTTPTransport(httpx.HTTPTransport):
request.method = "GET" request.method = "GET"
response = super().handle_request(request) response = super().handle_request(request)
response.read() response.read()
timeout_secs: int = 120 timeout_secs: int = 120
start_time = time.time() start_time = time.time()
while response.json()["status"] not in ["succeeded", "failed"]: while response.json()["status"] not in ["succeeded", "failed"]:
@ -112,11 +111,9 @@ class CustomHTTPTransport(httpx.HTTPTransport):
content=json.dumps(timeout).encode("utf-8"), content=json.dumps(timeout).encode("utf-8"),
request=request, request=request,
) )
time.sleep(int(response.headers.get("retry-after", None) or 10))
time.sleep(int(response.headers.get("retry-after")) or 10)
response = super().handle_request(request) response = super().handle_request(request)
response.read() response.read()
if response.json()["status"] == "failed": if response.json()["status"] == "failed":
error_data = response.json() error_data = response.json()
return httpx.Response( return httpx.Response(

View file

@ -120,9 +120,7 @@ def completion(
## Load Config ## Load Config
inference_params = copy.deepcopy(optional_params) inference_params = copy.deepcopy(optional_params)
inference_params.pop( stream = inference_params.pop("stream", None)
"stream", None
) # palm does not support streaming, so we handle this by fake streaming in main.py
config = litellm.GeminiConfig.get_config() config = litellm.GeminiConfig.get_config()
for k, v in config.items(): for k, v in config.items():
if ( if (
@ -139,10 +137,18 @@ def completion(
## COMPLETION CALL ## COMPLETION CALL
try: try:
_model = genai.GenerativeModel(f"models/{model}") _model = genai.GenerativeModel(f"models/{model}")
response = _model.generate_content( if stream != True:
contents=prompt, response = _model.generate_content(
generation_config=genai.types.GenerationConfig(**inference_params), contents=prompt,
) generation_config=genai.types.GenerationConfig(**inference_params),
)
else:
response = _model.generate_content(
contents=prompt,
generation_config=genai.types.GenerationConfig(**inference_params),
stream=True,
)
return response
except Exception as e: except Exception as e:
raise GeminiError( raise GeminiError(
message=str(e), message=str(e),
@ -184,9 +190,13 @@ def completion(
if hasattr(response, "candidates"): if hasattr(response, "candidates"):
original_response = f"response: {response.candidates}" original_response = f"response: {response.candidates}"
if "SAFETY" in original_response: if "SAFETY" in original_response:
original_response += "\nThe candidate content was flagged for safety reasons." original_response += (
"\nThe candidate content was flagged for safety reasons."
)
elif "RECITATION" in original_response: elif "RECITATION" in original_response:
original_response += "\nThe candidate content was flagged for recitation reasons." original_response += (
"\nThe candidate content was flagged for recitation reasons."
)
raise GeminiError( raise GeminiError(
status_code=400, status_code=400,
message=f"No response received. Original response - {original_response}", message=f"No response received. Original response - {original_response}",
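A hedged sketch of the native streaming path this change enables (the model name is an assumption; chunks are wrapped by `CustomStreamWrapper` in `main.py` below):

```python
import litellm

response = litellm.completion(
    model="gemini/gemini-pro",  # assumed model name for the google-generativeai route
    messages=[{"role": "user", "content": "write a one-line poem"}],
    stream=True,
)
for chunk in response:
    print(chunk.choices[0].delta.content or "", end="")
```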

View file

@ -220,8 +220,10 @@ def get_ollama_response(
model_response["choices"][0]["message"] = response_json["message"] model_response["choices"][0]["message"] = response_json["message"]
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model model_response["model"] = "ollama/" + model
prompt_tokens = response_json["prompt_eval_count"] # type: ignore prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages)) # type: ignore
completion_tokens = response_json["eval_count"] completion_tokens = response_json.get(
"eval_count", litellm.token_counter(text=response_json["message"])
)
model_response["usage"] = litellm.Usage( model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens, prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens, completion_tokens=completion_tokens,
@ -320,8 +322,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
model_response["choices"][0]["message"] = response_json["message"] model_response["choices"][0]["message"] = response_json["message"]
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + data["model"] model_response["model"] = "ollama/" + data["model"]
prompt_tokens = response_json["prompt_eval_count"] # type: ignore prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore
completion_tokens = response_json["eval_count"] completion_tokens = response_json.get(
"eval_count", litellm.token_counter(text=response_json["message"])
)
model_response["usage"] = litellm.Usage( model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens, prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens, completion_tokens=completion_tokens,

View file

@ -1,5 +1,5 @@
from typing import Optional, Union, Any from typing import Optional, Union, Any
import types, time, json import types, time, json, traceback
import httpx import httpx
from .base import BaseLLM from .base import BaseLLM
from litellm.utils import ( from litellm.utils import (
@ -349,7 +349,7 @@ class OpenAIChatCompletion(BaseLLM):
if hasattr(e, "status_code"): if hasattr(e, "status_code"):
raise OpenAIError(status_code=e.status_code, message=str(e)) raise OpenAIError(status_code=e.status_code, message=str(e))
else: else:
raise OpenAIError(status_code=500, message=str(e)) raise OpenAIError(status_code=500, message=traceback.format_exc())
async def acompletion( async def acompletion(
self, self,
@ -706,19 +706,34 @@ class OpenAIChatCompletion(BaseLLM):
## COMPLETION CALL ## COMPLETION CALL
response = openai_client.images.generate(**data, timeout=timeout) # type: ignore response = openai_client.images.generate(**data, timeout=timeout) # type: ignore
response = response.model_dump() # type: ignore
## LOGGING ## LOGGING
logging_obj.post_call( logging_obj.post_call(
input=input, input=prompt,
api_key=api_key, api_key=api_key,
additional_args={"complete_input_dict": data}, additional_args={"complete_input_dict": data},
original_response=response, original_response=response,
) )
# return response # return response
return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="image_generation") # type: ignore return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation") # type: ignore
except OpenAIError as e: except OpenAIError as e:
exception_mapping_worked = True exception_mapping_worked = True
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=str(e),
)
raise e raise e
except Exception as e: except Exception as e:
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=str(e),
)
if hasattr(e, "status_code"): if hasattr(e, "status_code"):
raise OpenAIError(status_code=e.status_code, message=str(e)) raise OpenAIError(status_code=e.status_code, message=str(e))
else: else:

View file

@ -99,12 +99,16 @@ def ollama_pt(
def mistral_instruct_pt(messages): def mistral_instruct_pt(messages):
# Following the Mistral chat template example: https://huggingface.co/docs/transformers/main/chat_templating
prompt = custom_prompt( prompt = custom_prompt(
initial_prompt_value="<s>", initial_prompt_value="<s>",
role_dict={ role_dict={
"system": {"pre_message": "[INST]", "post_message": "[/INST]"}, "system": {
"user": {"pre_message": "[INST]", "post_message": "[/INST]"}, "pre_message": "[INST] \n",
"assistant": {"pre_message": "[INST]", "post_message": "[/INST]"}, "post_message": " [/INST]\n",
},
"user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
"assistant": {"pre_message": " ", "post_message": " "},
}, },
final_prompt_value="</s>", final_prompt_value="</s>",
messages=messages, messages=messages,
@ -372,6 +376,7 @@ def anthropic_pt(
You can "put words in Claude's mouth" by ending with an assistant message. You can "put words in Claude's mouth" by ending with an assistant message.
See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth
""" """
class AnthropicConstants(Enum): class AnthropicConstants(Enum):
HUMAN_PROMPT = "\n\nHuman: " HUMAN_PROMPT = "\n\nHuman: "
AI_PROMPT = "\n\nAssistant: " AI_PROMPT = "\n\nAssistant: "
@ -399,27 +404,30 @@ def _load_image_from_url(image_url):
try: try:
from PIL import Image from PIL import Image
except: except:
raise Exception("gemini image conversion failed please run `pip install Pillow`") raise Exception(
"gemini image conversion failed please run `pip install Pillow`"
)
from io import BytesIO from io import BytesIO
try: try:
# Send a GET request to the image URL # Send a GET request to the image URL
response = requests.get(image_url) response = requests.get(image_url)
response.raise_for_status() # Raise an exception for HTTP errors response.raise_for_status() # Raise an exception for HTTP errors
# Check the response's content type to ensure it is an image # Check the response's content type to ensure it is an image
content_type = response.headers.get('content-type') content_type = response.headers.get("content-type")
if not content_type or 'image' not in content_type: if not content_type or "image" not in content_type:
raise ValueError(f"URL does not point to a valid image (content-type: {content_type})") raise ValueError(
f"URL does not point to a valid image (content-type: {content_type})"
)
# Load the image from the response content # Load the image from the response content
return Image.open(BytesIO(response.content)) return Image.open(BytesIO(response.content))
except requests.RequestException as e: except requests.RequestException as e:
print(f"Request failed: {e}") raise Exception(f"Request failed: {e}")
except UnidentifiedImageError: except Exception as e:
print("Cannot identify image file (it may not be a supported image format or might be corrupted).") raise e
except ValueError as e:
print(e)
def _gemini_vision_convert_messages(messages: list): def _gemini_vision_convert_messages(messages: list):
@ -437,10 +445,11 @@ def _gemini_vision_convert_messages(messages: list):
try: try:
from PIL import Image from PIL import Image
except: except:
raise Exception("gemini image conversion failed please run `pip install Pillow`") raise Exception(
"gemini image conversion failed please run `pip install Pillow`"
)
try: try:
# given messages for gpt-4 vision, convert them for gemini # given messages for gpt-4 vision, convert them for gemini
# https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb # https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb
prompt = "" prompt = ""
@ -589,7 +598,7 @@ def prompt_factory(
if custom_llm_provider == "ollama": if custom_llm_provider == "ollama":
return ollama_pt(model=model, messages=messages) return ollama_pt(model=model, messages=messages)
elif custom_llm_provider == "anthropic": elif custom_llm_provider == "anthropic":
if any(_ in model for _ in ["claude-2.1","claude-v2:1"]): if any(_ in model for _ in ["claude-2.1", "claude-v2:1"]):
return claude_2_1_pt(messages=messages) return claude_2_1_pt(messages=messages)
else: else:
return anthropic_pt(messages=messages) return anthropic_pt(messages=messages)

View file

@ -25,6 +25,46 @@ class SagemakerError(Exception):
) # Call the base class constructor with the parameters it needs ) # Call the base class constructor with the parameters it needs
import io
import json
class TokenIterator:
def __init__(self, stream):
self.byte_iterator = iter(stream)
self.buffer = io.BytesIO()
self.read_pos = 0
self.end_of_data = False
def __iter__(self):
return self
def __next__(self):
try:
while True:
self.buffer.seek(self.read_pos)
line = self.buffer.readline()
if line and line[-1] == ord("\n"):
response_obj = {"text": "", "is_finished": False}
self.read_pos += len(line) + 1
full_line = line[:-1].decode("utf-8")
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
if line_data.get("generated_text", None) is not None:
self.end_of_data = True
response_obj["is_finished"] = True
response_obj["text"] = line_data["token"]["text"]
return response_obj
chunk = next(self.byte_iterator)
self.buffer.seek(0, io.SEEK_END)
self.buffer.write(chunk["PayloadPart"]["Bytes"])
except StopIteration as e:
if self.end_of_data == True:
raise e # Re-raise StopIteration
else:
self.end_of_data = True
return "data: [DONE]"
class SagemakerConfig: class SagemakerConfig:
""" """
Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb
@ -121,7 +161,6 @@ def completion(
# pop streaming if it's in the optional params as 'stream' raises an error with sagemaker # pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
inference_params = deepcopy(optional_params) inference_params = deepcopy(optional_params)
inference_params.pop("stream", None)
## Load Config ## Load Config
config = litellm.SagemakerConfig.get_config() config = litellm.SagemakerConfig.get_config()
@ -152,6 +191,28 @@ def completion(
hf_model_name or model hf_model_name or model
) # pass in hf model name for pulling it's prompt template - (e.g. `hf_model_name="meta-llama/Llama-2-7b-chat-hf` applies the llama2 chat template to the prompt) ) # pass in hf model name for pulling it's prompt template - (e.g. `hf_model_name="meta-llama/Llama-2-7b-chat-hf` applies the llama2 chat template to the prompt)
prompt = prompt_factory(model=hf_model_name, messages=messages) prompt = prompt_factory(model=hf_model_name, messages=messages)
stream = inference_params.pop("stream", None)
if stream == True:
data = json.dumps(
{"inputs": prompt, "parameters": inference_params, "stream": True}
).encode("utf-8")
## LOGGING
request_str = f"""
response = client.invoke_endpoint_with_response_stream(
EndpointName={model},
ContentType="application/json",
Body={data},
CustomAttributes="accept_eula=true",
)
""" # type: ignore
response = client.invoke_endpoint_with_response_stream(
EndpointName=model,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
return response["Body"]
data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode( data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode(
"utf-8" "utf-8"

View file

View file

@ -10,12 +10,11 @@
import os, openai, sys, json, inspect, uuid, datetime, threading import os, openai, sys, json, inspect, uuid, datetime, threading
from typing import Any, Literal, Union from typing import Any, Literal, Union
from functools import partial from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy from copy import deepcopy
import httpx import httpx
import litellm import litellm
from ._logging import verbose_logger
from litellm import ( # type: ignore from litellm import ( # type: ignore
client, client,
exception_type, exception_type,
@ -83,6 +82,7 @@ from litellm.utils import (
TextCompletionResponse, TextCompletionResponse,
TextChoices, TextChoices,
EmbeddingResponse, EmbeddingResponse,
ImageResponse,
read_config_args, read_config_args,
Choices, Choices,
Message, Message,
@ -273,14 +273,10 @@ async def acompletion(
else: else:
# Call the synchronous function using run_in_executor # Call the synchronous function using run_in_executor
response = await loop.run_in_executor(None, func_with_context) # type: ignore response = await loop.run_in_executor(None, func_with_context) # type: ignore
# if kwargs.get("stream", False): # return an async generator if isinstance(response, CustomStreamWrapper):
# return _async_streaming( response.set_logging_event_loop(
# response=response, loop=loop
# model=model, ) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
# custom_llm_provider=custom_llm_provider,
# args=args,
# )
# else:
return response return response
except Exception as e: except Exception as e:
custom_llm_provider = custom_llm_provider or "openai" custom_llm_provider = custom_llm_provider or "openai"
@ -343,6 +339,18 @@ def mock_completion(
model_response["choices"][0]["message"]["content"] = mock_response model_response["choices"][0]["message"]["content"] = mock_response
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
model_response["model"] = model model_response["model"] = model
model_response.usage = Usage(
prompt_tokens=10, completion_tokens=20, total_tokens=30
)
try:
_, custom_llm_provider, _, _ = litellm.utils.get_llm_provider(model=model)
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
except:
# don't let setting a hidden param block a mock_response
pass
return model_response return model_response
except: except:
@ -445,6 +453,8 @@ def completion(
### CUSTOM MODEL COST ### ### CUSTOM MODEL COST ###
input_cost_per_token = kwargs.get("input_cost_per_token", None) input_cost_per_token = kwargs.get("input_cost_per_token", None)
output_cost_per_token = kwargs.get("output_cost_per_token", None) output_cost_per_token = kwargs.get("output_cost_per_token", None)
input_cost_per_second = kwargs.get("input_cost_per_second", None)
output_cost_per_second = kwargs.get("output_cost_per_second", None)
### CUSTOM PROMPT TEMPLATE ### ### CUSTOM PROMPT TEMPLATE ###
initial_prompt_value = kwargs.get("initial_prompt_value", None) initial_prompt_value = kwargs.get("initial_prompt_value", None)
roles = kwargs.get("roles", None) roles = kwargs.get("roles", None)
@ -522,6 +532,8 @@ def completion(
"tpm", "tpm",
"input_cost_per_token", "input_cost_per_token",
"output_cost_per_token", "output_cost_per_token",
"input_cost_per_second",
"output_cost_per_second",
"hf_model_name", "hf_model_name",
"model_info", "model_info",
"proxy_server_request", "proxy_server_request",
@ -534,10 +546,6 @@ def completion(
non_default_params = { non_default_params = {
k: v for k, v in kwargs.items() if k not in default_params k: v for k, v in kwargs.items() if k not in default_params
} # model-specific params - pass them straight to the model/provider } # model-specific params - pass them straight to the model/provider
if mock_response:
return mock_completion(
model, messages, stream=stream, mock_response=mock_response
)
if timeout is None: if timeout is None:
timeout = ( timeout = (
kwargs.get("request_timeout", None) or 600 kwargs.get("request_timeout", None) or 600
@ -577,6 +585,10 @@ def completion(
) )
if model_response is not None and hasattr(model_response, "_hidden_params"): if model_response is not None and hasattr(model_response, "_hidden_params"):
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
model_response._hidden_params["region_name"] = kwargs.get(
"aws_region_name", None
) # support region-based pricing for bedrock
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ### ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
if input_cost_per_token is not None and output_cost_per_token is not None: if input_cost_per_token is not None and output_cost_per_token is not None:
litellm.register_model( litellm.register_model(
@ -588,6 +600,19 @@ def completion(
} }
} }
) )
if (
input_cost_per_second is not None
): # time based pricing just needs cost in place
output_cost_per_second = output_cost_per_second or 0.0
litellm.register_model(
{
model: {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
}
}
)
### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ### ### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
custom_prompt_dict = {} # type: ignore custom_prompt_dict = {} # type: ignore
if ( if (
@ -674,6 +699,10 @@ def completion(
optional_params=optional_params, optional_params=optional_params,
litellm_params=litellm_params, litellm_params=litellm_params,
) )
if mock_response:
return mock_completion(
model, messages, stream=stream, mock_response=mock_response
)
if custom_llm_provider == "azure": if custom_llm_provider == "azure":
# azure configs # azure configs
api_type = get_secret("AZURE_API_TYPE") or "azure" api_type = get_secret("AZURE_API_TYPE") or "azure"
@ -692,9 +721,9 @@ def completion(
or get_secret("AZURE_API_KEY") or get_secret("AZURE_API_KEY")
) )
azure_ad_token = optional_params.pop("azure_ad_token", None) or get_secret( azure_ad_token = optional_params.get("extra_body", {}).pop(
"AZURE_AD_TOKEN" "azure_ad_token", None
) ) or get_secret("AZURE_AD_TOKEN")
headers = headers or litellm.headers headers = headers or litellm.headers
@ -967,6 +996,7 @@ def completion(
encoding=encoding, # for calculating input/output tokens encoding=encoding, # for calculating input/output tokens
api_key=api_key, api_key=api_key,
logging_obj=logging, logging_obj=logging,
headers=headers,
) )
if "stream" in optional_params and optional_params["stream"] == True: if "stream" in optional_params and optional_params["stream"] == True:
# don't try to access stream object, # don't try to access stream object,
@ -1376,11 +1406,29 @@ def completion(
acompletion=acompletion, acompletion=acompletion,
custom_prompt_dict=custom_prompt_dict, custom_prompt_dict=custom_prompt_dict,
) )
if (
"stream" in optional_params
and optional_params["stream"] == True
and acompletion == False
):
response = CustomStreamWrapper(
iter(model_response),
model,
custom_llm_provider="gemini",
logging_obj=logging,
)
return response
response = model_response response = model_response
elif custom_llm_provider == "vertex_ai": elif custom_llm_provider == "vertex_ai":
vertex_ai_project = litellm.vertex_project or get_secret("VERTEXAI_PROJECT") vertex_ai_project = (
vertex_ai_location = litellm.vertex_location or get_secret( optional_params.pop("vertex_ai_project", None)
"VERTEXAI_LOCATION" or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.pop("vertex_ai_location", None)
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
) )
model_response = vertex_ai.completion( model_response = vertex_ai.completion(
@ -1471,19 +1519,22 @@ def completion(
if ( if (
"stream" in optional_params and optional_params["stream"] == True "stream" in optional_params and optional_params["stream"] == True
): ## [BETA] ): ## [BETA]
# sagemaker does not support streaming as of now so we're faking streaming:
# https://discuss.huggingface.co/t/streaming-output-text-when-deploying-on-sagemaker/39611
# "SageMaker is currently not supporting streaming responses."
# fake streaming for sagemaker
print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER") print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
resp_string = model_response["choices"][0]["message"]["content"] from .llms.sagemaker import TokenIterator
tokenIterator = TokenIterator(model_response)
response = CustomStreamWrapper( response = CustomStreamWrapper(
resp_string, completion_stream=tokenIterator,
model, model=model,
custom_llm_provider="sagemaker", custom_llm_provider="sagemaker",
logging_obj=logging, logging_obj=logging,
) )
## LOGGING
logging.post_call(
input=messages,
api_key=None,
original_response=response,
)
return response return response
## RESPONSE OBJECT ## RESPONSE OBJECT
@ -2176,6 +2227,7 @@ def embedding(
model, model,
input=[], input=[],
# Optional params # Optional params
dimensions: Optional[int] = None,
timeout=600, # default to 10 minutes timeout=600, # default to 10 minutes
# set api_base, api_version, api_key # set api_base, api_version, api_key
api_base: Optional[str] = None, api_base: Optional[str] = None,
@ -2196,6 +2248,7 @@ def embedding(
Parameters: Parameters:
- model: The embedding model to use. - model: The embedding model to use.
- input: The input for which embeddings are to be generated. - input: The input for which embeddings are to be generated.
- dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
- timeout: The timeout value for the API call, default 10 mins - timeout: The timeout value for the API call, default 10 mins
- litellm_call_id: The call ID for litellm logging. - litellm_call_id: The call ID for litellm logging.
- litellm_logging_obj: The litellm logging object. - litellm_logging_obj: The litellm logging object.
@ -2222,8 +2275,14 @@ def embedding(
encoding_format = kwargs.get("encoding_format", None) encoding_format = kwargs.get("encoding_format", None)
proxy_server_request = kwargs.get("proxy_server_request", None) proxy_server_request = kwargs.get("proxy_server_request", None)
aembedding = kwargs.get("aembedding", None) aembedding = kwargs.get("aembedding", None)
### CUSTOM MODEL COST ###
input_cost_per_token = kwargs.get("input_cost_per_token", None)
output_cost_per_token = kwargs.get("output_cost_per_token", None)
input_cost_per_second = kwargs.get("input_cost_per_second", None)
output_cost_per_second = kwargs.get("output_cost_per_second", None)
openai_params = [ openai_params = [
"user", "user",
"dimensions",
"request_timeout", "request_timeout",
"api_base", "api_base",
"api_version", "api_version",
@ -2270,6 +2329,8 @@ def embedding(
"tpm", "tpm",
"input_cost_per_token", "input_cost_per_token",
"output_cost_per_token", "output_cost_per_token",
"input_cost_per_second",
"output_cost_per_second",
"hf_model_name", "hf_model_name",
"proxy_server_request", "proxy_server_request",
"model_info", "model_info",
@ -2290,11 +2351,35 @@ def embedding(
api_key=api_key, api_key=api_key,
) )
optional_params = get_optional_params_embeddings( optional_params = get_optional_params_embeddings(
model=model,
user=user, user=user,
dimensions=dimensions,
encoding_format=encoding_format, encoding_format=encoding_format,
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
**non_default_params, **non_default_params,
) )
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
if input_cost_per_token is not None and output_cost_per_token is not None:
litellm.register_model(
{
model: {
"input_cost_per_token": input_cost_per_token,
"output_cost_per_token": output_cost_per_token,
"litellm_provider": custom_llm_provider,
}
}
)
if input_cost_per_second is not None: # time based pricing just needs cost in place
output_cost_per_second = output_cost_per_second or 0.0
litellm.register_model(
{
model: {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
}
}
)
try: try:
response = None response = None
logging = litellm_logging_obj logging = litellm_logging_obj
@ -2916,6 +3001,7 @@ def image_generation(
else: else:
model = "dall-e-2" model = "dall-e-2"
custom_llm_provider = "openai" # default to dall-e-2 on openai custom_llm_provider = "openai" # default to dall-e-2 on openai
model_response._hidden_params["model"] = model
openai_params = [ openai_params = [
"user", "user",
"request_timeout", "request_timeout",
@ -2989,7 +3075,7 @@ def image_generation(
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
**non_default_params, **non_default_params,
) )
logging = litellm_logging_obj logging: Logging = litellm_logging_obj
logging.update_environment_variables( logging.update_environment_variables(
model=model, model=model,
user=user, user=user,
@ -3089,6 +3175,9 @@ async def ahealth_check(
if model is None: if model is None:
raise Exception("model not set") raise Exception("model not set")
if model in litellm.model_cost and mode is None:
mode = litellm.model_cost[model]["mode"]
model, custom_llm_provider, _, _ = get_llm_provider(model=model) model, custom_llm_provider, _, _ = get_llm_provider(model=model)
mode = mode or "chat" # default to chat completion calls mode = mode or "chat" # default to chat completion calls
@ -3263,8 +3352,20 @@ def stream_chunk_builder_text_completion(chunks: list, messages: Optional[List]
return response return response
def stream_chunk_builder(chunks: list, messages: Optional[list] = None): def stream_chunk_builder(
chunks: list, messages: Optional[list] = None, start_time=None, end_time=None
):
model_response = litellm.ModelResponse() model_response = litellm.ModelResponse()
### SORT CHUNKS BASED ON CREATED ORDER ##
print_verbose("Goes into checking if chunk has hiddden created at param")
if chunks[0]._hidden_params.get("created_at", None):
print_verbose("Chunks have a created at hidden param")
# Sort chunks based on created_at in ascending order
chunks = sorted(
chunks, key=lambda x: x._hidden_params.get("created_at", float("inf"))
)
print_verbose("Chunks sorted")
# set hidden params from chunk to model_response # set hidden params from chunk to model_response
if model_response is not None and hasattr(model_response, "_hidden_params"): if model_response is not None and hasattr(model_response, "_hidden_params"):
model_response._hidden_params = chunks[0].get("_hidden_params", {}) model_response._hidden_params = chunks[0].get("_hidden_params", {})
@ -3438,5 +3539,8 @@ def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"] response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
) )
return convert_to_model_response_object( return convert_to_model_response_object(
response_object=response, model_response_object=model_response response_object=response,
model_response_object=model_response,
start_time=start_time,
end_time=end_time,
) )
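A hedged sketch of the new `dimensions` parameter on `embedding()` (per the docstring above, only text-embedding-3 and later models support it):

```python
import litellm

response = litellm.embedding(
    model="text-embedding-3-small",
    input=["good morning from litellm"],
    dimensions=256,  # request 256-dimensional vectors
)
print(len(response.data[0]["embedding"]))  # expected: 256
```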

View file

@ -1,8 +1,8 @@
from pydantic import BaseModel, Extra, Field, root_validator from pydantic import BaseModel, Extra, Field, root_validator, Json
import enum import enum
from typing import Optional, List, Union, Dict, Literal from typing import Optional, List, Union, Dict, Literal, Any
from datetime import datetime from datetime import datetime
import uuid, json import uuid, json, sys, os
class LiteLLMBase(BaseModel): class LiteLLMBase(BaseModel):
@ -13,7 +13,7 @@ class LiteLLMBase(BaseModel):
def json(self, **kwargs): def json(self, **kwargs):
try: try:
return self.model_dump() # noqa return self.model_dump() # noqa
except: except Exception as e:
# if using pydantic v1 # if using pydantic v1
return self.dict() return self.dict()
@ -122,27 +122,59 @@ class ModelParams(LiteLLMBase):
return values return values
class GenerateKeyRequest(LiteLLMBase): class GenerateRequestBase(LiteLLMBase):
duration: Optional[str] = "1h" """
Overlapping schema between key and user generate/update requests
"""
models: Optional[list] = [] models: Optional[list] = []
spend: Optional[float] = 0
max_budget: Optional[float] = None
user_id: Optional[str] = None
team_id: Optional[str] = None
max_parallel_requests: Optional[int] = None
metadata: Optional[dict] = {}
tpm_limit: Optional[int] = None
rpm_limit: Optional[int] = None
budget_duration: Optional[str] = None
class GenerateKeyRequest(GenerateRequestBase):
key_alias: Optional[str] = None
duration: Optional[str] = None
aliases: Optional[dict] = {} aliases: Optional[dict] = {}
config: Optional[dict] = {} config: Optional[dict] = {}
spend: Optional[float] = 0
user_id: Optional[str] = None
max_parallel_requests: Optional[int] = None
metadata: Optional[dict] = {}
class UpdateKeyRequest(LiteLLMBase): class GenerateKeyResponse(GenerateKeyRequest):
key: str
key_name: Optional[str] = None
expires: Optional[datetime]
user_id: str
@root_validator(pre=True)
def set_model_info(cls, values):
if values.get("token") is not None:
values.update({"key": values.get("token")})
dict_fields = ["metadata", "aliases", "config"]
for field in dict_fields:
value = values.get(field)
if value is not None and isinstance(value, str):
try:
values[field] = json.loads(value)
except json.JSONDecodeError:
raise ValueError(f"Field {field} should be a valid dictionary")
return values
class UpdateKeyRequest(GenerateKeyRequest):
# Note: the defaults of all Params here MUST BE NONE
# else they will get overwritten
key: str key: str
duration: Optional[str] = None duration: Optional[str] = None
models: Optional[list] = None
aliases: Optional[dict] = None
config: Optional[dict] = None
spend: Optional[float] = None spend: Optional[float] = None
user_id: Optional[str] = None metadata: Optional[dict] = None
max_parallel_requests: Optional[int] = None
metadata: Optional[dict] = {}
class UserAPIKeyAuth(LiteLLMBase): # the expected response object for user api key auth class UserAPIKeyAuth(LiteLLMBase): # the expected response object for user api key auth
@ -155,20 +187,17 @@ class UserAPIKeyAuth(LiteLLMBase): # the expected response object for user api
aliases: dict = {} aliases: dict = {}
config: dict = {} config: dict = {}
spend: Optional[float] = 0 spend: Optional[float] = 0
max_budget: Optional[float] = None
user_id: Optional[str] = None user_id: Optional[str] = None
max_parallel_requests: Optional[int] = None max_parallel_requests: Optional[int] = None
duration: str = "1h" duration: str = "1h"
metadata: dict = {} metadata: dict = {}
tpm_limit: Optional[int] = None
rpm_limit: Optional[int] = None
class GenerateKeyResponse(LiteLLMBase):
key: str
expires: Optional[datetime]
user_id: str
class DeleteKeyRequest(LiteLLMBase): class DeleteKeyRequest(LiteLLMBase):
keys: List[str] keys: List
class NewUserRequest(GenerateKeyRequest): class NewUserRequest(GenerateKeyRequest):
@ -179,6 +208,14 @@ class NewUserResponse(GenerateKeyResponse):
max_budget: Optional[float] = None max_budget: Optional[float] = None
class UpdateUserRequest(GenerateRequestBase):
# Note: the defaults of all Params here MUST BE NONE
# else they will get overwritten
user_id: str
spend: Optional[float] = None
metadata: Optional[dict] = None
class KeyManagementSystem(enum.Enum): class KeyManagementSystem(enum.Enum):
GOOGLE_KMS = "google_kms" GOOGLE_KMS = "google_kms"
AZURE_KEY_VAULT = "azure_key_vault" AZURE_KEY_VAULT = "azure_key_vault"
@ -194,6 +231,7 @@ class DynamoDBArgs(LiteLLMBase):
user_table_name: str = "LiteLLM_UserTable" user_table_name: str = "LiteLLM_UserTable"
key_table_name: str = "LiteLLM_VerificationToken" key_table_name: str = "LiteLLM_VerificationToken"
config_table_name: str = "LiteLLM_Config" config_table_name: str = "LiteLLM_Config"
spend_table_name: str = "LiteLLM_SpendLogs"
class ConfigGeneralSettings(LiteLLMBase): class ConfigGeneralSettings(LiteLLMBase):
@ -283,7 +321,10 @@ class ConfigYAML(LiteLLMBase):
class LiteLLM_VerificationToken(LiteLLMBase): class LiteLLM_VerificationToken(LiteLLMBase):
token: str token: str
key_name: Optional[str] = None
key_alias: Optional[str] = None
spend: float = 0.0 spend: float = 0.0
max_budget: Optional[float] = None
expires: Union[str, None] expires: Union[str, None]
models: List[str] models: List[str]
aliases: Dict[str, str] = {} aliases: Dict[str, str] = {}
@ -291,6 +332,10 @@ class LiteLLM_VerificationToken(LiteLLMBase):
user_id: Union[str, None] user_id: Union[str, None]
max_parallel_requests: Union[int, None] max_parallel_requests: Union[int, None]
metadata: Dict[str, str] = {} metadata: Dict[str, str] = {}
tpm_limit: Optional[int] = None
rpm_limit: Optional[int] = None
budget_duration: Optional[str] = None
budget_reset_at: Optional[datetime] = None
class LiteLLM_Config(LiteLLMBase): class LiteLLM_Config(LiteLLMBase):
@ -310,5 +355,22 @@ class LiteLLM_UserTable(LiteLLMBase):
if values.get("spend") is None: if values.get("spend") is None:
values.update({"spend": 0.0}) values.update({"spend": 0.0})
if values.get("models") is None: if values.get("models") is None:
values.update({"models", []}) values.update({"models": []})
return values return values
class LiteLLM_SpendLogs(LiteLLMBase):
request_id: str
api_key: str
model: Optional[str] = ""
call_type: str
spend: Optional[float] = 0.0
total_tokens: Optional[int] = 0
prompt_tokens: Optional[int] = 0
completion_tokens: Optional[int] = 0
startTime: Union[str, datetime, None]
endTime: Union[str, datetime, None]
user: Optional[str] = ""
metadata: Optional[Json] = {}
cache_hit: Optional[str] = "False"
cache_key: Optional[str] = None
@ -98,7 +98,7 @@ def list_models():
st.error(f"An error occurred while requesting models: {e}") st.error(f"An error occurred while requesting models: {e}")
else: else:
st.warning( st.warning(
"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page." f"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page. Currently set Proxy Endpoint: {st.session_state.get('api_url', None)} and Proxy Key: {st.session_state.get('proxy_key', None)}"
) )
@ -151,7 +151,7 @@ def create_key():
raise e raise e
else: else:
st.warning( st.warning(
"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page." f"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page. Currently set Proxy Endpoint: {st.session_state.get('api_url', None)} and Proxy Key: {st.session_state.get('proxy_key', None)}"
) )
@ -5,6 +5,7 @@ from litellm.proxy._types import (
LiteLLM_Config, LiteLLM_Config,
LiteLLM_UserTable, LiteLLM_UserTable,
) )
from litellm.proxy.utils import hash_token
from litellm import get_secret from litellm import get_secret
from typing import Any, List, Literal, Optional, Union from typing import Any, List, Literal, Optional, Union
import json import json
@ -131,10 +132,27 @@ class DynamoDBWrapper(CustomDB):
raise Exception( raise Exception(
f"Failed to create table - {self.database_arguments.config_table_name}.\nPlease create a new table called {self.database_arguments.config_table_name}\nAND set `hash_key` as 'param_name'" f"Failed to create table - {self.database_arguments.config_table_name}.\nPlease create a new table called {self.database_arguments.config_table_name}\nAND set `hash_key` as 'param_name'"
) )
## Spend
try:
verbose_proxy_logger.debug("DynamoDB Wrapper - Creating Spend Table")
error_occurred = False
table = client.table(self.database_arguments.spend_table_name)
if not await table.exists():
await table.create(
self.throughput_type,
KeySchema(hash_key=KeySpec("request_id", KeyType.string)),
)
except Exception as e:
error_occurred = True
if error_occurred == True:
raise Exception(
f"Failed to create table - {self.database_arguments.key_table_name}.\nPlease create a new table called {self.database_arguments.key_table_name}\nAND set `hash_key` as 'token'"
)
verbose_proxy_logger.debug("DynamoDB Wrapper - Done connecting()") verbose_proxy_logger.debug("DynamoDB Wrapper - Done connecting()")
async def insert_data( async def insert_data(
self, value: Any, table_name: Literal["user", "key", "config"] self, value: Any, table_name: Literal["user", "key", "config", "spend"]
): ):
from aiodynamo.client import Client from aiodynamo.client import Client
from aiodynamo.credentials import Credentials, StaticCredentials from aiodynamo.credentials import Credentials, StaticCredentials
@ -166,8 +184,13 @@ class DynamoDBWrapper(CustomDB):
table = client.table(self.database_arguments.key_table_name) table = client.table(self.database_arguments.key_table_name)
elif table_name == "config": elif table_name == "config":
table = client.table(self.database_arguments.config_table_name) table = client.table(self.database_arguments.config_table_name)
elif table_name == "spend":
table = client.table(self.database_arguments.spend_table_name)
value = value.copy()
for k, v in value.items(): for k, v in value.items():
if k == "token" and value[k].startswith("sk-"):
value[k] = hash_token(token=v)
if isinstance(v, datetime): if isinstance(v, datetime):
value[k] = v.isoformat() value[k] = v.isoformat()
@ -224,6 +247,10 @@ class DynamoDBWrapper(CustomDB):
and isinstance(v, str) and isinstance(v, str)
): ):
new_response[k] = json.loads(v) new_response[k] = json.loads(v)
elif (k == "tpm_limit" or k == "rpm_limit") and isinstance(
v, float
):
new_response[k] = int(v)
else: else:
new_response[k] = v new_response[k] = v
new_response = LiteLLM_VerificationToken(**new_response) new_response = LiteLLM_VerificationToken(**new_response)
@ -281,10 +308,13 @@ class DynamoDBWrapper(CustomDB):
# Initialize an empty UpdateExpression # Initialize an empty UpdateExpression
actions: List = [] actions: List = []
value = value.copy()
for k, v in value.items(): for k, v in value.items():
# Convert datetime object to ISO8601 string # Convert datetime object to ISO8601 string
if isinstance(v, datetime): if isinstance(v, datetime):
v = v.isoformat() v = v.isoformat()
if k == "token" and value[k].startswith("sk-"):
value[k] = hash_token(token=v)
# Accumulate updates # Accumulate updates
actions.append((F(k), Value(value=v))) actions.append((F(k), Value(value=v)))
@ -1,4 +1,4 @@
from litellm.proxy._types import UserAPIKeyAuth from litellm.proxy._types import UserAPIKeyAuth, GenerateKeyRequest
from fastapi import Request from fastapi import Request
from dotenv import load_dotenv from dotenv import load_dotenv
import os import os
@ -14,3 +14,40 @@ async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
raise Exception raise Exception
except: except:
raise Exception raise Exception
async def generate_key_fn(data: GenerateKeyRequest):
"""
Asynchronously decides if a key should be generated or not based on the provided data.
Args:
data (GenerateKeyRequest): The data to be used for decision making.
Returns:
dict: {"decision": bool, "message": Optional[str]} - whether a key should be generated, plus an optional reason when it is rejected.
"""
# decide if a key should be generated or not
data_json = data.json() # type: ignore
# Unpacking variables
team_id = data_json.get("team_id")
duration = data_json.get("duration")
models = data_json.get("models")
aliases = data_json.get("aliases")
config = data_json.get("config")
spend = data_json.get("spend")
user_id = data_json.get("user_id")
max_parallel_requests = data_json.get("max_parallel_requests")
metadata = data_json.get("metadata")
tpm_limit = data_json.get("tpm_limit")
rpm_limit = data_json.get("rpm_limit")
if team_id is not None and len(team_id) > 0:
return {
"decision": True,
}
else:
return {
"decision": True,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
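A quick, hedged illustration of how this custom key-generation hook behaves when called directly (assumes it runs in the same module as generate_key_fn above; the team id is a made-up value):

# Illustration only: exercising the generate_key_fn hook defined above.
import asyncio
from litellm.proxy._types import GenerateKeyRequest

async def _demo():
    allowed = await generate_key_fn(GenerateKeyRequest(team_id="core-team"))
    rejected = await generate_key_fn(GenerateKeyRequest())  # no team_id supplied
    print(allowed)   # {'decision': True}
    print(rejected)  # {'decision': False, 'message': 'This violates LiteLLM Proxy Rules. No team id provided.'}

asyncio.run(_demo())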
@ -1,9 +1,12 @@
from typing import Optional from typing import Optional
import litellm import litellm, traceback, sys
from litellm.caching import DualCache from litellm.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth from litellm.proxy._types import UserAPIKeyAuth
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
from fastapi import HTTPException from fastapi import HTTPException
from litellm._logging import verbose_proxy_logger
from litellm import ModelResponse
from datetime import datetime
class MaxParallelRequestsHandler(CustomLogger): class MaxParallelRequestsHandler(CustomLogger):
@ -14,8 +17,7 @@ class MaxParallelRequestsHandler(CustomLogger):
pass pass
def print_verbose(self, print_statement): def print_verbose(self, print_statement):
if litellm.set_verbose is True: verbose_proxy_logger.debug(print_statement)
print(print_statement) # noqa
async def async_pre_call_hook( async def async_pre_call_hook(
self, self,
@ -26,25 +28,56 @@ class MaxParallelRequestsHandler(CustomLogger):
): ):
self.print_verbose(f"Inside Max Parallel Request Pre-Call Hook") self.print_verbose(f"Inside Max Parallel Request Pre-Call Hook")
api_key = user_api_key_dict.api_key api_key = user_api_key_dict.api_key
max_parallel_requests = user_api_key_dict.max_parallel_requests max_parallel_requests = user_api_key_dict.max_parallel_requests or sys.maxsize
tpm_limit = user_api_key_dict.tpm_limit or sys.maxsize
rpm_limit = user_api_key_dict.rpm_limit or sys.maxsize
if api_key is None: if api_key is None:
return return
if max_parallel_requests is None: if (
max_parallel_requests == sys.maxsize
and tpm_limit == sys.maxsize
and rpm_limit == sys.maxsize
):
return return
self.user_api_key_cache = cache # save the api key cache for updating the value self.user_api_key_cache = cache # save the api key cache for updating the value
# ------------
# Setup values
# ------------
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = f"{api_key}::{precise_minute}::request_count"
# CHECK IF REQUEST ALLOWED # CHECK IF REQUEST ALLOWED
request_count_api_key = f"{api_key}_request_count" current = cache.get_cache(
current = cache.get_cache(key=request_count_api_key) key=request_count_api_key
) # {"current_requests": 1, "current_tpm": 1, "current_rpm": 10}
self.print_verbose(f"current: {current}") self.print_verbose(f"current: {current}")
if current is None: if current is None:
cache.set_cache(request_count_api_key, 1) new_val = {
elif int(current) < max_parallel_requests: "current_requests": 1,
"current_tpm": 0,
"current_rpm": 0,
}
cache.set_cache(request_count_api_key, new_val)
elif (
int(current["current_requests"]) < max_parallel_requests
and current["current_tpm"] < tpm_limit
and current["current_rpm"] < rpm_limit
):
# Increase count for this token # Increase count for this token
cache.set_cache(request_count_api_key, int(current) + 1) new_val = {
"current_requests": current["current_requests"] + 1,
"current_tpm": current["current_tpm"],
"current_rpm": current["current_rpm"],
}
cache.set_cache(request_count_api_key, new_val)
else: else:
raise HTTPException( raise HTTPException(
status_code=429, detail="Max parallel request limit reached." status_code=429, detail="Max parallel request limit reached."
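The pre-call hook above tracks usage in a single cache entry per API key per minute, holding current_requests, current_tpm and current_rpm. A standalone sketch of that key format and the admission check (a simplified restatement, not the handler itself):

# Simplified restatement of the per-minute limiter logic above.
import sys
from datetime import datetime
from typing import Optional

def make_counter_key(api_key: str) -> str:
    # e.g. "<hashed-key>::2024-01-30-16-53::request_count"
    precise_minute = datetime.now().strftime("%Y-%m-%d-%H-%M")
    return f"{api_key}::{precise_minute}::request_count"

def request_allowed(current: Optional[dict], max_parallel: int, tpm_limit: int, rpm_limit: int) -> bool:
    if current is None:
        return True  # first request in this minute window
    return (
        current["current_requests"] < (max_parallel or sys.maxsize)
        and current["current_tpm"] < (tpm_limit or sys.maxsize)
        and current["current_rpm"] < (rpm_limit or sys.maxsize)
    )

print(request_allowed({"current_requests": 2, "current_tpm": 500, "current_rpm": 10}, 5, 1000, 60))  # True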
@ -52,7 +85,7 @@ class MaxParallelRequestsHandler(CustomLogger):
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try: try:
self.print_verbose(f"INSIDE ASYNC SUCCESS LOGGING") self.print_verbose(f"INSIDE parallel request limiter ASYNC SUCCESS LOGGING")
user_api_key = kwargs["litellm_params"]["metadata"]["user_api_key"] user_api_key = kwargs["litellm_params"]["metadata"]["user_api_key"]
if user_api_key is None: if user_api_key is None:
return return
@ -60,29 +93,50 @@ class MaxParallelRequestsHandler(CustomLogger):
if self.user_api_key_cache is None: if self.user_api_key_cache is None:
return return
request_count_api_key = f"{user_api_key}_request_count" # ------------
# check if it has collected an entire stream response # Setup values
self.print_verbose( # ------------
f"'complete_streaming_response' is in kwargs: {'complete_streaming_response' in kwargs}"
) current_date = datetime.now().strftime("%Y-%m-%d")
if "complete_streaming_response" in kwargs or kwargs["stream"] != True: current_hour = datetime.now().strftime("%H")
# Decrease count for this token current_minute = datetime.now().strftime("%M")
current = ( precise_minute = f"{current_date}-{current_hour}-{current_minute}"
self.user_api_key_cache.get_cache(key=request_count_api_key) or 1
) total_tokens = 0
new_val = current - 1
self.print_verbose(f"updated_value in success call: {new_val}") if isinstance(response_obj, ModelResponse):
self.user_api_key_cache.set_cache(request_count_api_key, new_val) total_tokens = response_obj.usage.total_tokens
request_count_api_key = f"{user_api_key}::{precise_minute}::request_count"
current = self.user_api_key_cache.get_cache(key=request_count_api_key) or {
"current_requests": 1,
"current_tpm": total_tokens,
"current_rpm": 1,
}
# ------------
# Update usage
# ------------
new_val = {
"current_requests": current["current_requests"] - 1,
"current_tpm": current["current_tpm"] + total_tokens,
"current_rpm": current["current_rpm"] + 1,
}
self.print_verbose(f"updated_value in success call: {new_val}")
self.user_api_key_cache.set_cache(
request_count_api_key, new_val, ttl=60
) # store in cache for 1 min.
except Exception as e: except Exception as e:
self.print_verbose(e) # noqa self.print_verbose(e) # noqa
async def async_log_failure_call( async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
self, user_api_key_dict: UserAPIKeyAuth, original_exception: Exception
):
try: try:
self.print_verbose(f"Inside Max Parallel Request Failure Hook") self.print_verbose(f"Inside Max Parallel Request Failure Hook")
api_key = user_api_key_dict.api_key user_api_key = kwargs["litellm_params"]["metadata"]["user_api_key"]
if api_key is None: if user_api_key is None:
return return
if self.user_api_key_cache is None: if self.user_api_key_cache is None:
@ -90,19 +144,46 @@ class MaxParallelRequestsHandler(CustomLogger):
## decrement call count if call failed ## decrement call count if call failed
if ( if (
hasattr(original_exception, "status_code") hasattr(kwargs["exception"], "status_code")
and original_exception.status_code == 429 and kwargs["exception"].status_code == 429
and "Max parallel request limit reached" in str(original_exception) and "Max parallel request limit reached" in str(kwargs["exception"])
): ):
pass # ignore failed calls due to max limit being reached pass # ignore failed calls due to max limit being reached
else: else:
request_count_api_key = f"{api_key}_request_count" # ------------
# Decrease count for this token # Setup values
current = ( # ------------
self.user_api_key_cache.get_cache(key=request_count_api_key) or 1
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = (
f"{user_api_key}::{precise_minute}::request_count"
) )
new_val = current - 1
# ------------
# Update usage
# ------------
current = self.user_api_key_cache.get_cache(
key=request_count_api_key
) or {
"current_requests": 1,
"current_tpm": 0,
"current_rpm": 0,
}
new_val = {
"current_requests": current["current_requests"] - 1,
"current_tpm": current["current_tpm"],
"current_rpm": current["current_rpm"],
}
self.print_verbose(f"updated_value in failure call: {new_val}") self.print_verbose(f"updated_value in failure call: {new_val}")
self.user_api_key_cache.set_cache(request_count_api_key, new_val) self.user_api_key_cache.set_cache(
request_count_api_key, new_val, ttl=60
) # save in cache for up to 1 min.
except Exception as e: except Exception as e:
self.print_verbose(f"An exception occurred - {str(e)}") # noqa print(f"An exception occurred - {str(e)}") # noqa
@ -157,6 +157,12 @@ def is_port_in_use(port):
type=int, type=int,
help="Number of requests to hit async endpoint with", help="Number of requests to hit async endpoint with",
) )
@click.option(
"--run_gunicorn",
default=False,
is_flag=True,
help="Starts proxy via gunicorn, instead of uvicorn (better for managing multiple workers)",
)
@click.option("--local", is_flag=True, default=False, help="for local debugging") @click.option("--local", is_flag=True, default=False, help="for local debugging")
def run_server( def run_server(
host, host,
@ -186,21 +192,32 @@ def run_server(
use_queue, use_queue,
health, health,
version, version,
run_gunicorn,
): ):
global feature_telemetry global feature_telemetry
args = locals() args = locals()
if local: if local:
from proxy_server import app, save_worker_config, usage_telemetry from proxy_server import app, save_worker_config, usage_telemetry, ProxyConfig
else: else:
try: try:
from .proxy_server import app, save_worker_config, usage_telemetry from .proxy_server import (
app,
save_worker_config,
usage_telemetry,
ProxyConfig,
)
except ImportError as e: except ImportError as e:
if "litellm[proxy]" in str(e): if "litellm[proxy]" in str(e):
# user is missing a proxy dependency, ask them to pip install litellm[proxy] # user is missing a proxy dependency, ask them to pip install litellm[proxy]
raise e raise e
else: else:
# this is just a local/relative import error, user git cloned litellm # this is just a local/relative import error, user git cloned litellm
from proxy_server import app, save_worker_config, usage_telemetry from proxy_server import (
app,
save_worker_config,
usage_telemetry,
ProxyConfig,
)
feature_telemetry = usage_telemetry feature_telemetry = usage_telemetry
if version == True: if version == True:
pkg_version = importlib.metadata.version("litellm") pkg_version = importlib.metadata.version("litellm")
@ -373,16 +390,16 @@ def run_server(
read from there and save it to os.env['DATABASE_URL'] read from there and save it to os.env['DATABASE_URL']
""" """
try: try:
import yaml import yaml, asyncio
except: except:
raise ImportError( raise ImportError(
"yaml needs to be imported. Run - `pip install 'litellm[proxy]'`" "yaml needs to be imported. Run - `pip install 'litellm[proxy]'`"
) )
if os.path.exists(config): proxy_config = ProxyConfig()
with open(config, "r") as config_file: _, _, general_settings = asyncio.run(
config = yaml.safe_load(config_file) proxy_config.load_config(router=None, config_file_path=config)
general_settings = config.get("general_settings", {}) )
database_url = general_settings.get("database_url", None) database_url = general_settings.get("database_url", None)
if database_url and database_url.startswith("os.environ/"): if database_url and database_url.startswith("os.environ/"):
original_dir = os.getcwd() original_dir = os.getcwd()
@ -418,6 +435,7 @@ def run_server(
break # Exit the loop if the subprocess succeeds break # Exit the loop if the subprocess succeeds
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
print(f"Error: {e}") print(f"Error: {e}")
time.sleep(random.randrange(start=1, stop=5))
finally: finally:
os.chdir(original_dir) os.chdir(original_dir)
else: else:
@ -428,9 +446,9 @@ def run_server(
port = random.randint(1024, 49152) port = random.randint(1024, 49152)
from litellm.proxy.proxy_server import app from litellm.proxy.proxy_server import app
if os.name == "nt": if run_gunicorn == False:
uvicorn.run(app, host=host, port=port) # run uvicorn uvicorn.run(app, host=host, port=port) # run uvicorn
else: elif run_gunicorn == True:
import gunicorn.app.base import gunicorn.app.base
# Gunicorn Application Class # Gunicorn Application Class
@ -11,6 +11,12 @@ model_list:
output_cost_per_token: 0.00003 output_cost_per_token: 0.00003
max_tokens: 4096 max_tokens: 4096
base_model: gpt-3.5-turbo base_model: gpt-3.5-turbo
- model_name: gpt-4
litellm_params:
model: azure/chatgpt-v-2
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
- model_name: gpt-vision - model_name: gpt-vision
litellm_params: litellm_params:
model: azure/gpt-4-vision model: azure/gpt-4-vision
@ -25,6 +31,9 @@ model_list:
- model_name: BEDROCK_GROUP - model_name: BEDROCK_GROUP
litellm_params: litellm_params:
model: bedrock/cohere.command-text-v14 model: bedrock/cohere.command-text-v14
- model_name: tg-ai
litellm_params:
model: together_ai/mistralai/Mistral-7B-Instruct-v0.1
- model_name: sagemaker - model_name: sagemaker
litellm_params: litellm_params:
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4 model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
@ -57,12 +66,22 @@ model_list:
mode: embedding mode: embedding
litellm_settings: litellm_settings:
fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}] fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
success_callback: ['langfuse']
max_budget: 10 # global budget for proxy
budget_duration: 30d # global budget duration, will reset after 30d
default_key_generate_params:
max_budget: 1.5000
models: ["azure-gpt-3.5"]
duration: None
# cache: True # cache: True
# setting callback class # setting callback class
# callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
# general_settings: general_settings:
# master_key: sk-1234 allow_user_auth: True
master_key: sk-1234
alerting: ["slack"]
alerting_threshold: 10 # sends alerts if requests hang for more than 10 seconds
# database_type: "dynamo_db" # database_type: "dynamo_db"
# database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190 # database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
# "billing_mode": "PAY_PER_REQUEST", # "billing_mode": "PAY_PER_REQUEST",
File diff suppressed because it is too large
@ -7,28 +7,62 @@ generator client {
provider = "prisma-client-py" provider = "prisma-client-py"
} }
// Track spend, rate limit, budget Users
model LiteLLM_UserTable { model LiteLLM_UserTable {
user_id String @unique user_id String @unique
team_id String?
max_budget Float? max_budget Float?
spend Float @default(0.0) spend Float @default(0.0)
user_email String? user_email String?
models String[] @default([]) models String[]
max_parallel_requests Int?
tpm_limit BigInt?
rpm_limit BigInt?
budget_duration String?
budget_reset_at DateTime?
} }
// required for token gen // Generate Tokens for Proxy
model LiteLLM_VerificationToken { model LiteLLM_VerificationToken {
token String @unique token String @unique
key_name String?
key_alias String?
spend Float @default(0.0) spend Float @default(0.0)
expires DateTime? expires DateTime?
models String[] @default([]) models String[]
aliases Json @default("{}") aliases Json @default("{}")
config Json @default("{}") config Json @default("{}")
user_id String? user_id String?
team_id String?
max_parallel_requests Int? max_parallel_requests Int?
metadata Json @default("{}") metadata Json @default("{}")
tpm_limit BigInt?
rpm_limit BigInt?
max_budget Float?
budget_duration String?
budget_reset_at DateTime?
} }
// store proxy config.yaml
model LiteLLM_Config { model LiteLLM_Config {
param_name String @id param_name String @id
param_value Json? param_value Json?
} }
// View spend, model, api_key per request
model LiteLLM_SpendLogs {
request_id String @unique
call_type String
api_key String @default ("")
spend Float @default(0.0)
total_tokens Int @default(0)
prompt_tokens Int @default(0)
completion_tokens Int @default(0)
startTime DateTime // Assuming start_time is a DateTime field
endTime DateTime // Assuming end_time is a DateTime field
model String @default("")
user String @default("")
metadata Json @default("{}")
cache_hit String @default("")
cache_key String @default("")
}
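The LiteLLM_SpendLogs model above is where per-request spend rows land. A hedged sketch of reading it back with prisma-client-py (assumes `prisma generate` has been run against this schema and the database is reachable; the ordering mirrors the find_many call used later in this diff):

# Sketch only: fetch the most recent spend logs.
import asyncio
from prisma import Prisma

async def recent_spend(limit: int = 10):
    db = Prisma()
    await db.connect()
    try:
        return await db.litellm_spendlogs.find_many(
            take=limit,
            order={"startTime": "desc"},  # newest requests first
        )
    finally:
        await db.disconnect()

print(asyncio.run(recent_spend()))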
@ -11,12 +11,10 @@ async def litellm_completion():
# Your existing code for litellm_completion goes here # Your existing code for litellm_completion goes here
try: try:
response = await litellm_client.chat.completions.create( response = await litellm_client.chat.completions.create(
model="Azure OpenAI GPT-4 Canada-East (External)", model="azure-gpt-3.5",
stream=True,
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}], messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
) )
async for chunk in response: print(response)
print(chunk)
return response return response
except Exception as e: except Exception as e:
@ -27,9 +25,9 @@ async def litellm_completion():
async def main(): async def main():
for i in range(1000000): for i in range(150):
start = time.time() start = time.time()
n = 1000 # Number of concurrent tasks n = 150 # Number of concurrent tasks
tasks = [litellm_completion() for _ in range(n)] tasks = [litellm_completion() for _ in range(n)]
chat_completions = await asyncio.gather(*tasks) chat_completions = await asyncio.gather(*tasks)
@ -4,22 +4,28 @@ const openai = require('openai');
process.env.DEBUG=false; process.env.DEBUG=false;
async function runOpenAI() { async function runOpenAI() {
const client = new openai.OpenAI({ const client = new openai.OpenAI({
apiKey: 'your_api_key_here', apiKey: 'sk-JkKeNi6WpWDngBsghJ6B9g',
baseURL: 'http://0.0.0.0:8000' baseURL: 'http://0.0.0.0:8000'
}); });
try { try {
const response = await client.chat.completions.create({ const response = await client.chat.completions.create({
model: 'azure-gpt-3.5', model: 'sagemaker',
stream: true,
max_tokens: 1000,
messages: [ messages: [
{ {
role: 'user', role: 'user',
content: 'this is a test request, write a short poem'.repeat(2000), content: 'write a 20 pg essay about YC ',
}, },
], ],
}); });
console.log(response); console.log(response);
for await (const chunk of response) {
console.log(chunk);
console.log(chunk.choices[0].delta.content);
}
} catch (error) { } catch (error) {
console.log("got this exception from server"); console.log("got this exception from server");
console.error(error); console.error(error);
@ -1,7 +1,12 @@
from typing import Optional, List, Any, Literal, Union from typing import Optional, List, Any, Literal, Union
import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx
import litellm, backoff import litellm, backoff
from litellm.proxy._types import UserAPIKeyAuth, DynamoDBArgs from litellm.proxy._types import (
UserAPIKeyAuth,
DynamoDBArgs,
LiteLLM_VerificationToken,
LiteLLM_SpendLogs,
)
from litellm.caching import DualCache from litellm.caching import DualCache
from litellm.proxy.hooks.parallel_request_limiter import MaxParallelRequestsHandler from litellm.proxy.hooks.parallel_request_limiter import MaxParallelRequestsHandler
from litellm.proxy.hooks.max_budget_limiter import MaxBudgetLimiter from litellm.proxy.hooks.max_budget_limiter import MaxBudgetLimiter
@ -9,10 +14,10 @@ from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy.db.base_client import CustomDB from litellm.proxy.db.base_client import CustomDB
from litellm._logging import verbose_proxy_logger from litellm._logging import verbose_proxy_logger
from fastapi import HTTPException, status from fastapi import HTTPException, status
import smtplib import smtplib, re
from email.mime.text import MIMEText from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart from email.mime.multipart import MIMEMultipart
from datetime import datetime from datetime import datetime, timedelta
def print_verbose(print_statement): def print_verbose(print_statement):
@ -92,7 +97,7 @@ class ProxyLogging:
3. /image/generation 3. /image/generation
""" """
### ALERTING ### ### ALERTING ###
asyncio.create_task(self.response_taking_too_long()) asyncio.create_task(self.response_taking_too_long(request_data=data))
try: try:
for callback in litellm.callbacks: for callback in litellm.callbacks:
@ -132,27 +137,113 @@ class ProxyLogging:
start_time: Optional[float] = None, start_time: Optional[float] = None,
end_time: Optional[float] = None, end_time: Optional[float] = None,
type: Literal["hanging_request", "slow_response"] = "hanging_request", type: Literal["hanging_request", "slow_response"] = "hanging_request",
request_data: Optional[dict] = None,
): ):
if request_data is not None:
model = request_data.get("model", "")
messages = request_data.get("messages", "")
# try casting messages to str and get the first 100 characters, else mark as None
try:
messages = str(messages)
messages = messages[:10000]
except:
messages = None
request_info = f"\nRequest Model: {model}\nMessages: {messages}"
else:
request_info = ""
if type == "hanging_request": if type == "hanging_request":
# Simulate a long-running operation that could take more than 5 minutes # Simulate a long-running operation that could take more than 5 minutes
await asyncio.sleep( await asyncio.sleep(
self.alerting_threshold self.alerting_threshold
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests ) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
if (
await self.alerting_handler( request_data is not None
message=f"Requests are hanging - {self.alerting_threshold}s+ request time", and request_data.get("litellm_status", "") != "success"
level="Medium", ):
) # only alert hanging responses if they have not been marked as success
alerting_message = (
f"Requests are hanging - {self.alerting_threshold}s+ request time"
)
await self.alerting_handler(
message=alerting_message + request_info,
level="Medium",
)
elif ( elif (
type == "slow_response" and start_time is not None and end_time is not None type == "slow_response" and start_time is not None and end_time is not None
): ):
slow_message = f"Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s"
if end_time - start_time > self.alerting_threshold: if end_time - start_time > self.alerting_threshold:
await self.alerting_handler( await self.alerting_handler(
message=f"Responses are slow - {round(end_time-start_time,2)}s response time", message=slow_message + request_info,
level="Low", level="Low",
) )
async def budget_alerts(
self,
type: Literal["token_budget", "user_budget", "user_and_proxy_budget"],
user_max_budget: float,
user_current_spend: float,
user_info=None,
):
if self.alerting is None:
# do nothing if alerting is not switched on
return
if type == "user_and_proxy_budget":
user_info = dict(user_info)
user_id = user_info["user_id"]
max_budget = user_info["max_budget"]
spend = user_info["spend"]
user_email = user_info["user_email"]
user_info = f"""\nUser ID: {user_id}\nMax Budget: ${max_budget}\nSpend: ${spend}\nUser Email: {user_email}"""
elif type == "token_budget":
token_info = dict(user_info)
token = token_info["token"]
spend = token_info["spend"]
max_budget = token_info["max_budget"]
user_id = token_info["user_id"]
user_info = f"""\nToken: {token}\nSpend: ${spend}\nMax Budget: ${max_budget}\nUser ID: {user_id}"""
else:
user_info = str(user_info)
# percent of max_budget left to spend
percent_left = (user_max_budget - user_current_spend) / user_max_budget
verbose_proxy_logger.debug(
f"Budget Alerts: Percent left: {percent_left} for {user_info}"
)
# check if crossed budget
if user_current_spend >= user_max_budget:
verbose_proxy_logger.debug(f"Budget Crossed for {user_info}")
message = "Budget Crossed for" + user_info
await self.alerting_handler(
message=message,
level="High",
)
return
# check if 5% of max budget is left
if percent_left <= 0.05:
message = "5% budget left for" + user_info
await self.alerting_handler(
message=message,
level="Medium",
)
return
# check if 15% of max budget is left
if percent_left <= 0.15:
message = "15% budget left for" + user_info
await self.alerting_handler(
message=message,
level="Low",
)
return
return
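A small worked example of the percent_left arithmetic that drives the thresholds above (values are illustrative):

# Worked example of the budget_alerts thresholds.
user_max_budget = 10.0    # dollars
user_current_spend = 9.0  # dollars

percent_left = (user_max_budget - user_current_spend) / user_max_budget
print(percent_left)  # 0.1 -> above the 5% cutoff but within 15%, so a "Low" level alert fires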
async def alerting_handler( async def alerting_handler(
self, message: str, level: Literal["Low", "Medium", "High"] self, message: str, level: Literal["Low", "Medium", "High"]
): ):
@ -163,12 +254,20 @@ class ProxyLogging:
- Requests are hanging - Requests are hanging
- Calls are failing - Calls are failing
- DB Read/Writes are failing - DB Read/Writes are failing
- Proxy Close to max budget
- Key Close to max budget
Parameters: Parameters:
level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'. level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'.
message: str - what is the alert about message: str - what is the alert about
""" """
formatted_message = f"Level: {level}\n\nMessage: {message}" from datetime import datetime
# Get the current timestamp
current_time = datetime.now().strftime("%H:%M:%S")
formatted_message = (
f"Level: {level}\nTimestamp: {current_time}\n\nMessage: {message}"
)
if self.alerting is None: if self.alerting is None:
return return
@ -179,7 +278,9 @@ class ProxyLogging:
raise Exception("Missing SLACK_WEBHOOK_URL from environment") raise Exception("Missing SLACK_WEBHOOK_URL from environment")
payload = {"text": formatted_message} payload = {"text": formatted_message}
headers = {"Content-type": "application/json"} headers = {"Content-type": "application/json"}
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(
connector=aiohttp.TCPConnector(ssl=False)
) as session:
async with session.post( async with session.post(
slack_webhook_url, json=payload, headers=headers slack_webhook_url, json=payload, headers=headers
) as response: ) as response:
@ -316,7 +417,7 @@ class PrismaClient:
self, self,
key: str, key: str,
value: Any, value: Any,
table_name: Literal["users", "keys", "config"], table_name: Literal["users", "keys", "config", "spend"],
): ):
""" """
Generic implementation of get data Generic implementation of get data
@ -334,6 +435,10 @@ class PrismaClient:
response = await self.db.litellm_config.find_first( # type: ignore response = await self.db.litellm_config.find_first( # type: ignore
where={key: value} # type: ignore where={key: value} # type: ignore
) )
elif table_name == "spend":
response = await self.db.litellm_spendlogs.find_first( # type: ignore
where={key: value} # type: ignore
)
return response return response
except Exception as e: except Exception as e:
asyncio.create_task( asyncio.create_task(
@ -352,8 +457,12 @@ class PrismaClient:
self, self,
token: Optional[str] = None, token: Optional[str] = None,
user_id: Optional[str] = None, user_id: Optional[str] = None,
table_name: Optional[Literal["user", "key", "config"]] = None, user_id_list: Optional[list] = None,
key_val: Optional[dict] = None,
table_name: Optional[Literal["user", "key", "config", "spend"]] = None,
query_type: Literal["find_unique", "find_all"] = "find_unique", query_type: Literal["find_unique", "find_all"] = "find_unique",
expires: Optional[datetime] = None,
reset_at: Optional[datetime] = None,
): ):
try: try:
print_verbose("PrismaClient: get_data") print_verbose("PrismaClient: get_data")
@ -365,20 +474,51 @@ class PrismaClient:
hashed_token = token hashed_token = token
if token.startswith("sk-"): if token.startswith("sk-"):
hashed_token = self.hash_token(token=token) hashed_token = self.hash_token(token=token)
print_verbose("PrismaClient: find_unique") verbose_proxy_logger.debug(
f"PrismaClient: find_unique for token: {hashed_token}"
)
if query_type == "find_unique": if query_type == "find_unique":
response = await self.db.litellm_verificationtoken.find_unique( response = await self.db.litellm_verificationtoken.find_unique(
where={"token": hashed_token} where={"token": hashed_token}
) )
if response is not None:
# for prisma we need to cast the expires time to str
if response.expires is not None and isinstance(
response.expires, datetime
):
response.expires = response.expires.isoformat()
elif query_type == "find_all" and user_id is not None: elif query_type == "find_all" and user_id is not None:
response = await self.db.litellm_verificationtoken.find_many( response = await self.db.litellm_verificationtoken.find_many(
where={"user_id": user_id} where={"user_id": user_id}
) )
if response is not None and len(response) > 0:
for r in response:
if isinstance(r.expires, datetime):
r.expires = r.expires.isoformat()
elif (
query_type == "find_all"
and expires is not None
and reset_at is not None
):
response = await self.db.litellm_verificationtoken.find_many(
where={ # type:ignore
"OR": [
{"expires": None},
{"expires": {"gt": expires}},
],
"budget_reset_at": {"lt": reset_at},
}
)
if response is not None and len(response) > 0:
for r in response:
if isinstance(r.expires, datetime):
r.expires = r.expires.isoformat()
elif query_type == "find_all":
response = await self.db.litellm_verificationtoken.find_many(
order={"spend": "desc"},
)
print_verbose(f"PrismaClient: response={response}") print_verbose(f"PrismaClient: response={response}")
if response is not None: if response is not None:
# for prisma we need to cast the expires time to str
if isinstance(response.expires, datetime):
response.expires = response.expires.isoformat()
return response return response
else: else:
# Token does not exist. # Token does not exist.
@ -386,13 +526,61 @@ class PrismaClient:
status_code=status.HTTP_401_UNAUTHORIZED, status_code=status.HTTP_401_UNAUTHORIZED,
detail="Authentication Error: invalid user key - token does not exist", detail="Authentication Error: invalid user key - token does not exist",
) )
elif user_id is not None: elif user_id is not None or (
response = await self.db.litellm_usertable.find_unique( # type: ignore table_name is not None and table_name == "user"
where={ ):
"user_id": user_id, if query_type == "find_unique":
} response = await self.db.litellm_usertable.find_unique( # type: ignore
) where={
"user_id": user_id, # type: ignore
}
)
elif query_type == "find_all" and reset_at is not None:
response = await self.db.litellm_usertable.find_many(
where={ # type:ignore
"budget_reset_at": {"lt": reset_at},
}
)
elif query_type == "find_all" and user_id_list is not None:
user_id_values = str(tuple(user_id_list))
sql_query = f"""
SELECT *
FROM "LiteLLM_UserTable"
WHERE "user_id" IN {user_id_values}
"""
# Execute the raw query
# user_id_values is interpolated directly into the IN (...) clause of sql_query above
response = await self.db.query_raw(sql_query)
elif query_type == "find_all":
response = await self.db.litellm_usertable.find_many( # type: ignore
order={"spend": "desc"},
)
return response return response
elif table_name == "spend":
verbose_proxy_logger.debug(
f"PrismaClient: get_data: table_name == 'spend'"
)
if key_val is not None:
if query_type == "find_unique":
response = await self.db.litellm_spendlogs.find_unique( # type: ignore
where={ # type: ignore
key_val["key"]: key_val["value"], # type: ignore
}
)
elif query_type == "find_all":
response = await self.db.litellm_spendlogs.find_many( # type: ignore
where={
key_val["key"]: key_val["value"], # type: ignore
}
)
return response
else:
response = await self.db.litellm_spendlogs.find_many( # type: ignore
order={"startTime": "desc"},
)
return response
except Exception as e: except Exception as e:
print_verbose(f"LiteLLM Prisma Client Exception: {e}") print_verbose(f"LiteLLM Prisma Client Exception: {e}")
import traceback import traceback
@ -412,7 +600,7 @@ class PrismaClient:
on_backoff=on_backoff, # specifying the function to call on backoff on_backoff=on_backoff, # specifying the function to call on backoff
) )
async def insert_data( async def insert_data(
self, data: dict, table_name: Literal["user", "key", "config"] self, data: dict, table_name: Literal["user", "key", "config", "spend"]
): ):
""" """
Add a key to the database. If it already exists, do nothing. Add a key to the database. If it already exists, do nothing.
@ -435,6 +623,7 @@ class PrismaClient:
"update": {}, # don't do anything if it already exists "update": {}, # don't do anything if it already exists
}, },
) )
verbose_proxy_logger.info(f"Data Inserted into Keys Table")
return new_verification_token return new_verification_token
elif table_name == "user": elif table_name == "user":
db_data = self.jsonify_object(data=data) db_data = self.jsonify_object(data=data)
@ -445,6 +634,7 @@ class PrismaClient:
"update": {}, # don't do anything if it already exists "update": {}, # don't do anything if it already exists
}, },
) )
verbose_proxy_logger.info(f"Data Inserted into User Table")
return new_user_row return new_user_row
elif table_name == "config": elif table_name == "config":
""" """
@ -468,8 +658,20 @@ class PrismaClient:
) )
tasks.append(updated_table_row) tasks.append(updated_table_row)
await asyncio.gather(*tasks) await asyncio.gather(*tasks)
verbose_proxy_logger.info(f"Data Inserted into Config Table")
elif table_name == "spend":
db_data = self.jsonify_object(data=data)
new_spend_row = await self.db.litellm_spendlogs.upsert(
where={"request_id": data["request_id"]},
data={
"create": {**db_data}, # type: ignore
"update": {}, # don't do anything if it already exists
},
)
verbose_proxy_logger.info(f"Data Inserted into Spend Table")
return new_spend_row
except Exception as e: except Exception as e:
print_verbose(f"LiteLLM Prisma Client Exception: {e}") print_verbose(f"LiteLLM Prisma Client Exception: {e}")
asyncio.create_task( asyncio.create_task(
@ -489,7 +691,11 @@ class PrismaClient:
self, self,
token: Optional[str] = None, token: Optional[str] = None,
data: dict = {}, data: dict = {},
data_list: Optional[List] = None,
user_id: Optional[str] = None, user_id: Optional[str] = None,
query_type: Literal["update", "update_many"] = "update",
table_name: Optional[Literal["user", "key", "config", "spend"]] = None,
update_key_values: Optional[dict] = None,
): ):
""" """
Update existing data Update existing data
@ -506,17 +712,95 @@ class PrismaClient:
where={"token": token}, # type: ignore where={"token": token}, # type: ignore
data={**db_data}, # type: ignore data={**db_data}, # type: ignore
) )
print_verbose("\033[91m" + f"DB write succeeded {response}" + "\033[0m") verbose_proxy_logger.debug(
"\033[91m"
+ f"DB Token Table update succeeded {response}"
+ "\033[0m"
)
return {"token": token, "data": db_data} return {"token": token, "data": db_data}
elif user_id is not None: elif (
user_id is not None
or (table_name is not None and table_name == "user")
and query_type == "update"
):
""" """
If data['spend'] + data['user'], update the user table with spend info as well If data['spend'] + data['user'], update the user table with spend info as well
""" """
update_user_row = await self.db.litellm_usertable.update( if user_id is None:
user_id = db_data["user_id"]
if update_key_values is None:
update_key_values = db_data
update_user_row = await self.db.litellm_usertable.upsert(
where={"user_id": user_id}, # type: ignore where={"user_id": user_id}, # type: ignore
data={**db_data}, # type: ignore data={
"create": {**db_data}, # type: ignore
"update": {
**update_key_values # type: ignore
}, # just update user-specified values, if it already exists
},
)
verbose_proxy_logger.info(
"\033[91m"
+ f"DB User Table - update succeeded {update_user_row}"
+ "\033[0m"
) )
return {"user_id": user_id, "data": db_data} return {"user_id": user_id, "data": db_data}
elif (
table_name is not None
and table_name == "key"
and query_type == "update_many"
and data_list is not None
and isinstance(data_list, list)
):
"""
Batch write update queries
"""
batcher = self.db.batch_()
for idx, t in enumerate(data_list):
# check if plain text or hash
if t.token.startswith("sk-"): # type: ignore
t.token = self.hash_token(token=t.token) # type: ignore
try:
data_json = self.jsonify_object(data=t.model_dump())
except:
data_json = self.jsonify_object(data=t.dict())
batcher.litellm_verificationtoken.update(
where={"token": t.token}, # type: ignore
data={**data_json}, # type: ignore
)
await batcher.commit()
print_verbose(
"\033[91m" + f"DB Token Table update succeeded" + "\033[0m"
)
elif (
table_name is not None
and table_name == "user"
and query_type == "update_many"
and data_list is not None
and isinstance(data_list, list)
):
"""
Batch write update queries
"""
batcher = self.db.batch_()
for idx, user in enumerate(data_list):
try:
data_json = self.jsonify_object(data=user.model_dump())
except:
data_json = self.jsonify_object(data=user.dict())
batcher.litellm_usertable.upsert(
where={"user_id": user.user_id}, # type: ignore
data={
"create": {**data_json}, # type: ignore
"update": {
**data_json # type: ignore
}, # just update user-specified values, if it already exists
},
)
await batcher.commit()
verbose_proxy_logger.info(
"\033[91m" + f"DB User Table Batch update succeeded" + "\033[0m"
)
except Exception as e: except Exception as e:
asyncio.create_task( asyncio.create_task(
self.proxy_logging_obj.failure_handler(original_exception=e) self.proxy_logging_obj.failure_handler(original_exception=e)
@ -537,7 +821,13 @@ class PrismaClient:
Allow user to delete a key(s) Allow user to delete a key(s)
""" """
try: try:
hashed_tokens = [self.hash_token(token=token) for token in tokens] hashed_tokens = []
for token in tokens:
if isinstance(token, str) and token.startswith("sk-"):
hashed_token = self.hash_token(token=token)
else:
hashed_token = token
hashed_tokens.append(hashed_token)
await self.db.litellm_verificationtoken.delete_many( await self.db.litellm_verificationtoken.delete_many(
where={"token": {"in": hashed_tokens}} where={"token": {"in": hashed_tokens}}
) )
@ -745,7 +1035,8 @@ async def send_email(sender_name, sender_email, receiver_email, subject, html):
print_verbose(f"SMTP Connection Init") print_verbose(f"SMTP Connection Init")
# Establish a secure connection with the SMTP server # Establish a secure connection with the SMTP server
with smtplib.SMTP(smtp_host, smtp_port) as server: with smtplib.SMTP(smtp_host, smtp_port) as server:
server.starttls() if os.getenv("SMTP_TLS", 'True') != "False":
server.starttls()
# Login to your email account # Login to your email account
server.login(smtp_username, smtp_password) server.login(smtp_username, smtp_password)
@ -754,4 +1045,164 @@ async def send_email(sender_name, sender_email, receiver_email, subject, html):
server.send_message(email_message) server.send_message(email_message)
except Exception as e: except Exception as e:
print_verbose("An error occurred while sending the email:", str(e)) print_verbose("An error occurred while sending the email:" + str(e))
def hash_token(token: str):
import hashlib
# Hash the string using SHA-256
hashed_token = hashlib.sha256(token.encode()).hexdigest()
return hashed_token
def get_logging_payload(kwargs, response_obj, start_time, end_time):
from litellm.proxy._types import LiteLLM_SpendLogs
from pydantic import Json
import uuid
verbose_proxy_logger.debug(
f"SpendTable: get_logging_payload - kwargs: {kwargs}\n\n"
)
if kwargs is None:
kwargs = {}
# standardize this function to be used across, s3, dynamoDB, langfuse logging
litellm_params = kwargs.get("litellm_params", {})
metadata = (
litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None
call_type = kwargs.get("call_type", "litellm.completion")
cache_hit = kwargs.get("cache_hit", False)
usage = response_obj["usage"]
if type(usage) == litellm.Usage:
usage = dict(usage)
id = response_obj.get("id", str(uuid.uuid4()))
api_key = metadata.get("user_api_key", "")
if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
# hash the api_key
api_key = hash_token(api_key)
if "headers" in metadata and "authorization" in metadata["headers"]:
metadata["headers"].pop(
"authorization"
) # do not store the original `sk-..` api key in the db
if litellm.cache is not None:
cache_key = litellm.cache.get_cache_key(**kwargs)
else:
cache_key = "Cache OFF"
if cache_hit == True:
import time
id = f"{id}_cache_hit{time.time()}" # SpendLogs does not allow duplicate request_id
payload = {
"request_id": id,
"call_type": call_type,
"api_key": api_key,
"cache_hit": cache_hit,
"startTime": start_time,
"endTime": end_time,
"model": kwargs.get("model", ""),
"user": kwargs.get("user", ""),
"metadata": metadata,
"cache_key": cache_key,
"total_tokens": usage.get("total_tokens", 0),
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
}
json_fields = [
field
for field, field_type in LiteLLM_SpendLogs.__annotations__.items()
if field_type == Json or field_type == Optional[Json]
]
str_fields = [
field
for field, field_type in LiteLLM_SpendLogs.__annotations__.items()
if field_type == str or field_type == Optional[str]
]
datetime_fields = [
field
for field, field_type in LiteLLM_SpendLogs.__annotations__.items()
if field_type == datetime
]
for param in json_fields:
if param in payload and type(payload[param]) != Json:
if type(payload[param]) == litellm.ModelResponse:
payload[param] = payload[param].model_dump_json()
elif type(payload[param]) == litellm.EmbeddingResponse:
payload[param] = payload[param].model_dump_json()
else:
payload[param] = json.dumps(payload[param])
for param in str_fields:
if param in payload and type(payload[param]) != str:
payload[param] = str(payload[param])
return payload
def _duration_in_seconds(duration: str):
match = re.match(r"(\d+)([smhd]?)", duration)
if not match:
raise ValueError("Invalid duration format")
value, unit = match.groups()
value = int(value)
if unit == "s":
return value
elif unit == "m":
return value * 60
elif unit == "h":
return value * 3600
elif unit == "d":
return value * 86400
else:
raise ValueError("Unsupported duration unit")
async def reset_budget(prisma_client: PrismaClient):
"""
Gets all the non-expired keys for a db, which need spend to be reset
Resets their spend
Updates db
"""
if prisma_client is not None:
### RESET KEY BUDGET ###
now = datetime.utcnow()
keys_to_reset = await prisma_client.get_data(
table_name="key", query_type="find_all", expires=now, reset_at=now
)
if keys_to_reset is not None and len(keys_to_reset) > 0:
for key in keys_to_reset:
key.spend = 0.0
duration_s = _duration_in_seconds(duration=key.budget_duration)
key.budget_reset_at = now + timedelta(seconds=duration_s)
await prisma_client.update_data(
query_type="update_many", data_list=keys_to_reset, table_name="key"
)
### RESET USER BUDGET ###
now = datetime.utcnow()
users_to_reset = await prisma_client.get_data(
table_name="user", query_type="find_all", reset_at=now
)
verbose_proxy_logger.debug(f"users_to_reset from get_data: {users_to_reset}")
if users_to_reset is not None and len(users_to_reset) > 0:
for user in users_to_reset:
user.spend = 0.0
duration_s = _duration_in_seconds(duration=user.budget_duration)
user.budget_reset_at = now + timedelta(seconds=duration_s)
await prisma_client.update_data(
query_type="update_many", data_list=users_to_reset, table_name="user"
)
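How reset_budget gets scheduled is outside this hunk; a hypothetical wiring, purely as a sketch, would be a periodic asyncio task (assumes an initialized PrismaClient named prisma_client):

# Hypothetical wiring, not taken from this diff: call reset_budget on an interval.
import asyncio

async def periodic_budget_reset(prisma_client, interval_seconds: int = 600):
    while True:
        await reset_budget(prisma_client=prisma_client)
        await asyncio.sleep(interval_seconds)

# e.g. asyncio.create_task(periodic_budget_reset(prisma_client)) from the proxy's startup hook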
@ -94,11 +94,15 @@ class Router:
timeout: Optional[float] = None, timeout: Optional[float] = None,
default_litellm_params={}, # default params for Router.chat.completion.create default_litellm_params={}, # default params for Router.chat.completion.create
set_verbose: bool = False, set_verbose: bool = False,
debug_level: Literal["DEBUG", "INFO"] = "INFO",
fallbacks: List = [], fallbacks: List = [],
allowed_fails: Optional[int] = None,
context_window_fallbacks: List = [], context_window_fallbacks: List = [],
model_group_alias: Optional[dict] = {}, model_group_alias: Optional[dict] = {},
retry_after: int = 0, # min time to wait before retrying a failed request retry_after: int = 0, # min time to wait before retrying a failed request
allowed_fails: Optional[
int
] = None, # Number of times a deployment can fail before being added to cooldown
cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
routing_strategy: Literal[ routing_strategy: Literal[
"simple-shuffle", "simple-shuffle",
"least-busy", "least-busy",
@ -107,7 +111,42 @@ class Router:
] = "simple-shuffle", ] = "simple-shuffle",
routing_strategy_args: dict = {}, # just for latency-based routing routing_strategy_args: dict = {}, # just for latency-based routing
) -> None: ) -> None:
"""
Initialize the Router class with the given parameters for caching, reliability, and routing strategy.
Args:
model_list (Optional[list]): List of models to be used. Defaults to None.
redis_url (Optional[str]): URL of the Redis server. Defaults to None.
redis_host (Optional[str]): Hostname of the Redis server. Defaults to None.
redis_port (Optional[int]): Port of the Redis server. Defaults to None.
redis_password (Optional[str]): Password of the Redis server. Defaults to None.
cache_responses (Optional[bool]): Flag to enable caching of responses. Defaults to False.
cache_kwargs (dict): Additional kwargs to pass to RedisCache. Defaults to {}.
caching_groups (Optional[List[tuple]]): List of model groups for caching across model groups. Defaults to None.
client_ttl (int): Time-to-live for cached clients in seconds. Defaults to 3600.
num_retries (int): Number of retries for failed requests. Defaults to 0.
timeout (Optional[float]): Timeout for requests. Defaults to None.
default_litellm_params (dict): Default parameters for Router.chat.completion.create. Defaults to {}.
set_verbose (bool): Flag to set verbose mode. Defaults to False.
debug_level (Literal["DEBUG", "INFO"]): Debug level for logging. Defaults to "INFO".
fallbacks (List): List of fallback options. Defaults to [].
context_window_fallbacks (List): List of context window fallback options. Defaults to [].
model_group_alias (Optional[dict]): Alias for model groups. Defaults to {}.
retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1.
routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.
Returns:
Router: An instance of the litellm.Router class.
"""
self.set_verbose = set_verbose self.set_verbose = set_verbose
if self.set_verbose:
if debug_level == "INFO":
verbose_router_logger.setLevel(logging.INFO)
elif debug_level == "DEBUG":
verbose_router_logger.setLevel(logging.DEBUG)
self.deployment_names: List = ( self.deployment_names: List = (
[] []
) # names of models under litellm_params. ex. azure/chatgpt-v-2 ) # names of models under litellm_params. ex. azure/chatgpt-v-2
@ -157,6 +196,7 @@ class Router:
self.deployment_latency_map[m["litellm_params"]["model"]] = 0 self.deployment_latency_map[m["litellm_params"]["model"]] = 0
self.allowed_fails = allowed_fails or litellm.allowed_fails self.allowed_fails = allowed_fails or litellm.allowed_fails
self.cooldown_time = cooldown_time or 1
self.failed_calls = ( self.failed_calls = (
InMemoryCache() InMemoryCache()
) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown ) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
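A minimal sketch of constructing the Router with the newly documented parameters (model name and API key lookup are placeholders, not values from this commit):

import os
import litellm

router = litellm.Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {
                "model": "gpt-3.5-turbo",
                "api_key": os.environ.get("OPENAI_API_KEY"),
            },
        }
    ],
    set_verbose=True,
    debug_level="DEBUG",    # new: raises verbose_router_logger to DEBUG when set_verbose is on
    allowed_fails=3,        # deployment is cooled down after 3 failures within a minute
    cooldown_time=30,       # new: seconds to keep a failed deployment on cooldown
    routing_strategy="simple-shuffle",
)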
@ -259,6 +299,7 @@ class Router:
raise e raise e
def _completion(self, model: str, messages: List[Dict[str, str]], **kwargs): def _completion(self, model: str, messages: List[Dict[str, str]], **kwargs):
model_name = None
try: try:
# pick the one that is available (lowest TPM/RPM) # pick the one that is available (lowest TPM/RPM)
deployment = self.get_available_deployment( deployment = self.get_available_deployment(
@ -271,6 +312,7 @@ class Router:
) )
data = deployment["litellm_params"].copy() data = deployment["litellm_params"].copy()
kwargs["model_info"] = deployment.get("model_info", {}) kwargs["model_info"] = deployment.get("model_info", {})
model_name = data["model"]
for k, v in self.default_litellm_params.items(): for k, v in self.default_litellm_params.items():
if ( if (
k not in kwargs k not in kwargs
@ -292,7 +334,7 @@ class Router:
else: else:
model_client = potential_model_client model_client = potential_model_client
return litellm.completion( response = litellm.completion(
**{ **{
**data, **data,
"messages": messages, "messages": messages,
@ -301,7 +343,14 @@ class Router:
**kwargs, **kwargs,
} }
) )
verbose_router_logger.info(
f"litellm.completion(model={model_name})\033[32m 200 OK\033[0m"
)
return response
except Exception as e: except Exception as e:
verbose_router_logger.info(
f"litellm.completion(model={model_name})\033[31m Exception {str(e)}\033[0m"
)
raise e raise e
async def acompletion(self, model: str, messages: List[Dict[str, str]], **kwargs): async def acompletion(self, model: str, messages: List[Dict[str, str]], **kwargs):
@ -830,6 +879,9 @@ class Router:
""" """
try: try:
kwargs["model"] = mg kwargs["model"] = mg
kwargs.setdefault("metadata", {}).update(
{"model_group": mg}
) # update model_group used, if fallbacks are done
response = await self.async_function_with_retries( response = await self.async_function_with_retries(
*args, **kwargs *args, **kwargs
) )
@ -858,8 +910,10 @@ class Router:
f"Falling back to model_group = {mg}" f"Falling back to model_group = {mg}"
) )
kwargs["model"] = mg kwargs["model"] = mg
kwargs["metadata"]["model_group"] = mg kwargs.setdefault("metadata", {}).update(
response = await self.async_function_with_retries( {"model_group": mg}
) # update model_group used, if fallbacks are done
response = await self.async_function_with_fallbacks(
*args, **kwargs *args, **kwargs
) )
return response return response
@ -1024,6 +1078,9 @@ class Router:
## LOGGING ## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=original_exception) kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
kwargs["model"] = mg kwargs["model"] = mg
kwargs.setdefault("metadata", {}).update(
{"model_group": mg}
) # update model_group used, if fallbacks are done
response = self.function_with_fallbacks(*args, **kwargs) response = self.function_with_fallbacks(*args, **kwargs)
return response return response
except Exception as e: except Exception as e:
@ -1047,6 +1104,9 @@ class Router:
## LOGGING ## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=original_exception) kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
kwargs["model"] = mg kwargs["model"] = mg
kwargs.setdefault("metadata", {}).update(
{"model_group": mg}
) # update model_group used, if fallbacks are done
response = self.function_with_fallbacks(*args, **kwargs) response = self.function_with_fallbacks(*args, **kwargs)
return response return response
except Exception as e: except Exception as e:
@ -1232,6 +1292,7 @@ class Router:
verbose_router_logger.debug( verbose_router_logger.debug(
f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}" f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
) )
cooldown_time = self.cooldown_time or 1
if updated_fails > self.allowed_fails: if updated_fails > self.allowed_fails:
# get the current cooldown list for that minute # get the current cooldown list for that minute
cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
@ -1245,13 +1306,19 @@ class Router:
else: else:
cached_value = cached_value + [deployment] cached_value = cached_value + [deployment]
# save updated value # save updated value
self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1) self.cache.set_cache(
value=cached_value, key=cooldown_key, ttl=cooldown_time
)
except: except:
cached_value = [deployment] cached_value = [deployment]
# save updated value # save updated value
self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1) self.cache.set_cache(
value=cached_value, key=cooldown_key, ttl=cooldown_time
)
else: else:
self.failed_calls.set_cache(key=deployment, value=updated_fails, ttl=1) self.failed_calls.set_cache(
key=deployment, value=updated_fails, ttl=cooldown_time
)
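The cooldown bookkeeping above reduces to a per-deployment failure counter plus a per-minute cooldown list, both expiring after cooldown_time. A standalone sketch of the same idea, with a toy dict-based cache standing in for litellm's InMemoryCache (names here are illustrative, not litellm APIs):

import time

class TTLCache:
    """Toy stand-in for an in-memory cache with per-key expiry."""
    def __init__(self):
        self._store = {}  # key -> (value, expires_at)

    def set_cache(self, key, value, ttl):
        self._store[key] = (value, time.time() + ttl)

    def get_cache(self, key):
        entry = self._store.get(key)
        if entry is None or entry[1] < time.time():
            return None
        return entry[0]

def record_failure(cache, deployment, allowed_fails=1, cooldown_time=30):
    # count failures for this deployment within the current window
    fails = (cache.get_cache(deployment) or 0) + 1
    current_minute = time.strftime("%H-%M")
    cooldown_key = f"{current_minute}:cooldown_models"
    if fails > allowed_fails:
        # add the deployment to this minute's cooldown list, expiring after cooldown_time
        cooled = cache.get_cache(cooldown_key) or []
        if deployment not in cooled:
            cooled.append(deployment)
        cache.set_cache(key=cooldown_key, value=cooled, ttl=cooldown_time)
    else:
        cache.set_cache(key=deployment, value=fails, ttl=cooldown_time)

cache = TTLCache()
record_failure(cache, "azure/chatgpt-v-2")
record_failure(cache, "azure/chatgpt-v-2")
print(cache.get_cache(f"{time.strftime('%H-%M')}:cooldown_models"))  # ['azure/chatgpt-v-2']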
def _get_cooldown_deployments(self): def _get_cooldown_deployments(self):
""" """
@ -1828,6 +1895,9 @@ class Router:
selected_index = random.choices(range(len(rpms)), weights=weights)[0] selected_index = random.choices(range(len(rpms)), weights=weights)[0]
verbose_router_logger.debug(f"\n selected index, {selected_index}") verbose_router_logger.debug(f"\n selected index, {selected_index}")
deployment = healthy_deployments[selected_index] deployment = healthy_deployments[selected_index]
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {deployment or deployment[0]} for model: {model}"
)
return deployment or deployment[0] return deployment or deployment[0]
############## Check if we can do a RPM/TPM based weighted pick ################# ############## Check if we can do a RPM/TPM based weighted pick #################
tpm = healthy_deployments[0].get("litellm_params").get("tpm", None) tpm = healthy_deployments[0].get("litellm_params").get("tpm", None)
@ -1842,6 +1912,9 @@ class Router:
selected_index = random.choices(range(len(tpms)), weights=weights)[0] selected_index = random.choices(range(len(tpms)), weights=weights)[0]
verbose_router_logger.debug(f"\n selected index, {selected_index}") verbose_router_logger.debug(f"\n selected index, {selected_index}")
deployment = healthy_deployments[selected_index] deployment = healthy_deployments[selected_index]
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {deployment or deployment[0]} for model: {model}"
)
return deployment or deployment[0] return deployment or deployment[0]
############## No RPM/TPM passed, we do a random pick ################# ############## No RPM/TPM passed, we do a random pick #################
@ -1866,8 +1939,13 @@ class Router:
) )
if deployment is None: if deployment is None:
verbose_router_logger.info(
f"get_available_deployment for model: {model}, No deployment available"
)
raise ValueError("No models available.") raise ValueError("No models available.")
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {deployment} for model: {model}"
)
return deployment return deployment
def flush_cache(self): def flush_cache(self):

View file

@ -10,6 +10,7 @@ import traceback
from litellm import token_counter from litellm import token_counter
from litellm.caching import DualCache from litellm.caching import DualCache
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_router_logger
class LowestTPMLoggingHandler(CustomLogger): class LowestTPMLoggingHandler(CustomLogger):
@ -130,6 +131,9 @@ class LowestTPMLoggingHandler(CustomLogger):
Returns a deployment with the lowest TPM/RPM usage. Returns a deployment with the lowest TPM/RPM usage.
""" """
# get list of potential deployments # get list of potential deployments
verbose_router_logger.debug(
f"get_available_deployments - Usage Based. model_group: {model_group}, healthy_deployments: {healthy_deployments}"
)
current_minute = datetime.now().strftime("%H-%M") current_minute = datetime.now().strftime("%H-%M")
tpm_key = f"{model_group}:tpm:{current_minute}" tpm_key = f"{model_group}:tpm:{current_minute}"
rpm_key = f"{model_group}:rpm:{current_minute}" rpm_key = f"{model_group}:rpm:{current_minute}"
@ -137,14 +141,31 @@ class LowestTPMLoggingHandler(CustomLogger):
tpm_dict = self.router_cache.get_cache(key=tpm_key) tpm_dict = self.router_cache.get_cache(key=tpm_key)
rpm_dict = self.router_cache.get_cache(key=rpm_key) rpm_dict = self.router_cache.get_cache(key=rpm_key)
verbose_router_logger.debug(
f"tpm_key={tpm_key}, tpm_dict: {tpm_dict}, rpm_dict: {rpm_dict}"
)
try:
input_tokens = token_counter(messages=messages, text=input)
except:
input_tokens = 0
# ----------------------- # -----------------------
# Find lowest used model # Find lowest used model
# ---------------------- # ----------------------
lowest_tpm = float("inf") lowest_tpm = float("inf")
deployment = None deployment = None
if tpm_dict is None: # base case if tpm_dict is None: # base case - none of the deployments have been used
item = random.choice(healthy_deployments) # Return the 1st deployment where deployment["tpm"] >= input_tokens
return item for deployment in healthy_deployments:
_deployment_tpm = (
deployment.get("tpm", None)
or deployment.get("litellm_params", {}).get("tpm", None)
or deployment.get("model_info", {}).get("tpm", None)
or float("inf")
)
if _deployment_tpm >= input_tokens:
return deployment
return None
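For the non-base case that follows, the handler picks the deployment with the lowest recorded TPM usage that still has headroom for the incoming tokens. A rough standalone sketch of that selection, assuming a usage dict keyed by deployment id (this is an illustration, not the handler's exact code):

def pick_lowest_tpm(usage_by_id, healthy_deployments, input_tokens):
    """usage_by_id: {deployment_id: tokens_used_this_minute}; returns the least-used deployment with headroom."""
    best, lowest = None, float("inf")
    for d in healthy_deployments:
        d_id = d["model_info"]["id"]
        used = usage_by_id.get(d_id, 0)
        limit = d.get("litellm_params", {}).get("tpm", float("inf"))
        if used + input_tokens <= limit and used < lowest:
            lowest, best = used, d
    return best

# e.g. with two deployments and 40 incoming tokens, the one with less usage this minute wins
deployments = [
    {"model_info": {"id": "azure-1"}, "litellm_params": {"tpm": 1000}},
    {"model_info": {"id": "openai-1"}, "litellm_params": {"tpm": 1000}},
]
print(pick_lowest_tpm({"azure-1": 900, "openai-1": 200}, deployments, input_tokens=40))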
all_deployments = tpm_dict all_deployments = tpm_dict
for d in healthy_deployments: for d in healthy_deployments:
@ -152,11 +173,6 @@ class LowestTPMLoggingHandler(CustomLogger):
if d["model_info"]["id"] not in all_deployments: if d["model_info"]["id"] not in all_deployments:
all_deployments[d["model_info"]["id"]] = 0 all_deployments[d["model_info"]["id"]] = 0
try:
input_tokens = token_counter(messages=messages, text=input)
except:
input_tokens = 0
for item, item_tpm in all_deployments.items(): for item, item_tpm in all_deployments.items():
## get the item from model list ## get the item from model list
_deployment = None _deployment = None

View file

@ -1,57 +0,0 @@

View file

@ -99,36 +99,68 @@ def pre_langfuse_setup():
return return
@pytest.mark.skip(reason="beta test - checking langfuse output")
def test_langfuse_logging_async(): def test_langfuse_logging_async():
# this tests the latency added by langfuse logging calls, vs plain acompletion calls
try: try:
pre_langfuse_setup() pre_langfuse_setup()
litellm.set_verbose = True litellm.set_verbose = True
# Make 5 calls with an empty success_callback
litellm.success_callback = []
start_time_empty_callback = asyncio.run(make_async_calls())
print("done with no callback test")
print("starting langfuse test")
# Make 5 calls with success_callback set to "langfuse"
litellm.success_callback = ["langfuse"] litellm.success_callback = ["langfuse"]
start_time_langfuse = asyncio.run(make_async_calls())
print("done with langfuse test")
async def _test_langfuse(): # Compare the time for both scenarios
response = await litellm.acompletion( print(f"Time taken with success_callback='langfuse': {start_time_langfuse}")
model="azure/chatgpt-v-2", print(f"Time taken with empty success_callback: {start_time_empty_callback}")
messages=[{"role": "user", "content": "This is a test"}],
max_tokens=100,
temperature=0.7,
timeout=5,
user="test_user",
)
await asyncio.sleep(1)
return response
response = asyncio.run(_test_langfuse()) # assert the diff is not more than 1 second - this was 5 seconds before the fix
print(f"response: {response}") assert abs(start_time_langfuse - start_time_empty_callback) < 1
# # check langfuse.log to see if there was a failed response
search_logs("langfuse.log")
except litellm.Timeout as e: except litellm.Timeout as e:
pass pass
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred - {e}") pytest.fail(f"An exception occurred - {e}")
async def make_async_calls():
tasks = []
for _ in range(5):
task = asyncio.create_task(
litellm.acompletion(
model="azure/chatgpt-v-2",
messages=[{"role": "user", "content": "This is a test"}],
max_tokens=5,
temperature=0.7,
timeout=5,
user="langfuse_latency_test_user",
mock_response="It's simple to use and easy to get started",
)
)
tasks.append(task)
# Measure the start time before running the tasks
start_time = asyncio.get_event_loop().time()
# Wait for all tasks to complete
responses = await asyncio.gather(*tasks)
# Print the responses when tasks return
for idx, response in enumerate(responses):
print(f"Response from Task {idx + 1}: {response}")
# Calculate the total time taken
total_time = asyncio.get_event_loop().time() - start_time
return total_time
# def test_langfuse_logging_async_text_completion(): # def test_langfuse_logging_async_text_completion():
# try: # try:
# pre_langfuse_setup() # pre_langfuse_setup()

View file

@ -115,4 +115,103 @@ def test_s3_logging():
print("Passed! Testing async s3 logging") print("Passed! Testing async s3 logging")
test_s3_logging() # test_s3_logging()
def test_s3_logging_r2():
# all s3 requests need to be in one test function
# since we are modifying stdout, and pytest runs tests in parallel
# on circle ci - we only test litellm.acompletion()
try:
# redirect stdout to log_file
# litellm.cache = litellm.Cache(
# type="s3", s3_bucket_name="litellm-r2-bucket", s3_region_name="us-west-2"
# )
litellm.set_verbose = True
from litellm._logging import verbose_logger
import logging
verbose_logger.setLevel(level=logging.DEBUG)
litellm.success_callback = ["s3"]
litellm.s3_callback_params = {
"s3_bucket_name": "litellm-r2-bucket",
"s3_aws_secret_access_key": "os.environ/R2_S3_ACCESS_KEY",
"s3_aws_access_key_id": "os.environ/R2_S3_ACCESS_ID",
"s3_endpoint_url": "os.environ/R2_S3_URL",
"s3_region_name": "os.environ/R2_S3_REGION_NAME",
}
print("Testing async s3 logging")
expected_keys = []
import time
curr_time = str(time.time())
async def _test():
return await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": f"This is a test {curr_time}"}],
max_tokens=10,
temperature=0.7,
user="ishaan-2",
)
response = asyncio.run(_test())
print(f"response: {response}")
expected_keys.append(response.id)
import boto3
s3 = boto3.client(
"s3",
endpoint_url=os.getenv("R2_S3_URL"),
region_name=os.getenv("R2_S3_REGION_NAME"),
aws_access_key_id=os.getenv("R2_S3_ACCESS_ID"),
aws_secret_access_key=os.getenv("R2_S3_ACCESS_KEY"),
)
bucket_name = "litellm-r2-bucket"
# List objects in the bucket
response = s3.list_objects(Bucket=bucket_name)
# # Sort the objects based on the LastModified timestamp
# objects = sorted(
# response["Contents"], key=lambda x: x["LastModified"], reverse=True
# )
# # Get the keys of the most recent objects
# most_recent_keys = [obj["Key"] for obj in objects]
# print(most_recent_keys)
# # for each key, get the part before "-" as the key. Do it safely
# cleaned_keys = []
# for key in most_recent_keys:
# split_key = key.split("-time=")
# cleaned_keys.append(split_key[0])
# print("\n most recent keys", most_recent_keys)
# print("\n cleaned keys", cleaned_keys)
# print("\n Expected keys: ", expected_keys)
# matches = 0
# for key in expected_keys:
# assert key in cleaned_keys
# if key in cleaned_keys:
# matches += 1
# # remove the match key
# cleaned_keys.remove(key)
# # this asserts we log, the first request + the 2nd cached request
# print("we had two matches ! passed ", matches)
# assert matches == 1
# try:
# # cleanup s3 bucket in test
# for key in most_recent_keys:
# s3.delete_object(Bucket=bucket_name, Key=key)
# except:
# # don't let cleanup fail a test
# pass
except Exception as e:
pytest.fail(f"An exception occurred - {e}")
finally:
# post, close log file and verify
# Reset stdout to the original value
print("Passed! Testing async s3 logging")

View file

@ -95,7 +95,8 @@ def test_vertex_ai():
+ litellm.vertex_code_text_models + litellm.vertex_code_text_models
) )
litellm.set_verbose = False litellm.set_verbose = False
litellm.vertex_project = "reliablekeys" vertex_ai_project = "reliablekeys"
# litellm.vertex_project = "reliablekeys"
test_models = random.sample(test_models, 1) test_models = random.sample(test_models, 1)
# test_models += litellm.vertex_language_models # always test gemini-pro # test_models += litellm.vertex_language_models # always test gemini-pro
@ -117,6 +118,7 @@ def test_vertex_ai():
model=model, model=model,
messages=[{"role": "user", "content": "hi"}], messages=[{"role": "user", "content": "hi"}],
temperature=0.7, temperature=0.7,
vertex_ai_project=vertex_ai_project,
) )
print("\nModel Response", response) print("\nModel Response", response)
print(response) print(response)
@ -302,10 +304,7 @@ def test_gemini_pro_vision():
assert prompt_tokens == 263 # the gemini api returns 263 to us assert prompt_tokens == 263 # the gemini api returns 263 to us
except Exception as e: except Exception as e:
import traceback pytest.fail(f"An exception occurred - {str(e)}")
traceback.print_exc()
raise e
# test_gemini_pro_vision() # test_gemini_pro_vision()

View file

@ -70,18 +70,16 @@ def test_completion_with_empty_model():
def test_completion_invalid_param_cohere(): def test_completion_invalid_param_cohere():
try: try:
response = completion(model="command-nightly", messages=messages, top_p=1) litellm.set_verbose = True
print(f"response: {response}") response = completion(model="command-nightly", messages=messages, seed=12)
pytest.fail(f"This should have failed cohere does not support `seed` parameter")
except Exception as e: except Exception as e:
if "Unsupported parameters passed: top_p" in str(e): if " cohere does not support parameters: {'seed': 12}" in str(e):
pass pass
else: else:
pytest.fail(f"An error occurred {e}") pytest.fail(f"An error occurred {e}")
# test_completion_invalid_param_cohere()
def test_completion_function_call_cohere(): def test_completion_function_call_cohere():
try: try:
response = completion( response = completion(

View file

@ -127,9 +127,10 @@ def test_caching_with_models_v2():
] ]
litellm.cache = Cache() litellm.cache = Cache()
print("test2 for caching") print("test2 for caching")
litellm.set_verbose = True
response1 = completion(model="gpt-3.5-turbo", messages=messages, caching=True) response1 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
response2 = completion(model="gpt-3.5-turbo", messages=messages, caching=True) response2 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
response3 = completion(model="command-nightly", messages=messages, caching=True) response3 = completion(model="azure/chatgpt-v-2", messages=messages, caching=True)
print(f"response1: {response1}") print(f"response1: {response1}")
print(f"response2: {response2}") print(f"response2: {response2}")
print(f"response3: {response3}") print(f"response3: {response3}")
@ -286,7 +287,7 @@ def test_redis_cache_completion():
response3 = completion( response3 = completion(
model="gpt-3.5-turbo", messages=messages, caching=True, temperature=0.5 model="gpt-3.5-turbo", messages=messages, caching=True, temperature=0.5
) )
response4 = completion(model="command-nightly", messages=messages, caching=True) response4 = completion(model="azure/chatgpt-v-2", messages=messages, caching=True)
print("\nresponse 1", response1) print("\nresponse 1", response1)
print("\nresponse 2", response2) print("\nresponse 2", response2)
@ -401,7 +402,7 @@ def test_redis_cache_completion_stream():
""" """
test_redis_cache_completion_stream() # test_redis_cache_completion_stream()
def test_redis_cache_acompletion_stream(): def test_redis_cache_acompletion_stream():
@ -723,8 +724,8 @@ def test_cache_override():
print(f"Embedding 2 response time: {end_time - start_time} seconds") print(f"Embedding 2 response time: {end_time - start_time} seconds")
assert ( assert (
end_time - start_time > 0.1 end_time - start_time > 0.05
) # ensure 2nd response comes in over 0.1s. This should not be cached. ) # ensure 2nd response comes in over 0.05s. This should not be cached.
# test_cache_override() # test_cache_override()

View file

@ -191,6 +191,21 @@ def test_completion_gpt4_turbo():
# test_completion_gpt4_turbo() # test_completion_gpt4_turbo()
def test_completion_gpt4_turbo_0125():
try:
response = completion(
model="gpt-4-0125-preview",
messages=messages,
max_tokens=10,
)
print(response)
except openai.RateLimitError:
print("got a rate liimt error")
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@pytest.mark.skip(reason="this test is flaky") @pytest.mark.skip(reason="this test is flaky")
def test_completion_gpt4_vision(): def test_completion_gpt4_vision():
try: try:
@ -224,7 +239,7 @@ def test_completion_gpt4_vision():
def test_completion_azure_gpt4_vision(): def test_completion_azure_gpt4_vision():
# azure gpt-4 vision takes 5s to respond # azure/gpt-4 vision takes 5 seconds to respond
try: try:
litellm.set_verbose = True litellm.set_verbose = True
response = completion( response = completion(
@ -268,7 +283,7 @@ def test_completion_azure_gpt4_vision():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
test_completion_azure_gpt4_vision() # test_completion_azure_gpt4_vision()
@pytest.mark.skip(reason="this test is flaky") @pytest.mark.skip(reason="this test is flaky")
@ -500,22 +515,22 @@ def hf_test_completion_tgi():
# hf_test_error_logs() # hf_test_error_logs()
def test_completion_cohere(): # commenting for now as the cohere endpoint is being flaky # def test_completion_cohere(): # commenting out for now, as the cohere endpoint is being flaky
try: # try:
litellm.CohereConfig(max_tokens=1000, stop_sequences=["a"]) # litellm.CohereConfig(max_tokens=10, stop_sequences=["a"])
response = completion( # response = completion(
model="command-nightly", messages=messages, logger_fn=logger_fn # model="command-nightly", messages=messages, logger_fn=logger_fn
) # )
# Add any assertions here to check the response # # Add any assertions here to check the response
print(response) # print(response)
response_str = response["choices"][0]["message"]["content"] # response_str = response["choices"][0]["message"]["content"]
response_str_2 = response.choices[0].message.content # response_str_2 = response.choices[0].message.content
if type(response_str) != str: # if type(response_str) != str:
pytest.fail(f"Error occurred: {e}") # pytest.fail(f"Error occurred: {e}")
if type(response_str_2) != str: # if type(response_str_2) != str:
pytest.fail(f"Error occurred: {e}") # pytest.fail(f"Error occurred: {e}")
except Exception as e: # except Exception as e:
pytest.fail(f"Error occurred: {e}") # pytest.fail(f"Error occurred: {e}")
# test_completion_cohere() # test_completion_cohere()
@ -854,7 +869,7 @@ def test_completion_anyscale_with_functions():
def test_completion_azure_key_completion_arg(): def test_completion_azure_key_completion_arg():
# this tests if we can pass api_key to completion, when it's not in the env # this tests if we can pass api_key to completion, when it's not in the env.
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens! # DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
# If you want to remove it, speak to Ishaan! # If you want to remove it, speak to Ishaan!
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this # Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
@ -990,9 +1005,9 @@ def test_azure_openai_ad_token():
print("azure ad token respoonse\n") print("azure ad token respoonse\n")
print(response) print(response)
litellm.input_callback = [] litellm.input_callback = []
except: except Exception as e:
litellm.input_callback = [] litellm.input_callback = []
pass pytest.fail(f"An exception occurs - {str(e)}")
# test_azure_openai_ad_token() # test_azure_openai_ad_token()
@ -1269,6 +1284,8 @@ def test_completion_together_ai():
"Cost for completion call together-computer/llama-2-70b: ", "Cost for completion call together-computer/llama-2-70b: ",
f"${float(cost):.10f}", f"${float(cost):.10f}",
) )
except litellm.Timeout as e:
pass
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@ -1370,16 +1387,22 @@ def test_customprompt_together_ai():
def test_completion_sagemaker(): def test_completion_sagemaker():
try: try:
print("testing sagemaker")
litellm.set_verbose = True litellm.set_verbose = True
print("testing sagemaker")
response = completion( response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4", model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=messages, messages=messages,
temperature=0.2, temperature=0.2,
max_tokens=80, max_tokens=80,
input_cost_per_second=0.000420,
) )
# Add any assertions here to check the response # Add any assertions here to check the response
print(response) print(response)
cost = completion_cost(completion_response=response)
print("calculated cost", cost)
assert (
cost > 0.0 and cost < 1.0
) # should never be > $1 for a single completion call
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@ -1387,6 +1410,36 @@ def test_completion_sagemaker():
# test_completion_sagemaker() # test_completion_sagemaker()
def test_completion_sagemaker_stream():
try:
litellm.set_verbose = False
print("testing sagemaker")
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=messages,
temperature=0.2,
max_tokens=80,
stream=True,
)
complete_streaming_response = ""
first_chunk_id, chunk_id = None, None
for i, chunk in enumerate(response):
print(chunk)
chunk_id = chunk.id
print(chunk_id)
if i == 0:
first_chunk_id = chunk_id
else:
assert chunk_id == first_chunk_id
complete_streaming_response += chunk.choices[0].delta.content or ""
# Add any assertions here to check the response
# print(response)
assert len(complete_streaming_response) > 0
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_chat_sagemaker(): def test_completion_chat_sagemaker():
try: try:
messages = [{"role": "user", "content": "Hey, how's it going?"}] messages = [{"role": "user", "content": "Hey, how's it going?"}]

View file

@ -124,7 +124,7 @@ def test_cost_azure_gpt_35():
) )
test_cost_azure_gpt_35() # test_cost_azure_gpt_35()
def test_cost_azure_embedding(): def test_cost_azure_embedding():
@ -158,3 +158,78 @@ def test_cost_azure_embedding():
# test_cost_azure_embedding() # test_cost_azure_embedding()
def test_cost_openai_image_gen():
cost = litellm.completion_cost(
model="dall-e-2", size="1024-x-1024", quality="standard", n=1
)
assert cost == 0.019922944
def test_cost_bedrock_pricing():
"""
- get pricing specific to region for a model
"""
from litellm import ModelResponse, Choices, Message
from litellm.utils import Usage
litellm.set_verbose = True
input_tokens = litellm.token_counter(
model="bedrock/anthropic.claude-instant-v1",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(f"input_tokens: {input_tokens}")
output_tokens = litellm.token_counter(
model="bedrock/anthropic.claude-instant-v1",
text="It's all going well",
count_response_tokens=True,
)
print(f"output_tokens: {output_tokens}")
resp = ModelResponse(
id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
choices=[
Choices(
finish_reason=None,
index=0,
message=Message(
content="It's all going well",
role="assistant",
),
)
],
created=1700775391,
model="anthropic.claude-instant-v1",
object="chat.completion",
system_fingerprint=None,
usage=Usage(
prompt_tokens=input_tokens,
completion_tokens=output_tokens,
total_tokens=input_tokens + output_tokens,
),
)
resp._hidden_params = {
"custom_llm_provider": "bedrock",
"region_name": "ap-northeast-1",
}
cost = litellm.completion_cost(
model="anthropic.claude-instant-v1",
completion_response=resp,
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
predicted_cost = input_tokens * 0.00000223 + 0.00000755 * output_tokens
assert cost == predicted_cost
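As a quick sanity check on the predicted-cost formula above, with made-up token counts (not what token_counter would return for these messages):

# hypothetical counts: 10 prompt tokens, 6 completion tokens
predicted = 10 * 0.00000223 + 6 * 0.00000755
print(predicted)  # ~6.76e-05, i.e. well under a cent for the call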
def test_cost_bedrock_pricing_actual_calls():
litellm.set_verbose = True
model = "anthropic.claude-instant-v1"
messages = [{"role": "user", "content": "Hey, how's it going?"}]
response = litellm.completion(model=model, messages=messages)
assert response._hidden_params["region_name"] is not None
cost = litellm.completion_cost(
completion_response=response,
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
assert cost > 0

View file

@ -13,4 +13,4 @@ async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
return UserAPIKeyAuth(api_key=api_key) return UserAPIKeyAuth(api_key=api_key)
raise Exception raise Exception
except: except:
raise Exception raise Exception("Failed custom auth")

View file

@ -53,9 +53,9 @@ model_list:
api_key: os.environ/AZURE_API_KEY api_key: os.environ/AZURE_API_KEY
api_version: 2023-07-01-preview api_version: 2023-07-01-preview
model: azure/azure-embedding-model model: azure/azure-embedding-model
model_name: azure-embedding-model
model_info: model_info:
mode: "embedding" mode: embedding
model_name: azure-embedding-model
- litellm_params: - litellm_params:
model: gpt-3.5-turbo model: gpt-3.5-turbo
model_info: model_info:
@ -80,43 +80,49 @@ model_list:
description: this is a test openai model description: this is a test openai model
id: 9b1ef341-322c-410a-8992-903987fef439 id: 9b1ef341-322c-410a-8992-903987fef439
model_name: test_openai_models model_name: test_openai_models
- model_name: amazon-embeddings - litellm_params:
litellm_params: model: bedrock/amazon.titan-embed-text-v1
model: "bedrock/amazon.titan-embed-text-v1"
model_info: model_info:
mode: embedding mode: embedding
- model_name: "GPT-J 6B - Sagemaker Text Embedding (Internal)" model_name: amazon-embeddings
litellm_params: - litellm_params:
model: "sagemaker/berri-benchmarking-gpt-j-6b-fp16" model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
model_info: model_info:
mode: embedding mode: embedding
- model_name: dall-e-3 model_name: GPT-J 6B - Sagemaker Text Embedding (Internal)
litellm_params: - litellm_params:
model: dall-e-3 model: dall-e-3
model_info: model_info:
mode: image_generation mode: image_generation
- model_name: dall-e-3 model_name: dall-e-3
litellm_params: - litellm_params:
model: "azure/dall-e-3-test" api_base: os.environ/AZURE_SWEDEN_API_BASE
api_version: "2023-12-01-preview" api_key: os.environ/AZURE_SWEDEN_API_KEY
api_base: "os.environ/AZURE_SWEDEN_API_BASE" api_version: 2023-12-01-preview
api_key: "os.environ/AZURE_SWEDEN_API_KEY" model: azure/dall-e-3-test
model_info: model_info:
mode: image_generation mode: image_generation
- model_name: dall-e-2 model_name: dall-e-3
litellm_params: - litellm_params:
model: "azure/" api_base: os.environ/AZURE_API_BASE
api_version: "2023-06-01-preview" api_key: os.environ/AZURE_API_KEY
api_base: "os.environ/AZURE_API_BASE" api_version: 2023-06-01-preview
api_key: "os.environ/AZURE_API_KEY" model: azure/
model_info: model_info:
mode: image_generation mode: image_generation
- model_name: text-embedding-ada-002 model_name: dall-e-2
litellm_params: - litellm_params:
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: 2023-07-01-preview
model: azure/azure-embedding-model model: azure/azure-embedding-model
api_base: "os.environ/AZURE_API_BASE"
api_key: "os.environ/AZURE_API_KEY"
api_version: "2023-07-01-preview"
model_info: model_info:
mode: embedding
base_model: text-embedding-ada-002 base_model: text-embedding-ada-002
mode: embedding
model_name: text-embedding-ada-002
- litellm_params:
model: gpt-3.5-turbo
model_info:
description: this is a test openai model
id: 34cb2419-7c63-44ae-a189-53f1d1ce5953
model_name: test_openai_models
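With a config like the one above loaded by the proxy, clients address deployments by their model_name through the proxy's OpenAI-compatible API. A rough sketch, assuming the proxy is reachable at http://localhost:4000 and sk-1234 is a placeholder proxy key:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")  # placeholder endpoint + key

# "azure-embedding-model" is the model_name above, routed to azure/azure-embedding-model
embedding = client.embeddings.create(
    model="azure-embedding-model",
    input=["hello world"],
)
print(len(embedding.data[0].embedding))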

View file

@ -74,6 +74,7 @@ class CompletionCustomHandler(
def log_post_api_call(self, kwargs, response_obj, start_time, end_time): def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
try: try:
print(f"kwargs: {kwargs}")
self.states.append("post_api_call") self.states.append("post_api_call")
## START TIME ## START TIME
assert isinstance(start_time, datetime) assert isinstance(start_time, datetime)
@ -149,7 +150,14 @@ class CompletionCustomHandler(
## END TIME ## END TIME
assert isinstance(end_time, datetime) assert isinstance(end_time, datetime)
## RESPONSE OBJECT ## RESPONSE OBJECT
assert isinstance(response_obj, litellm.ModelResponse) assert isinstance(
response_obj,
(
litellm.ModelResponse,
litellm.EmbeddingResponse,
litellm.ImageResponse,
),
)
## KWARGS ## KWARGS
assert isinstance(kwargs["model"], str) assert isinstance(kwargs["model"], str)
assert isinstance(kwargs["messages"], list) and isinstance( assert isinstance(kwargs["messages"], list) and isinstance(
@ -170,12 +178,14 @@ class CompletionCustomHandler(
) )
assert isinstance(kwargs["additional_args"], (dict, type(None))) assert isinstance(kwargs["additional_args"], (dict, type(None)))
assert isinstance(kwargs["log_event_type"], str) assert isinstance(kwargs["log_event_type"], str)
assert isinstance(kwargs["response_cost"], (float, type(None)))
except: except:
print(f"Assertion Error: {traceback.format_exc()}") print(f"Assertion Error: {traceback.format_exc()}")
self.errors.append(traceback.format_exc()) self.errors.append(traceback.format_exc())
def log_failure_event(self, kwargs, response_obj, start_time, end_time): def log_failure_event(self, kwargs, response_obj, start_time, end_time):
try: try:
print(f"kwargs: {kwargs}")
self.states.append("sync_failure") self.states.append("sync_failure")
## START TIME ## START TIME
assert isinstance(start_time, datetime) assert isinstance(start_time, datetime)
@ -262,6 +272,7 @@ class CompletionCustomHandler(
assert isinstance(kwargs["additional_args"], (dict, type(None))) assert isinstance(kwargs["additional_args"], (dict, type(None)))
assert isinstance(kwargs["log_event_type"], str) assert isinstance(kwargs["log_event_type"], str)
assert kwargs["cache_hit"] is None or isinstance(kwargs["cache_hit"], bool) assert kwargs["cache_hit"] is None or isinstance(kwargs["cache_hit"], bool)
assert isinstance(kwargs["response_cost"], (float, type(None)))
except: except:
print(f"Assertion Error: {traceback.format_exc()}") print(f"Assertion Error: {traceback.format_exc()}")
self.errors.append(traceback.format_exc()) self.errors.append(traceback.format_exc())
@ -545,8 +556,50 @@ async def test_async_chat_bedrock_stream():
# asyncio.run(test_async_chat_bedrock_stream()) # asyncio.run(test_async_chat_bedrock_stream())
## Test Sagemaker + Async
@pytest.mark.asyncio
async def test_async_chat_sagemaker_stream():
try:
customHandler = CompletionCustomHandler()
litellm.callbacks = [customHandler]
response = await litellm.acompletion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}],
)
# test streaming
response = await litellm.acompletion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}],
stream=True,
)
print(f"response: {response}")
async for chunk in response:
print(f"chunk: {chunk}")
continue
## test failure callback
try:
response = await litellm.acompletion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}],
aws_region_name="my-bad-key",
stream=True,
)
async for chunk in response:
continue
except:
pass
time.sleep(1)
print(f"customHandler.errors: {customHandler.errors}")
assert len(customHandler.errors) == 0
litellm.callbacks = []
except Exception as e:
pytest.fail(f"An exception occurred: {str(e)}")
# Text Completion # Text Completion
## Test OpenAI text completion + Async ## Test OpenAI text completion + Async
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_async_text_completion_openai_stream(): async def test_async_text_completion_openai_stream():
@ -585,6 +638,7 @@ async def test_async_text_completion_openai_stream():
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred: {str(e)}") pytest.fail(f"An exception occurred: {str(e)}")
# EMBEDDING # EMBEDDING
## Test OpenAI + Async ## Test OpenAI + Async
@pytest.mark.asyncio @pytest.mark.asyncio
@ -762,6 +816,54 @@ async def test_async_embedding_azure_caching():
assert len(customHandler_caching.states) == 4 # pre, post, success, success assert len(customHandler_caching.states) == 4 # pre, post, success, success
# asyncio.run( # Image Generation
# test_async_embedding_azure_caching()
# )
## Test OpenAI + Sync
def test_image_generation_openai():
try:
customHandler_success = CompletionCustomHandler()
customHandler_failure = CompletionCustomHandler()
# litellm.callbacks = [customHandler_success]
# litellm.set_verbose = True
# response = litellm.image_generation(
# prompt="A cute baby sea otter", model="dall-e-3"
# )
# print(f"response: {response}")
# assert len(response.data) > 0
# print(f"customHandler_success.errors: {customHandler_success.errors}")
# print(f"customHandler_success.states: {customHandler_success.states}")
# assert len(customHandler_success.errors) == 0
# assert len(customHandler_success.states) == 3 # pre, post, success
# test failure callback
litellm.callbacks = [customHandler_failure]
try:
response = litellm.image_generation(
prompt="A cute baby sea otter",
model="dall-e-2",
api_key="my-bad-api-key",
)
except:
pass
print(f"customHandler_failure.errors: {customHandler_failure.errors}")
print(f"customHandler_failure.states: {customHandler_failure.states}")
assert len(customHandler_failure.errors) == 0
assert len(customHandler_failure.states) == 3 # pre, post, failure
except litellm.RateLimitError as e:
pass
except litellm.ContentPolicyViolationError:
pass # OpenAI randomly raises these errors - skip when they occur
except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}")
test_image_generation_openai()
## Test OpenAI + Async
## Test Azure + Sync
## Test Azure + Async

View file

@ -1,32 +1,35 @@
### What this tests #### ### What this tests ####
import sys, os, time, inspect, asyncio, traceback import sys, os, time, inspect, asyncio, traceback
import pytest import pytest
sys.path.insert(0, os.path.abspath('../..'))
sys.path.insert(0, os.path.abspath("../.."))
from litellm import completion, embedding from litellm import completion, embedding
import litellm import litellm
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
class MyCustomHandler(CustomLogger): class MyCustomHandler(CustomLogger):
complete_streaming_response_in_callback = "" complete_streaming_response_in_callback = ""
def __init__(self): def __init__(self):
self.success: bool = False # type: ignore self.success: bool = False # type: ignore
self.failure: bool = False # type: ignore self.failure: bool = False # type: ignore
self.async_success: bool = False # type: ignore self.async_success: bool = False # type: ignore
self.async_success_embedding: bool = False # type: ignore self.async_success_embedding: bool = False # type: ignore
self.async_failure: bool = False # type: ignore self.async_failure: bool = False # type: ignore
self.async_failure_embedding: bool = False # type: ignore self.async_failure_embedding: bool = False # type: ignore
self.async_completion_kwargs = None # type: ignore self.async_completion_kwargs = None # type: ignore
self.async_embedding_kwargs = None # type: ignore self.async_embedding_kwargs = None # type: ignore
self.async_embedding_response = None # type: ignore self.async_embedding_response = None # type: ignore
self.async_completion_kwargs_fail = None # type: ignore self.async_completion_kwargs_fail = None # type: ignore
self.async_embedding_kwargs_fail = None # type: ignore self.async_embedding_kwargs_fail = None # type: ignore
self.stream_collected_response = None # type: ignore self.stream_collected_response = None # type: ignore
self.sync_stream_collected_response = None # type: ignore self.sync_stream_collected_response = None # type: ignore
self.user = None # type: ignore self.user = None # type: ignore
self.data_sent_to_api: dict = {} self.data_sent_to_api: dict = {}
def log_pre_api_call(self, model, messages, kwargs): def log_pre_api_call(self, model, messages, kwargs):
@ -45,7 +48,6 @@ class MyCustomHandler(CustomLogger):
if kwargs.get("stream") == True: if kwargs.get("stream") == True:
self.sync_stream_collected_response = response_obj self.sync_stream_collected_response = response_obj
def log_failure_event(self, kwargs, response_obj, start_time, end_time): def log_failure_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Failure") print(f"On Failure")
self.failure = True self.failure = True
@ -72,14 +74,20 @@ class MyCustomHandler(CustomLogger):
self.async_completion_kwargs_fail = kwargs self.async_completion_kwargs_fail = kwargs
class TmpFunction: class TmpFunction:
complete_streaming_response_in_callback = "" complete_streaming_response_in_callback = ""
async_success: bool = False async_success: bool = False
async def async_test_logging_fn(self, kwargs, completion_obj, start_time, end_time): async def async_test_logging_fn(self, kwargs, completion_obj, start_time, end_time):
print(f"ON ASYNC LOGGING") print(f"ON ASYNC LOGGING")
self.async_success = True self.async_success = True
print(f'kwargs.get("complete_streaming_response"): {kwargs.get("complete_streaming_response")}') print(
self.complete_streaming_response_in_callback = kwargs.get("complete_streaming_response") f'kwargs.get("complete_streaming_response"): {kwargs.get("complete_streaming_response")}'
)
self.complete_streaming_response_in_callback = kwargs.get(
"complete_streaming_response"
)
def test_async_chat_openai_stream(): def test_async_chat_openai_stream():
@ -88,29 +96,39 @@ def test_async_chat_openai_stream():
# litellm.set_verbose = True # litellm.set_verbose = True
litellm.success_callback = [tmp_function.async_test_logging_fn] litellm.success_callback = [tmp_function.async_test_logging_fn]
complete_streaming_response = "" complete_streaming_response = ""
async def call_gpt(): async def call_gpt():
nonlocal complete_streaming_response nonlocal complete_streaming_response
response = await litellm.acompletion(model="gpt-3.5-turbo", response = await litellm.acompletion(
messages=[{ model="gpt-3.5-turbo",
"role": "user", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
"content": "Hi 👋 - i'm openai" stream=True,
}], )
stream=True)
async for chunk in response: async for chunk in response:
complete_streaming_response += chunk["choices"][0]["delta"]["content"] or "" complete_streaming_response += (
chunk["choices"][0]["delta"]["content"] or ""
)
print(complete_streaming_response) print(complete_streaming_response)
asyncio.run(call_gpt()) asyncio.run(call_gpt())
complete_streaming_response = complete_streaming_response.strip("'") complete_streaming_response = complete_streaming_response.strip("'")
response1 = tmp_function.complete_streaming_response_in_callback["choices"][0]["message"]["content"] response1 = tmp_function.complete_streaming_response_in_callback["choices"][0][
"message"
]["content"]
response2 = complete_streaming_response response2 = complete_streaming_response
# assert [ord(c) for c in response1] == [ord(c) for c in response2] # assert [ord(c) for c in response1] == [ord(c) for c in response2]
print(f"response1: {response1}")
print(f"response2: {response2}")
assert response1 == response2 assert response1 == response2
assert tmp_function.async_success == True assert tmp_function.async_success == True
except Exception as e: except Exception as e:
print(e) print(e)
pytest.fail(f"An error occurred - {str(e)}") pytest.fail(f"An error occurred - {str(e)}")
# test_async_chat_openai_stream() # test_async_chat_openai_stream()
def test_completion_azure_stream_moderation_failure(): def test_completion_azure_stream_moderation_failure():
try: try:
customHandler = MyCustomHandler() customHandler = MyCustomHandler()
@ -152,27 +170,32 @@ def test_async_custom_handler_stream():
}, },
] ]
complete_streaming_response = "" complete_streaming_response = ""
async def test_1(): async def test_1():
nonlocal complete_streaming_response nonlocal complete_streaming_response
response = await litellm.acompletion( response = await litellm.acompletion(
model="azure/chatgpt-v-2", model="azure/chatgpt-v-2", messages=messages, stream=True
messages=messages,
stream=True
) )
async for chunk in response: async for chunk in response:
complete_streaming_response += chunk["choices"][0]["delta"]["content"] or "" complete_streaming_response += (
chunk["choices"][0]["delta"]["content"] or ""
)
print(complete_streaming_response) print(complete_streaming_response)
asyncio.run(test_1()) asyncio.run(test_1())
response_in_success_handler = customHandler2.stream_collected_response response_in_success_handler = customHandler2.stream_collected_response
response_in_success_handler = response_in_success_handler["choices"][0]["message"]["content"] response_in_success_handler = response_in_success_handler["choices"][0][
"message"
]["content"]
print("\n\n") print("\n\n")
print("response_in_success_handler: ", response_in_success_handler) print("response_in_success_handler: ", response_in_success_handler)
print("complete_streaming_response: ", complete_streaming_response) print("complete_streaming_response: ", complete_streaming_response)
assert response_in_success_handler == complete_streaming_response assert response_in_success_handler == complete_streaming_response
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
# test_async_custom_handler_stream() # test_async_custom_handler_stream()
@ -183,28 +206,28 @@ def test_azure_completion_stream():
# checks if the model response available in the async + stream callbacks is equal to the received response # checks if the model response available in the async + stream callbacks is equal to the received response
customHandler2 = MyCustomHandler() customHandler2 = MyCustomHandler()
litellm.callbacks = [customHandler2] litellm.callbacks = [customHandler2]
litellm.set_verbose = False litellm.set_verbose = True
messages = [ messages = [
{"role": "system", "content": "You are a helpful assistant."}, {"role": "system", "content": "You are a helpful assistant."},
{ {
"role": "user", "role": "user",
"content": "write 1 sentence about litellm being amazing", "content": f"write 1 sentence about litellm being amazing {time.time()}",
}, },
] ]
complete_streaming_response = "" complete_streaming_response = ""
response = litellm.completion( response = litellm.completion(
model="azure/chatgpt-v-2", model="azure/chatgpt-v-2", messages=messages, stream=True
messages=messages,
stream=True
) )
for chunk in response: for chunk in response:
complete_streaming_response += chunk["choices"][0]["delta"]["content"] or "" complete_streaming_response += chunk["choices"][0]["delta"]["content"] or ""
print(complete_streaming_response) print(complete_streaming_response)
time.sleep(0.5) # wait 1/2 second before checking callbacks time.sleep(0.5) # wait 1/2 second before checking callbacks
response_in_success_handler = customHandler2.sync_stream_collected_response response_in_success_handler = customHandler2.sync_stream_collected_response
response_in_success_handler = response_in_success_handler["choices"][0]["message"]["content"] response_in_success_handler = response_in_success_handler["choices"][0][
"message"
]["content"]
print("\n\n") print("\n\n")
print("response_in_success_handler: ", response_in_success_handler) print("response_in_success_handler: ", response_in_success_handler)
print("complete_streaming_response: ", complete_streaming_response) print("complete_streaming_response: ", complete_streaming_response)
@ -212,6 +235,7 @@ def test_azure_completion_stream():
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
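Both stream tests above compare the response rebuilt from received chunks against whatever the callback stored. A minimal sketch of the kind of handler they assume (the real MyCustomHandler is defined earlier in this test file; treating litellm's `complete_streaming_response` kwarg as the stitched-together response is an assumption):

from litellm.integrations.custom_logger import CustomLogger

class StreamCollectingHandler(CustomLogger):
    """Hypothetical stand-in for MyCustomHandler: stores the rebuilt streaming response."""

    def __init__(self):
        self.sync_stream_collected_response = None
        self.stream_collected_response = None

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        if kwargs.get("stream") is True:
            # assumed: litellm attaches the reassembled response under this key for streams
            self.sync_stream_collected_response = kwargs.get("complete_streaming_response")

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        if kwargs.get("stream") is True:
            self.stream_collected_response = kwargs.get("complete_streaming_response")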
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_async_custom_handler_completion(): async def test_async_custom_handler_completion():
try: try:
@ -221,15 +245,22 @@ async def test_async_custom_handler_completion():
assert customHandler_success.async_success == False assert customHandler_success.async_success == False
litellm.callbacks = [customHandler_success] litellm.callbacks = [customHandler_success]
response = await litellm.acompletion( response = await litellm.acompletion(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=[{ messages=[
{
"role": "user", "role": "user",
"content": "hello from litellm test", "content": "hello from litellm test",
}] }
) ],
)
await asyncio.sleep(1) await asyncio.sleep(1)
assert customHandler_success.async_success == True, "async success is not set to True even after success" assert (
assert customHandler_success.async_completion_kwargs.get("model") == "gpt-3.5-turbo" customHandler_success.async_success == True
), "async success is not set to True even after success"
assert (
customHandler_success.async_completion_kwargs.get("model")
== "gpt-3.5-turbo"
)
# failure # failure
litellm.callbacks = [customHandler_failure] litellm.callbacks = [customHandler_failure]
messages = [ messages = [
@ -243,21 +274,34 @@ async def test_async_custom_handler_completion():
assert customHandler_failure.async_failure == False assert customHandler_failure.async_failure == False
try: try:
response = await litellm.acompletion( response = await litellm.acompletion(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=messages, messages=messages,
api_key="my-bad-key", api_key="my-bad-key",
) )
except: except:
pass pass
assert customHandler_failure.async_failure == True, "async failure is not set to True even after failure" assert (
assert customHandler_failure.async_completion_kwargs_fail.get("model") == "gpt-3.5-turbo" customHandler_failure.async_failure == True
assert len(str(customHandler_failure.async_completion_kwargs_fail.get("exception"))) > 10 # expect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119 ), "async failure is not set to True even after failure"
assert (
customHandler_failure.async_completion_kwargs_fail.get("model")
== "gpt-3.5-turbo"
)
assert (
len(
str(customHandler_failure.async_completion_kwargs_fail.get("exception"))
)
> 10
) # expect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119
litellm.callbacks = [] litellm.callbacks = []
print("Passed setting async failure") print("Passed setting async failure")
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}") pytest.fail(f"An exception occurred - {str(e)}")
# asyncio.run(test_async_custom_handler_completion()) # asyncio.run(test_async_custom_handler_completion())
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_async_custom_handler_embedding(): async def test_async_custom_handler_embedding():
try: try:
@ -266,31 +310,54 @@ async def test_async_custom_handler_embedding():
# success # success
assert customHandler_embedding.async_success_embedding == False assert customHandler_embedding.async_success_embedding == False
response = await litellm.aembedding( response = await litellm.aembedding(
model="text-embedding-ada-002", model="text-embedding-ada-002",
input = ["hello world"], input=["hello world"],
) )
await asyncio.sleep(1) await asyncio.sleep(1)
assert customHandler_embedding.async_success_embedding == True, "async_success_embedding is not set to True even after success" assert (
assert customHandler_embedding.async_embedding_kwargs.get("model") == "text-embedding-ada-002" customHandler_embedding.async_success_embedding == True
assert customHandler_embedding.async_embedding_response["usage"]["prompt_tokens"] ==2 ), "async_success_embedding is not set to True even after success"
assert (
customHandler_embedding.async_embedding_kwargs.get("model")
== "text-embedding-ada-002"
)
assert (
customHandler_embedding.async_embedding_response["usage"]["prompt_tokens"]
== 2
)
print("Passed setting async success: Embedding") print("Passed setting async success: Embedding")
# failure # failure
assert customHandler_embedding.async_failure_embedding == False assert customHandler_embedding.async_failure_embedding == False
try: try:
response = await litellm.aembedding( response = await litellm.aembedding(
model="text-embedding-ada-002", model="text-embedding-ada-002",
input = ["hello world"], input=["hello world"],
api_key="my-bad-key", api_key="my-bad-key",
) )
except: except:
pass pass
assert customHandler_embedding.async_failure_embedding == True, "async failure embedding is not set to True even after failure" assert (
assert customHandler_embedding.async_embedding_kwargs_fail.get("model") == "text-embedding-ada-002" customHandler_embedding.async_failure_embedding == True
assert len(str(customHandler_embedding.async_embedding_kwargs_fail.get("exception"))) > 10 # exppect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119 ), "async failure embedding is not set to True even after failure"
assert (
customHandler_embedding.async_embedding_kwargs_fail.get("model")
== "text-embedding-ada-002"
)
assert (
len(
str(
customHandler_embedding.async_embedding_kwargs_fail.get("exception")
)
)
> 10
) # expect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}") pytest.fail(f"An exception occurred - {str(e)}")
# asyncio.run(test_async_custom_handler_embedding()) # asyncio.run(test_async_custom_handler_embedding())
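The embedding failure assertions above rely on the handler capturing the failed call's kwargs. A rough sketch of that side of the handler, assuming CustomLogger's async failure hook and an "aembedding" call_type (both are assumptions; the real handler may filter differently):

from litellm.integrations.custom_logger import CustomLogger

class EmbeddingFailureHandler(CustomLogger):
    """Hypothetical stand-in: records kwargs from failed embedding calls for later asserts."""

    def __init__(self):
        self.async_failure_embedding = False
        self.async_embedding_kwargs_fail = None

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        if kwargs.get("call_type") == "aembedding":  # assumed call_type label
            self.async_failure_embedding = True
            self.async_embedding_kwargs_fail = kwargs  # includes "model" and "exception" per the asserts above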
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_async_custom_handler_embedding_optional_param(): async def test_async_custom_handler_embedding_optional_param():
""" """
@ -300,16 +367,19 @@ async def test_async_custom_handler_embedding_optional_param():
customHandler_optional_params = MyCustomHandler() customHandler_optional_params = MyCustomHandler()
litellm.callbacks = [customHandler_optional_params] litellm.callbacks = [customHandler_optional_params]
response = await litellm.aembedding( response = await litellm.aembedding(
model="azure/azure-embedding-model", model="azure/azure-embedding-model", input=["hello world"], user="John"
input = ["hello world"], )
user = "John" await asyncio.sleep(1) # success callback is async
)
await asyncio.sleep(1) # success callback is async
assert customHandler_optional_params.user == "John" assert customHandler_optional_params.user == "John"
assert customHandler_optional_params.user == customHandler_optional_params.data_sent_to_api["user"] assert (
customHandler_optional_params.user
== customHandler_optional_params.data_sent_to_api["user"]
)
# asyncio.run(test_async_custom_handler_embedding_optional_param()) # asyncio.run(test_async_custom_handler_embedding_optional_param())
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_async_custom_handler_embedding_optional_param_bedrock(): async def test_async_custom_handler_embedding_optional_param_bedrock():
""" """
@ -323,42 +393,68 @@ async def test_async_custom_handler_embedding_optional_param_bedrock():
customHandler_optional_params = MyCustomHandler() customHandler_optional_params = MyCustomHandler()
litellm.callbacks = [customHandler_optional_params] litellm.callbacks = [customHandler_optional_params]
response = await litellm.aembedding( response = await litellm.aembedding(
model="bedrock/amazon.titan-embed-text-v1", model="bedrock/amazon.titan-embed-text-v1", input=["hello world"], user="John"
input = ["hello world"], )
user = "John" await asyncio.sleep(1) # success callback is async
)
await asyncio.sleep(1) # success callback is async
assert customHandler_optional_params.user == "John" assert customHandler_optional_params.user == "John"
assert "user" not in customHandler_optional_params.data_sent_to_api assert "user" not in customHandler_optional_params.data_sent_to_api
def test_redis_cache_completion_stream(): def test_redis_cache_completion_stream():
from litellm import Cache from litellm import Cache
# Important Test - This tests if we can add to streaming cache, when custom callbacks are set # Important Test - This tests if we can add to streaming cache, when custom callbacks are set
import random import random
try: try:
print("\nrunning test_redis_cache_completion_stream") print("\nrunning test_redis_cache_completion_stream")
litellm.set_verbose = True litellm.set_verbose = True
random_number = random.randint(1, 100000) # add a random number to ensure it's always adding / reading from cache random_number = random.randint(
messages = [{"role": "user", "content": f"write a one sentence poem about: {random_number}"}] 1, 100000
litellm.cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD']) ) # add a random number to ensure it's always adding / reading from cache
messages = [
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
}
]
litellm.cache = Cache(
type="redis",
host=os.environ["REDIS_HOST"],
port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"],
)
print("test for caching, streaming + completion") print("test for caching, streaming + completion")
response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=40, temperature=0.2, stream=True) response1 = completion(
model="gpt-3.5-turbo",
messages=messages,
max_tokens=40,
temperature=0.2,
stream=True,
)
response_1_content = "" response_1_content = ""
for chunk in response1: for chunk in response1:
print(chunk) print(chunk)
response_1_content += chunk.choices[0].delta.content or "" response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content) print(response_1_content)
time.sleep(0.1) # sleep for 0.1 seconds to allow the cache set to occur time.sleep(0.1) # sleep for 0.1 seconds to allow the cache set to occur
response2 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=40, temperature=0.2, stream=True) response2 = completion(
model="gpt-3.5-turbo",
messages=messages,
max_tokens=40,
temperature=0.2,
stream=True,
)
response_2_content = "" response_2_content = ""
for chunk in response2: for chunk in response2:
print(chunk) print(chunk)
response_2_content += chunk.choices[0].delta.content or "" response_2_content += chunk.choices[0].delta.content or ""
print("\nresponse 1", response_1_content) print("\nresponse 1", response_1_content)
print("\nresponse 2", response_2_content) print("\nresponse 2", response_2_content)
assert response_1_content == response_2_content, f"Response 1 != Response 2. Same params, Response 1{response_1_content} != Response 2{response_2_content}" assert (
response_1_content == response_2_content
), f"Response 1 != Response 2. Same params, Response 1{response_1_content} != Response 2{response_2_content}"
litellm.success_callback = [] litellm.success_callback = []
litellm._async_success_callback = [] litellm._async_success_callback = []
litellm.cache = None litellm.cache = None
@ -366,4 +462,6 @@ def test_redis_cache_completion_stream():
print(e) print(e)
litellm.success_callback = [] litellm.success_callback = []
raise e raise e
# test_redis_cache_completion_stream() # test_redis_cache_completion_stream()
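The caching test above hinges on two things: a process-wide litellm.cache pointing at Redis, and a prompt that is unique per run so the first call is a guaranteed miss. A condensed sketch of that flow (same env var names as the test):

import os, random, litellm
from litellm import Cache, completion

litellm.cache = Cache(
    type="redis",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
)

# unique prompt -> first call misses and populates the cache, second identical call should hit it
prompt = f"write a one sentence poem about: {random.randint(1, 100000)}"
messages = [{"role": "user", "content": prompt}]

def collect(stream):
    return "".join(chunk.choices[0].delta.content or "" for chunk in stream)

first = collect(completion(model="gpt-3.5-turbo", messages=messages, stream=True))
second = collect(completion(model="gpt-3.5-turbo", messages=messages, stream=True))
assert first == second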
View file
@ -33,6 +33,7 @@ def pre_request():
import re import re
@pytest.mark.skip
def verify_log_file(log_file_path): def verify_log_file(log_file_path):
with open(log_file_path, "r") as log_file: with open(log_file_path, "r") as log_file:
log_content = log_file.read() log_content = log_file.read()
@ -123,7 +124,7 @@ def test_dynamo_logging():
sys.stdout = original_stdout sys.stdout = original_stdout
# Close the file # Close the file
log_file.close() log_file.close()
verify_log_file(file_name) # verify_log_file(file_name)
print("Passed! Testing async dynamoDB logging") print("Passed! Testing async dynamoDB logging")
View file
@ -10,7 +10,7 @@ sys.path.insert(
0, os.path.abspath("../..") 0, os.path.abspath("../..")
) # Adds the parent directory to the system path ) # Adds the parent directory to the system path
import litellm import litellm
from litellm import embedding, completion from litellm import embedding, completion, completion_cost
litellm.set_verbose = False litellm.set_verbose = False
@ -57,6 +57,48 @@ def test_openai_embedding():
# test_openai_embedding() # test_openai_embedding()
def test_openai_embedding_3():
try:
litellm.set_verbose = True
response = embedding(
model="text-embedding-3-small",
input=["good morning from litellm", "this is another item"],
metadata={"anything": "good day"},
dimensions=5,
)
print(f"response:", response)
litellm_response = dict(response)
litellm_response_keys = set(litellm_response.keys())
litellm_response_keys.discard("_response_ms")
print(litellm_response_keys)
print("LiteLLM Response\n")
# print(litellm_response)
# same request with OpenAI 1.0+
import openai
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
response = client.embeddings.create(
model="text-embedding-3-small",
input=["good morning from litellm", "this is another item"],
dimensions=5,
)
response = dict(response)
openai_response_keys = set(response.keys())
print(openai_response_keys)
assert (
litellm_response_keys == openai_response_keys
) # ENSURE the Keys in litellm response is exactly what the openai package returns
assert (
len(litellm_response["data"]) == 2
) # expect two embedding responses from litellm_response since input had two
print(openai_response_keys)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
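A note on the new test above: `dimensions` is the OpenAI text-embedding-3 parameter for truncated output vectors, and litellm is expected to pass it through unchanged. A minimal sketch, assuming an OpenAI-compatible response shape:

import litellm

resp = litellm.embedding(
    model="text-embedding-3-small",
    input=["good morning from litellm"],
    dimensions=5,
)
# assumed OpenAI-compatible shape: resp.data is a list of items carrying an "embedding" vector
assert len(resp.data[0]["embedding"]) == 5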
def test_openai_azure_embedding_simple(): def test_openai_azure_embedding_simple():
try: try:
litellm.set_verbose = True litellm.set_verbose = True
@ -186,7 +228,7 @@ def test_cohere_embedding3():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
test_cohere_embedding3() # test_cohere_embedding3()
def test_bedrock_embedding_titan(): def test_bedrock_embedding_titan():
@ -341,8 +383,30 @@ def test_sagemaker_embeddings():
response = litellm.embedding( response = litellm.embedding(
model="sagemaker/berri-benchmarking-gpt-j-6b-fp16", model="sagemaker/berri-benchmarking-gpt-j-6b-fp16",
input=["good morning from litellm", "this is another item"], input=["good morning from litellm", "this is another item"],
input_cost_per_second=0.000420,
) )
print(f"response: {response}") print(f"response: {response}")
cost = completion_cost(completion_response=response)
assert (
cost > 0.0 and cost < 1.0
) # should never be > $1 for a single embedding call
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@pytest.mark.asyncio
async def test_sagemaker_aembeddings():
try:
response = await litellm.aembedding(
model="sagemaker/berri-benchmarking-gpt-j-6b-fp16",
input=["good morning from litellm", "this is another item"],
input_cost_per_second=0.000420,
)
print(f"response: {response}")
cost = completion_cost(completion_response=response)
assert (
cost > 0.0 and cost < 1.0
) # should never be > $1 for a single embedding call
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
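The sagemaker tests above pass `input_cost_per_second` because self-hosted endpoints have no entry in litellm's price map; the cost is then assumed to be derived from the measured call duration times that rate, which `completion_cost` reads back off the response. A hedged sketch of that usage:

import litellm
from litellm import completion_cost

response = litellm.embedding(
    model="sagemaker/berri-benchmarking-gpt-j-6b-fp16",
    input=["good morning from litellm"],
    input_cost_per_second=0.000420,  # assumed: billed as call duration (s) * this rate, since no per-token price exists
)
cost = completion_cost(completion_response=response)
assert 0.0 < cost < 1.0  # sanity bound, mirroring the tests above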
View file
@ -2,7 +2,7 @@ from openai import AuthenticationError, BadRequestError, RateLimitError, OpenAIE
import os import os
import sys import sys
import traceback import traceback
import subprocess import subprocess, asyncio
sys.path.insert( sys.path.insert(
0, os.path.abspath("../..") 0, os.path.abspath("../..")
@ -378,6 +378,74 @@ def test_content_policy_exceptionimage_generation_openai():
# test_content_policy_exceptionimage_generation_openai() # test_content_policy_exceptionimage_generation_openai()
def test_content_policy_violation_error_streaming():
"""
Production Test.
"""
litellm.set_verbose = False
print("test_async_completion with stream")
async def test_get_response():
try:
response = await litellm.acompletion(
model="azure/chatgpt-v-2",
messages=[{"role": "user", "content": "say 1"}],
temperature=0,
top_p=1,
stream=True,
max_tokens=512,
presence_penalty=0,
frequency_penalty=0,
)
print(f"response: {response}")
num_finish_reason = 0
async for chunk in response:
print(chunk)
if chunk["choices"][0].get("finish_reason") is not None:
num_finish_reason += 1
print("finish_reason", chunk["choices"][0].get("finish_reason"))
assert (
num_finish_reason == 1
), f"expected only one finish reason. Got {num_finish_reason}"
except Exception as e:
pytest.fail(f"Got exception in streaming: {e}")
asyncio.run(test_get_response())
async def test_get_error():
try:
response = await litellm.acompletion(
model="azure/chatgpt-v-2",
messages=[
{"role": "user", "content": "where do i buy lethal drugs from"}
],
temperature=0,
top_p=1,
stream=True,
max_tokens=512,
presence_penalty=0,
frequency_penalty=0,
)
print(f"response: {response}")
num_finish_reason = 0
async for chunk in response:
print(chunk)
if chunk["choices"][0].get("finish_reason") is not None:
num_finish_reason += 1
print("finish_reason", chunk["choices"][0].get("finish_reason"))
pytest.fail("Expected a 400 error to be raised during streaming")
except Exception as e:
pass
asyncio.run(test_get_error())
# tesy_async_acompletion()
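The new streaming test above asserts two behaviors: a benign prompt yields exactly one finish_reason chunk, and a disallowed prompt surfaces an error mid-stream instead of completing. A sketch of the error side, assuming the Azure content filter is surfaced as a 400-style exception while iterating (the exact exception class is an assumption here):

import litellm, pytest

async def expect_content_filter_error(messages):
    try:
        response = await litellm.acompletion(
            model="azure/chatgpt-v-2", messages=messages, stream=True, max_tokens=512
        )
        async for _ in response:  # the filter error is raised while consuming the stream
            pass
    except Exception as e:  # assumed to be a 400/BadRequest-style content policy error
        return e
    pytest.fail("expected the content filter to reject this prompt")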
# # test_invalid_request_error(model="command-nightly") # # test_invalid_request_error(model="command-nightly")
# # Test 3: Rate Limit Errors # # Test 3: Rate Limit Errors
# def test_model_call(model): # def test_model_call(model):
View file
@ -25,10 +25,15 @@ sys.path.insert(
) # Adds the parent directory to the system path ) # Adds the parent directory to the system path
import pytest, logging, asyncio import pytest, logging, asyncio
import litellm, asyncio import litellm, asyncio
from litellm.proxy.proxy_server import new_user, user_api_key_auth, user_update from litellm.proxy.proxy_server import (
new_user,
user_api_key_auth,
user_update,
generate_key_fn,
)
from litellm.proxy._types import NewUserRequest, DynamoDBArgs from litellm.proxy._types import NewUserRequest, DynamoDBArgs, GenerateKeyRequest
from litellm.proxy.utils import DBClient from litellm.proxy.utils import DBClient, hash_token
from starlette.datastructures import URL from starlette.datastructures import URL
@ -104,13 +109,17 @@ def test_call_with_invalid_key(custom_db_client):
asyncio.run(test()) asyncio.run(test())
except Exception as e: except Exception as e:
print("Got Exception", e) print("Got Exception", e)
print(e.detail) print(e.message)
assert "Authentication Error" in e.detail assert "Authentication Error" in e.message
pass pass
def test_call_with_invalid_model(custom_db_client): def test_call_with_invalid_model(custom_db_client):
# 3. Make a call to a key with an invalid model - expect to fail # 3. Make a call to a key with an invalid model - expect to fail
from litellm._logging import verbose_proxy_logger
import logging
verbose_proxy_logger.setLevel(logging.DEBUG)
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client) setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234") setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
try: try:
@ -138,7 +147,7 @@ def test_call_with_invalid_model(custom_db_client):
asyncio.run(test()) asyncio.run(test())
except Exception as e: except Exception as e:
assert ( assert (
e.detail e.message
== "Authentication Error, API Key not allowed to access model. This token can only access models=['mistral']. Tried to access gemini-pro-vision" == "Authentication Error, API Key not allowed to access model. This token can only access models=['mistral']. Tried to access gemini-pro-vision"
) )
pass pass
@ -175,10 +184,16 @@ def test_call_with_valid_model(custom_db_client):
pytest.fail(f"An exception occurred - {str(e)}") pytest.fail(f"An exception occurred - {str(e)}")
def test_call_with_key_over_budget(custom_db_client): def test_call_with_user_over_budget(custom_db_client):
# 5. Make a call with a key over budget, expect to fail # 5. Make a call with a key over budget, expect to fail
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client) setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234") setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
from litellm._logging import verbose_proxy_logger, verbose_logger
import logging
litellm.set_verbose = True
verbose_logger.setLevel(logging.DEBUG)
verbose_proxy_logger.setLevel(logging.DEBUG)
try: try:
async def test(): async def test():
@ -221,10 +236,11 @@ def test_call_with_key_over_budget(custom_db_client):
"stream": False, "stream": False,
"litellm_params": { "litellm_params": {
"metadata": { "metadata": {
"user_api_key": generated_key, "user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id, "user_api_key_user_id": user_id,
} }
}, },
"response_cost": 0.00002,
}, },
completion_response=resp, completion_response=resp,
) )
@ -236,12 +252,12 @@ def test_call_with_key_over_budget(custom_db_client):
asyncio.run(test()) asyncio.run(test())
except Exception as e: except Exception as e:
error_detail = e.detail error_detail = e.message
assert "Authentication Error, ExceededBudget:" in error_detail assert "Authentication Error, ExceededBudget:" in error_detail
print(vars(e)) print(vars(e))
def test_call_with_key_over_budget_stream(custom_db_client): def test_call_with_user_over_budget_stream(custom_db_client):
# 6. Make a call with a key over budget, expect to fail # 6. Make a call with a key over budget, expect to fail
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client) setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234") setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
@ -293,10 +309,11 @@ def test_call_with_key_over_budget_stream(custom_db_client):
"complete_streaming_response": resp, "complete_streaming_response": resp,
"litellm_params": { "litellm_params": {
"metadata": { "metadata": {
"user_api_key": generated_key, "user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id, "user_api_key_user_id": user_id,
} }
}, },
"response_cost": 0.00002,
}, },
completion_response=ModelResponse(), completion_response=ModelResponse(),
) )
@ -308,6 +325,179 @@ def test_call_with_key_over_budget_stream(custom_db_client):
asyncio.run(test()) asyncio.run(test())
except Exception as e: except Exception as e:
error_detail = e.detail error_detail = e.message
assert "Authentication Error, ExceededBudget:" in error_detail assert "Authentication Error, ExceededBudget:" in error_detail
print(vars(e)) print(vars(e))
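One detail worth calling out in the updated budget tests: the metadata now carries hash_token(generated_key) instead of the raw key, matching the proxy's move to storing only hashed tokens, and the spend is taken from a pre-computed response_cost field rather than recomputed. A sketch of the kwargs shape these tests feed to track_cost_callback (field names are taken from the tests; the hash algorithm itself is an assumption):

from litellm.proxy.utils import hash_token

generated_key = "sk-hypothetical-key"  # illustrative only
kwargs = {
    "stream": False,
    "litellm_params": {
        "metadata": {
            "user_api_key": hash_token(generated_key),  # proxy looks spend up by the hashed token
            "user_api_key_user_id": "some-user-id",
        }
    },
    "response_cost": 0.00002,  # assumed to be attached upstream by litellm's logging layer
}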
def test_call_with_user_key_budget(custom_db_client):
# 7. Make a call with a key over budget, expect to fail
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
from litellm._logging import verbose_proxy_logger
import logging
verbose_proxy_logger.setLevel(logging.DEBUG)
try:
async def test():
request = GenerateKeyRequest(max_budget=0.00001)
key = await generate_key_fn(request)
print(key)
generated_key = key.key
user_id = key.user_id
bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
# update spend using track_cost callback, make 2nd request, it should fail
from litellm.proxy.proxy_server import track_cost_callback
from litellm import ModelResponse, Choices, Message, Usage
resp = ModelResponse(
id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
choices=[
Choices(
finish_reason=None,
index=0,
message=Message(
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
role="assistant",
),
)
],
model="gpt-35-turbo", # azure always has model written like this
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
)
await track_cost_callback(
kwargs={
"stream": False,
"litellm_params": {
"metadata": {
"user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id,
}
},
"response_cost": 0.00002,
},
completion_response=resp,
)
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
pytest.fail(f"This should have failed! The key crossed its budget")
asyncio.run(test())
except Exception as e:
error_detail = e.message
assert "Authentication Error, ExceededTokenBudget:" in error_detail
print(vars(e))
def test_call_with_key_over_budget_stream(custom_db_client):
# 8. Make a call with a key over budget, expect to fail
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
from litellm._logging import verbose_proxy_logger
import logging
litellm.set_verbose = True
verbose_proxy_logger.setLevel(logging.DEBUG)
try:
async def test():
request = GenerateKeyRequest(max_budget=0.00001)
key = await generate_key_fn(request)
print(key)
generated_key = key.key
user_id = key.user_id
bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
# update spend using track_cost callback, make 2nd request, it should fail
from litellm.proxy.proxy_server import track_cost_callback
from litellm import ModelResponse, Choices, Message, Usage
resp = ModelResponse(
id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
choices=[
Choices(
finish_reason=None,
index=0,
message=Message(
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
role="assistant",
),
)
],
model="gpt-35-turbo", # azure always has model written like this
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
)
await track_cost_callback(
kwargs={
"stream": True,
"complete_streaming_response": resp,
"litellm_params": {
"metadata": {
"user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id,
}
},
"response_cost": 0.00002,
},
completion_response=ModelResponse(),
)
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
pytest.fail(f"This should have failed! The key crossed its budget")
asyncio.run(test())
except Exception as e:
error_detail = e.message
assert "Authentication Error, ExceededTokenBudget:" in error_detail
print(vars(e))
def test_dynamo_db_migration(custom_db_client):
# Tests the temporary patch we have in place
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
setattr(litellm.proxy.proxy_server, "user_custom_auth", None)
try:
async def test():
bearer_token = (
"Bearer " + "sk-elJDL2pOEjcAuC7zD4psAg"
) # this works with ishaan's db, it's a never expiring key
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
async def return_body():
return b'{"model": "azure-models"}'
request.body = return_body
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
asyncio.run(test())
except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}")
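The dynamo migration test above shows the pattern used throughout these proxy tests for exercising user_api_key_auth without a real HTTP server: build a bare Request, point _url at the route, and patch body() so the auth code can read a posted model. Condensed:

from fastapi import Request
from starlette.datastructures import URL

request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")

async def return_body():
    return b'{"model": "azure-models"}'

request.body = return_body  # user_api_key_auth awaits request.body() to see which model was requested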
View file
@ -3,11 +3,17 @@
# 2. Make a call with invalid key, expect it to fail # 2. Make a call with invalid key, expect it to fail
# 3. Make a call to a key with invalid model - expect to fail # 3. Make a call to a key with invalid model - expect to fail
# 4. Make a call to a key with valid model - expect to pass # 4. Make a call to a key with valid model - expect to pass
# 5. Make a call with key over budget, expect to fail # 5. Make a call with user over budget, expect to fail
# 6. Make a streaming chat/completions call with key over budget, expect to fail # 6. Make a streaming chat/completions call with user over budget, expect to fail
# 7. Make a call with a key that never expires, expect to pass # 7. Make a call with a key that never expires, expect to pass
# 8. Make a call with an expired key, expect to fail # 8. Make a call with an expired key, expect to fail
# 9. Delete a Key # 9. Delete a Key
# 10. Generate a key, call key/info. Assert info returned is the same as generated key info
# 11. Generate a Key, call key/info, call key/update, call key/info
# 12. Make a call with key over budget, expect to fail
# 14. Make a streaming chat/completions call with key over budget, expect to fail
# 15. Generate key, when `allow_user_auth`=False - check if `/key/info` returns key_name=null
# 16. Generate key, when `allow_user_auth`=True - check if `/key/info` returns key_name=sk...<last-4-digits>
# function to call to generate key - async def new_user(data: NewUserRequest): # function to call to generate key - async def new_user(data: NewUserRequest):
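For readers skimming the new cases: new_user creates a user plus key from a NewUserRequest, while generate_key_fn issues a key from a GenerateKeyRequest; both return an object whose .key becomes the bearer token the tests authenticate with. A minimal sketch:

from litellm.proxy.proxy_server import new_user, generate_key_fn
from litellm.proxy._types import NewUserRequest, GenerateKeyRequest

async def make_keys():
    user_key = await new_user(NewUserRequest(max_budget=0.00001))
    standalone_key = await generate_key_fn(GenerateKeyRequest(max_budget=0.00001))
    return "Bearer " + user_key.key, "Bearer " + standalone_key.key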
@ -17,9 +23,10 @@ import sys, os
import traceback import traceback
from dotenv import load_dotenv from dotenv import load_dotenv
from fastapi import Request from fastapi import Request
from datetime import datetime
load_dotenv() load_dotenv()
import os, io import os, io, time
# this file is to test litellm/proxy # this file is to test litellm/proxy
@ -30,16 +37,30 @@ import pytest, logging, asyncio
import litellm, asyncio import litellm, asyncio
from litellm.proxy.proxy_server import ( from litellm.proxy.proxy_server import (
new_user, new_user,
generate_key_fn,
user_api_key_auth, user_api_key_auth,
user_update, user_update,
delete_key_fn, delete_key_fn,
info_key_fn,
update_key_fn,
generate_key_fn,
spend_user_fn,
spend_key_fn,
view_spend_logs,
) )
from litellm.proxy.utils import PrismaClient, ProxyLogging from litellm.proxy.utils import PrismaClient, ProxyLogging, hash_token
from litellm._logging import verbose_proxy_logger from litellm._logging import verbose_proxy_logger
verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_proxy_logger.setLevel(level=logging.DEBUG)
from litellm.proxy._types import NewUserRequest, DynamoDBArgs, DeleteKeyRequest from litellm.proxy._types import (
NewUserRequest,
GenerateKeyRequest,
DynamoDBArgs,
DeleteKeyRequest,
UpdateKeyRequest,
GenerateKeyRequest,
)
from litellm.proxy.utils import DBClient from litellm.proxy.utils import DBClient
from starlette.datastructures import URL from starlette.datastructures import URL
from litellm.caching import DualCache from litellm.caching import DualCache
@ -64,6 +85,10 @@ def prisma_client():
# Reset litellm.proxy.proxy_server.prisma_client to None # Reset litellm.proxy.proxy_server.prisma_client to None
litellm.proxy.proxy_server.custom_db_client = None litellm.proxy.proxy_server.custom_db_client = None
litellm.proxy.proxy_server.litellm_proxy_budget_name = (
f"litellm-proxy-budget-{time.time()}"
)
litellm.proxy.proxy_server.user_custom_key_generate = None
return prisma_client return prisma_client
@ -120,8 +145,8 @@ def test_call_with_invalid_key(prisma_client):
asyncio.run(test()) asyncio.run(test())
except Exception as e: except Exception as e:
print("Got Exception", e) print("Got Exception", e)
print(e.detail) print(e.message)
assert "Authentication Error" in e.detail assert "Authentication Error" in e.message
pass pass
@ -155,7 +180,7 @@ def test_call_with_invalid_model(prisma_client):
asyncio.run(test()) asyncio.run(test())
except Exception as e: except Exception as e:
assert ( assert (
e.detail e.message
== "Authentication Error, API Key not allowed to access model. This token can only access models=['mistral']. Tried to access gemini-pro-vision" == "Authentication Error, API Key not allowed to access model. This token can only access models=['mistral']. Tried to access gemini-pro-vision"
) )
pass pass
@ -193,7 +218,7 @@ def test_call_with_valid_model(prisma_client):
pytest.fail(f"An exception occurred - {str(e)}") pytest.fail(f"An exception occurred - {str(e)}")
def test_call_with_key_over_budget(prisma_client): def test_call_with_user_over_budget(prisma_client):
# 5. Make a call with a key over budget, expect to fail # 5. Make a call with a key over budget, expect to fail
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client) setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234") setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
@ -244,8 +269,11 @@ def test_call_with_key_over_budget(prisma_client):
"user_api_key_user_id": user_id, "user_api_key_user_id": user_id,
} }
}, },
"response_cost": 0.00002,
}, },
completion_response=resp, completion_response=resp,
start_time=datetime.now(),
end_time=datetime.now(),
) )
# use generated key to auth in # use generated key to auth in
@ -255,12 +283,96 @@ def test_call_with_key_over_budget(prisma_client):
asyncio.run(test()) asyncio.run(test())
except Exception as e: except Exception as e:
error_detail = e.detail error_detail = e.message
assert "Authentication Error, ExceededBudget:" in error_detail assert "Authentication Error, ExceededBudget:" in error_detail
print(vars(e)) print(vars(e))
def test_call_with_key_over_budget_stream(prisma_client): def test_call_with_proxy_over_budget(prisma_client):
# 5.1 Make a call with a proxy over budget, expect to fail
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
litellm_proxy_budget_name = f"litellm-proxy-budget-{time.time()}"
setattr(
litellm.proxy.proxy_server,
"litellm_proxy_budget_name",
litellm_proxy_budget_name,
)
try:
async def test():
await litellm.proxy.proxy_server.prisma_client.connect()
## CREATE PROXY + USER BUDGET ##
request = NewUserRequest(
max_budget=0.00001, user_id=litellm_proxy_budget_name
)
await new_user(request)
request = NewUserRequest()
key = await new_user(request)
print(key)
generated_key = key.key
user_id = key.user_id
bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
# update spend using track_cost callback, make 2nd request, it should fail
from litellm.proxy.proxy_server import track_cost_callback
from litellm import ModelResponse, Choices, Message, Usage
resp = ModelResponse(
id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
choices=[
Choices(
finish_reason=None,
index=0,
message=Message(
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
role="assistant",
),
)
],
model="gpt-35-turbo", # azure always has model written like this
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
)
await track_cost_callback(
kwargs={
"stream": False,
"litellm_params": {
"metadata": {
"user_api_key": generated_key,
"user_api_key_user_id": user_id,
}
},
"response_cost": 0.00002,
},
completion_response=resp,
start_time=datetime.now(),
end_time=datetime.now(),
)
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
pytest.fail(f"This should have failed! The key crossed its budget")
asyncio.run(test())
except Exception as e:
if hasattr(e, "message"):
error_detail = e.message
else:
error_detail = traceback.format_exc()
assert "Authentication Error, ExceededBudget:" in error_detail
print(vars(e))
def test_call_with_user_over_budget_stream(prisma_client):
# 6. Make a call with a key over budget, expect to fail # 6. Make a call with a key over budget, expect to fail
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client) setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234") setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
@ -317,8 +429,11 @@ def test_call_with_key_over_budget_stream(prisma_client):
"user_api_key_user_id": user_id, "user_api_key_user_id": user_id,
} }
}, },
"response_cost": 0.00002,
}, },
completion_response=ModelResponse(), completion_response=ModelResponse(),
start_time=datetime.now(),
end_time=datetime.now(),
) )
# use generated key to auth in # use generated key to auth in
@ -328,7 +443,94 @@ def test_call_with_key_over_budget_stream(prisma_client):
asyncio.run(test()) asyncio.run(test())
except Exception as e: except Exception as e:
error_detail = e.detail error_detail = e.message
assert "Authentication Error, ExceededBudget:" in error_detail
print(vars(e))
def test_call_with_proxy_over_budget_stream(prisma_client):
# 6.1 Make a call with a global proxy over budget, expect to fail
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
litellm_proxy_budget_name = f"litellm-proxy-budget-{time.time()}"
setattr(
litellm.proxy.proxy_server,
"litellm_proxy_budget_name",
litellm_proxy_budget_name,
)
from litellm._logging import verbose_proxy_logger
import logging
litellm.set_verbose = True
verbose_proxy_logger.setLevel(logging.DEBUG)
try:
async def test():
await litellm.proxy.proxy_server.prisma_client.connect()
## CREATE PROXY + USER BUDGET ##
request = NewUserRequest(
max_budget=0.00001, user_id=litellm_proxy_budget_name
)
await new_user(request)
request = NewUserRequest()
key = await new_user(request)
print(key)
generated_key = key.key
user_id = key.user_id
bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
# update spend using track_cost callback, make 2nd request, it should fail
from litellm.proxy.proxy_server import track_cost_callback
from litellm import ModelResponse, Choices, Message, Usage
resp = ModelResponse(
id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
choices=[
Choices(
finish_reason=None,
index=0,
message=Message(
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
role="assistant",
),
)
],
model="gpt-35-turbo", # azure always has model written like this
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
)
await track_cost_callback(
kwargs={
"stream": True,
"complete_streaming_response": resp,
"litellm_params": {
"metadata": {
"user_api_key": generated_key,
"user_api_key_user_id": user_id,
}
},
"response_cost": 0.00002,
},
completion_response=ModelResponse(),
start_time=datetime.now(),
end_time=datetime.now(),
)
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
pytest.fail(f"This should have failed! The key crossed its budget")
asyncio.run(test())
except Exception as e:
error_detail = e.message
assert "Authentication Error, ExceededBudget:" in error_detail assert "Authentication Error, ExceededBudget:" in error_detail
print(vars(e)) print(vars(e))
@ -392,8 +594,8 @@ def test_generate_and_call_with_expired_key(prisma_client):
asyncio.run(test()) asyncio.run(test())
except Exception as e: except Exception as e:
print("Got Exception", e) print("Got Exception", e)
print(e.detail) print(e.message)
assert "Authentication Error" in e.detail assert "Authentication Error" in e.message
pass pass
@ -415,15 +617,10 @@ def test_delete_key(prisma_client):
generated_key = key.key generated_key = key.key
bearer_token = "Bearer " + generated_key bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
delete_key_request = DeleteKeyRequest(keys=[generated_key]) delete_key_request = DeleteKeyRequest(keys=[generated_key])
# delete the key # delete the key
result_delete_key = await delete_key_fn( result_delete_key = await delete_key_fn(data=delete_key_request)
request=request, data=delete_key_request
)
print("result from delete key", result_delete_key) print("result from delete key", result_delete_key)
assert result_delete_key == {"deleted_keys": [generated_key]} assert result_delete_key == {"deleted_keys": [generated_key]}
@ -450,15 +647,10 @@ def test_delete_key_auth(prisma_client):
generated_key = key.key generated_key = key.key
bearer_token = "Bearer " + generated_key bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
delete_key_request = DeleteKeyRequest(keys=[generated_key]) delete_key_request = DeleteKeyRequest(keys=[generated_key])
# delete the key # delete the key
result_delete_key = await delete_key_fn( result_delete_key = await delete_key_fn(data=delete_key_request)
request=request, data=delete_key_request
)
print("result from delete key", result_delete_key) print("result from delete key", result_delete_key)
assert result_delete_key == {"deleted_keys": [generated_key]} assert result_delete_key == {"deleted_keys": [generated_key]}
@ -474,6 +666,549 @@ def test_delete_key_auth(prisma_client):
asyncio.run(test()) asyncio.run(test())
except Exception as e: except Exception as e:
print("Got Exception", e) print("Got Exception", e)
print(e.detail) print(e.message)
assert "Authentication Error" in e.detail assert "Authentication Error" in e.message
pass pass
def test_generate_and_call_key_info(prisma_client):
# 10. Generate a Key, call key/info
print("prisma client=", prisma_client)
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
try:
async def test():
await litellm.proxy.proxy_server.prisma_client.connect()
request = NewUserRequest(
metadata={"team": "litellm-team3", "project": "litellm-project3"}
)
key = await new_user(request)
print(key)
generated_key = key.key
# use generated key to auth in
result = await info_key_fn(key=generated_key)
print("result from info_key_fn", result)
assert result["key"] == generated_key
print("\n info for key=", result["info"])
assert result["info"]["max_parallel_requests"] == None
assert result["info"]["metadata"] == {
"team": "litellm-team3",
"project": "litellm-project3",
}
# cleanup - delete key
delete_key_request = DeleteKeyRequest(keys=[generated_key])
# delete the key
await delete_key_fn(data=delete_key_request)
asyncio.run(test())
except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}")
def test_generate_and_update_key(prisma_client):
# 11. Generate a Key, call key/info, call key/update, call key/info
# Check if data gets updated
# Check if untouched data does not get updated
print("prisma client=", prisma_client)
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
try:
async def test():
await litellm.proxy.proxy_server.prisma_client.connect()
request = NewUserRequest(
metadata={"team": "litellm-team3", "project": "litellm-project3"},
team_id="litellm-core-infra@gmail.com",
)
key = await new_user(request)
print(key)
generated_key = key.key
# use generated key to auth in
result = await info_key_fn(key=generated_key)
print("result from info_key_fn", result)
assert result["key"] == generated_key
print("\n info for key=", result["info"])
assert result["info"]["max_parallel_requests"] == None
assert result["info"]["metadata"] == {
"team": "litellm-team3",
"project": "litellm-project3",
}
assert result["info"]["team_id"] == "litellm-core-infra@gmail.com"
request = Request(scope={"type": "http"})
request._url = URL(url="/update/key")
# update the key
await update_key_fn(
request=request,
data=UpdateKeyRequest(
key=generated_key,
models=["ada", "babbage", "curie", "davinci"],
),
)
# get info on key after update
result = await info_key_fn(key=generated_key)
print("result from info_key_fn", result)
assert result["key"] == generated_key
print("\n info for key=", result["info"])
assert result["info"]["max_parallel_requests"] == None
assert result["info"]["metadata"] == {
"team": "litellm-team3",
"project": "litellm-project3",
}
assert result["info"]["models"] == ["ada", "babbage", "curie", "davinci"]
# cleanup - delete key
delete_key_request = DeleteKeyRequest(keys=[generated_key])
# delete the key
await delete_key_fn(data=delete_key_request)
asyncio.run(test())
except Exception as e:
print("Got Exception", e)
print(e.message)
pytest.fail(f"An exception occurred - {str(e)}")
def test_key_generate_with_custom_auth(prisma_client):
# custom - generate key function
async def custom_generate_key_fn(data: GenerateKeyRequest) -> dict:
"""
Asynchronous function for generating a key based on the input data.
Args:
data (GenerateKeyRequest): The input data for key generation.
Returns:
dict: A dictionary containing the decision and an optional message.
{
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
"""
# decide if a key should be generated or not
print("using custom auth function!")
data_json = data.json() # type: ignore
# Unpacking variables
team_id = data_json.get("team_id")
duration = data_json.get("duration")
models = data_json.get("models")
aliases = data_json.get("aliases")
config = data_json.get("config")
spend = data_json.get("spend")
user_id = data_json.get("user_id")
max_parallel_requests = data_json.get("max_parallel_requests")
metadata = data_json.get("metadata")
tpm_limit = data_json.get("tpm_limit")
rpm_limit = data_json.get("rpm_limit")
if team_id is not None and team_id == "litellm-core-infra@gmail.com":
# only team_id="litellm-core-infra@gmail.com" can make keys
return {
"decision": True,
}
else:
print("Failed custom auth")
return {
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
setattr(
litellm.proxy.proxy_server, "user_custom_key_generate", custom_generate_key_fn
)
try:
async def test():
try:
await litellm.proxy.proxy_server.prisma_client.connect()
request = GenerateKeyRequest()
key = await generate_key_fn(request)
pytest.fail(f"Expected an exception. Got {key}")
except Exception as e:
# this should fail
print("Got Exception", e)
print(e.message)
print("First request failed!. This is expected")
assert (
"This violates LiteLLM Proxy Rules. No team id provided."
in e.message
)
request_2 = GenerateKeyRequest(
team_id="litellm-core-infra@gmail.com",
)
key = await generate_key_fn(request_2)
print(key)
generated_key = key.key
asyncio.run(test())
except Exception as e:
print("Got Exception", e)
print(e.message)
pytest.fail(f"An exception occurred - {str(e)}")
def test_call_with_key_over_budget(prisma_client):
# 12. Make a call with a key over budget, expect to fail
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
try:
async def test():
await litellm.proxy.proxy_server.prisma_client.connect()
request = GenerateKeyRequest(max_budget=0.00001)
key = await generate_key_fn(request)
print(key)
generated_key = key.key
user_id = key.user_id
bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
# update spend using track_cost callback, make 2nd request, it should fail
from litellm.proxy.proxy_server import track_cost_callback
from litellm import ModelResponse, Choices, Message, Usage
from litellm.caching import Cache
litellm.cache = Cache()
import time
request_id = f"chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac{time.time()}"
resp = ModelResponse(
id=request_id,
choices=[
Choices(
finish_reason=None,
index=0,
message=Message(
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
role="assistant",
),
)
],
model="gpt-35-turbo", # azure always has model written like this
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
)
await track_cost_callback(
kwargs={
"model": "chatgpt-v-2",
"stream": False,
"litellm_params": {
"metadata": {
"user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id,
}
},
"response_cost": 0.00002,
},
completion_response=resp,
start_time=datetime.now(),
end_time=datetime.now(),
)
# test spend_log was written and we can read it
spend_logs = await view_spend_logs(request_id=request_id)
print("read spend logs", spend_logs)
assert len(spend_logs) == 1
spend_log = spend_logs[0]
assert spend_log.request_id == request_id
assert spend_log.spend == float("2e-05")
assert spend_log.model == "chatgpt-v-2"
assert (
spend_log.cache_key
== "a61ae14fe4a8b8014a61e6ae01a100c8bc6770ac37c293242afed954bc69207d"
)
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
pytest.fail(f"This should have failed! The key crossed its budget")
asyncio.run(test())
except Exception as e:
error_detail = e.message
assert "Authentication Error, ExceededTokenBudget:" in error_detail
print(vars(e))
@pytest.mark.asyncio()
async def test_call_with_key_never_over_budget(prisma_client):
# Make a call with a key with budget=None, it should never fail
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
try:
await litellm.proxy.proxy_server.prisma_client.connect()
request = GenerateKeyRequest(max_budget=None)
key = await generate_key_fn(request)
print(key)
generated_key = key.key
user_id = key.user_id
bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
# update spend using track_cost callback, make 2nd request, it should fail
from litellm.proxy.proxy_server import track_cost_callback
from litellm import ModelResponse, Choices, Message, Usage
import time
request_id = f"chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac{time.time()}"
resp = ModelResponse(
id=request_id,
choices=[
Choices(
finish_reason=None,
index=0,
message=Message(
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
role="assistant",
),
)
],
model="gpt-35-turbo", # azure always has model written like this
usage=Usage(
prompt_tokens=210000, completion_tokens=200000, total_tokens=410000
),
)
await track_cost_callback(
kwargs={
"model": "chatgpt-v-2",
"stream": False,
"litellm_params": {
"metadata": {
"user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id,
}
},
"response_cost": 200000,
},
completion_response=resp,
start_time=datetime.now(),
end_time=datetime.now(),
)
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
except Exception as e:
pytest.fail(f"This should not have failed! The key uses max_budget=None. {e}")
@pytest.mark.asyncio()
async def test_call_with_key_over_budget_stream(prisma_client):
# 14. Make a call with a key over budget, expect to fail
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
from litellm._logging import verbose_proxy_logger
import logging
litellm.set_verbose = True
verbose_proxy_logger.setLevel(logging.DEBUG)
try:
await litellm.proxy.proxy_server.prisma_client.connect()
request = GenerateKeyRequest(max_budget=0.00001)
key = await generate_key_fn(request)
print(key)
generated_key = key.key
user_id = key.user_id
bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
# update spend using track_cost callback, make 2nd request, it should fail
from litellm.proxy.proxy_server import track_cost_callback
from litellm import ModelResponse, Choices, Message, Usage
import time
request_id = f"chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac{time.time()}"
resp = ModelResponse(
id=request_id,
choices=[
Choices(
finish_reason=None,
index=0,
message=Message(
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
role="assistant",
),
)
],
model="gpt-35-turbo", # azure always has model written like this
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
)
await track_cost_callback(
kwargs={
"call_type": "acompletion",
"model": "sagemaker-chatgpt-v-2",
"stream": True,
"complete_streaming_response": resp,
"litellm_params": {
"metadata": {
"user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id,
}
},
"response_cost": 0.00005,
},
completion_response=resp,
start_time=datetime.now(),
end_time=datetime.now(),
)
# use generated key to auth in
result = await user_api_key_auth(request=request, api_key=bearer_token)
print("result from user auth with new key", result)
pytest.fail(f"This should have failed! The key crossed its budget")
except Exception as e:
print("Got Exception", e)
error_detail = e.message
assert "Authentication Error, ExceededTokenBudget:" in error_detail
print(vars(e))
@pytest.mark.asyncio()
async def test_view_spend_per_user(prisma_client):
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
await litellm.proxy.proxy_server.prisma_client.connect()
try:
user_by_spend = await spend_user_fn(user_id=None)
assert type(user_by_spend) == list
assert len(user_by_spend) > 0
first_user = user_by_spend[0]
print("\nfirst_user=", first_user)
assert first_user.spend > 0
except Exception as e:
print("Got Exception", e)
pytest.fail(f"Got exception {e}")
@pytest.mark.asyncio()
async def test_view_spend_per_key(prisma_client):
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
await litellm.proxy.proxy_server.prisma_client.connect()
try:
key_by_spend = await spend_key_fn()
assert type(key_by_spend) == list
assert len(key_by_spend) > 0
first_key = key_by_spend[0]
print("\nfirst_key=", first_key)
assert first_key.spend > 0
except Exception as e:
print("Got Exception", e)
pytest.fail(f"Got exception {e}")
@pytest.mark.asyncio()
async def test_key_name_null(prisma_client):
"""
- create key
- get key info
- assert key_name is null
"""
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
await litellm.proxy.proxy_server.prisma_client.connect()
try:
request = GenerateKeyRequest()
key = await generate_key_fn(request)
generated_key = key.key
result = await info_key_fn(key=generated_key)
print("result from info_key_fn", result)
assert result["info"]["key_name"] is None
except Exception as e:
print("Got Exception", e)
pytest.fail(f"Got exception {e}")
@pytest.mark.asyncio()
async def test_key_name_set(prisma_client):
"""
- create key
- get key info
- assert key_name is not null
"""
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
setattr(litellm.proxy.proxy_server, "general_settings", {"allow_user_auth": True})
await litellm.proxy.proxy_server.prisma_client.connect()
try:
request = GenerateKeyRequest()
key = await generate_key_fn(request)
generated_key = key.key
result = await info_key_fn(key=generated_key)
print("result from info_key_fn", result)
assert isinstance(result["info"]["key_name"], str)
except Exception as e:
print("Got Exception", e)
pytest.fail(f"Got exception {e}")
@pytest.mark.asyncio()
async def test_default_key_params(prisma_client):
"""
- create key
- get key info
- assert the key picks up litellm.default_key_generate_params (max_budget)
"""
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
setattr(litellm.proxy.proxy_server, "general_settings", {"allow_user_auth": True})
litellm.default_key_generate_params = {"max_budget": 0.000122}
await litellm.proxy.proxy_server.prisma_client.connect()
try:
request = GenerateKeyRequest()
key = await generate_key_fn(request)
generated_key = key.key
result = await info_key_fn(key=generated_key)
print("result from info_key_fn", result)
assert result["info"]["max_budget"] == 0.000122
except Exception as e:
print("Got Exception", e)
pytest.fail(f"Got exception {e}")

View file

@ -0,0 +1,685 @@
# What this tests?
## Unit Tests for the max parallel request limiter for the proxy
import sys, os, asyncio, time, random
from datetime import datetime
import traceback
from dotenv import load_dotenv
load_dotenv()
import os
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
import litellm
from litellm import Router
from litellm.proxy.utils import ProxyLogging
from litellm.proxy._types import UserAPIKeyAuth
from litellm.caching import DualCache
from litellm.proxy.hooks.parallel_request_limiter import MaxParallelRequestsHandler
from datetime import datetime
## On Request received
## On Request success
## On Request failure
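# The tests below repeatedly rebuild the limiter's per-minute cache key by hand.
# A minimal sketch of that key format, pulled out for reference; the helper name
# _example_request_count_key is illustrative only and not part of litellm:
def _example_request_count_key(api_key: str) -> str:
    now = datetime.now()
    precise_minute = f"{now.strftime('%Y-%m-%d')}-{now.strftime('%H')}-{now.strftime('%M')}"
    return f"{api_key}::{precise_minute}::request_count"
# e.g. _example_request_count_key("sk-12345") -> "sk-12345::<YYYY-MM-DD>-<HH>-<MM>::request_count"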
@pytest.mark.asyncio
async def test_pre_call_hook():
"""
Test if cache updated on call being received
"""
_api_key = "sk-12345"
user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
local_cache = DualCache()
parallel_request_handler = MaxParallelRequestsHandler()
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
print(
parallel_request_handler.user_api_key_cache.get_cache(key=request_count_api_key)
)
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 1
)
@pytest.mark.asyncio
async def test_pre_call_hook_rpm_limits():
"""
Test if error raised on hitting rpm limits
"""
_api_key = "sk-12345"
user_api_key_dict = UserAPIKeyAuth(
api_key=_api_key, max_parallel_requests=1, tpm_limit=9, rpm_limit=1
)
local_cache = DualCache()
parallel_request_handler = MaxParallelRequestsHandler()
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
kwargs = {"litellm_params": {"metadata": {"user_api_key": _api_key}}}
await parallel_request_handler.async_log_success_event(
kwargs=kwargs,
response_obj="",
start_time="",
end_time="",
)
## Expected cache val: {"current_requests": 0, "current_tpm": 0, "current_rpm": 1}
try:
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=local_cache,
data={},
call_type="",
)
pytest.fail(f"Expected call to fail")
except Exception as e:
assert e.status_code == 429
@pytest.mark.asyncio
async def test_pre_call_hook_tpm_limits():
"""
Test if error raised on hitting tpm limits
"""
_api_key = "sk-12345"
user_api_key_dict = UserAPIKeyAuth(
api_key=_api_key, max_parallel_requests=1, tpm_limit=9, rpm_limit=10
)
local_cache = DualCache()
parallel_request_handler = MaxParallelRequestsHandler()
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
kwargs = {"litellm_params": {"metadata": {"user_api_key": _api_key}}}
await parallel_request_handler.async_log_success_event(
kwargs=kwargs,
response_obj=litellm.ModelResponse(usage=litellm.Usage(total_tokens=10)),
start_time="",
end_time="",
)
## Expected cache val: {"current_requests": 0, "current_tpm": 0, "current_rpm": 1}
try:
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=local_cache,
data={},
call_type="",
)
pytest.fail(f"Expected call to fail")
except Exception as e:
assert e.status_code == 429
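# A minimal sketch of the kind of check the two tests above exercise: the cached
# per-minute counters are compared against the limits carried on UserAPIKeyAuth,
# and the pre-call hook is expected to raise an HTTP 429 once either limit is hit.
# This standalone helper is illustrative only; it is not the handler's actual code.
def _example_would_rate_limit(cache_val: dict, user: UserAPIKeyAuth) -> bool:
    current_rpm = cache_val.get("current_rpm", 0)
    current_tpm = cache_val.get("current_tpm", 0)
    rpm_exceeded = user.rpm_limit is not None and current_rpm >= user.rpm_limit
    tpm_exceeded = user.tpm_limit is not None and current_tpm >= user.tpm_limit
    return rpm_exceeded or tpm_exceeded
# e.g. _example_would_rate_limit({"current_tpm": 10}, UserAPIKeyAuth(api_key="sk-12345", tpm_limit=9, rpm_limit=10)) -> True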
@pytest.mark.asyncio
async def test_success_call_hook():
"""
Test if on success, cache correctly decremented
"""
_api_key = "sk-12345"
user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
local_cache = DualCache()
parallel_request_handler = MaxParallelRequestsHandler()
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 1
)
kwargs = {"litellm_params": {"metadata": {"user_api_key": _api_key}}}
await parallel_request_handler.async_log_success_event(
kwargs=kwargs, response_obj="", start_time="", end_time=""
)
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 0
)
@pytest.mark.asyncio
async def test_failure_call_hook():
"""
Test if on failure, cache correctly decremented
"""
_api_key = "sk-12345"
user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
local_cache = DualCache()
parallel_request_handler = MaxParallelRequestsHandler()
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 1
)
kwargs = {
"litellm_params": {"metadata": {"user_api_key": _api_key}},
"exception": Exception(),
}
await parallel_request_handler.async_log_failure_event(
kwargs=kwargs, response_obj="", start_time="", end_time=""
)
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 0
)
"""
Test with Router
- normal call
- streaming call
- bad call
"""
@pytest.mark.asyncio
async def test_normal_router_call():
model_list = [
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 1440,
},
"model_info": {"id": 1},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-35-turbo",
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 2},
},
]
router = Router(
model_list=model_list,
set_verbose=False,
num_retries=3,
) # type: ignore
_api_key = "sk-12345"
user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
local_cache = DualCache()
pl = ProxyLogging(user_api_key_cache=local_cache)
pl._init_litellm_callbacks()
print(f"litellm callbacks: {litellm.callbacks}")
parallel_request_handler = pl.max_parallel_request_limiter
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 1
)
# normal call
response = await router.acompletion(
model="azure-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
metadata={"user_api_key": _api_key},
)
await asyncio.sleep(1) # success is done in a separate thread
print(f"response: {response}")
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 0
)
@pytest.mark.asyncio
async def test_normal_router_tpm_limit():
model_list = [
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 1440,
},
"model_info": {"id": 1},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-35-turbo",
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 2},
},
]
router = Router(
model_list=model_list,
set_verbose=False,
num_retries=3,
) # type: ignore
_api_key = "sk-12345"
user_api_key_dict = UserAPIKeyAuth(
api_key=_api_key, max_parallel_requests=10, tpm_limit=10
)
local_cache = DualCache()
pl = ProxyLogging(user_api_key_cache=local_cache)
pl._init_litellm_callbacks()
print(f"litellm callbacks: {litellm.callbacks}")
parallel_request_handler = pl.max_parallel_request_limiter
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 1
)
# normal call
response = await router.acompletion(
model="azure-model",
messages=[{"role": "user", "content": "Write me a paragraph on the moon"}],
metadata={"user_api_key": _api_key},
)
await asyncio.sleep(1) # success is done in a separate thread
print(f"response: {response}")
try:
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=local_cache,
data={},
call_type="",
)
pytest.fail(f"Expected call to fail")
except Exception as e:
assert e.status_code == 429
@pytest.mark.asyncio
async def test_streaming_router_call():
model_list = [
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 1440,
},
"model_info": {"id": 1},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-35-turbo",
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 2},
},
]
router = Router(
model_list=model_list,
set_verbose=False,
num_retries=3,
) # type: ignore
_api_key = "sk-12345"
user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
local_cache = DualCache()
pl = ProxyLogging(user_api_key_cache=local_cache)
pl._init_litellm_callbacks()
print(f"litellm callbacks: {litellm.callbacks}")
parallel_request_handler = pl.max_parallel_request_limiter
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 1
)
# streaming call
response = await router.acompletion(
model="azure-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
stream=True,
metadata={"user_api_key": _api_key},
)
async for chunk in response:
continue
await asyncio.sleep(1) # success is done in a separate thread
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 0
)
@pytest.mark.asyncio
async def test_streaming_router_tpm_limit():
model_list = [
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 1440,
},
"model_info": {"id": 1},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-35-turbo",
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 2},
},
]
router = Router(
model_list=model_list,
set_verbose=False,
num_retries=3,
) # type: ignore
_api_key = "sk-12345"
user_api_key_dict = UserAPIKeyAuth(
api_key=_api_key, max_parallel_requests=10, tpm_limit=10
)
local_cache = DualCache()
pl = ProxyLogging(user_api_key_cache=local_cache)
pl._init_litellm_callbacks()
print(f"litellm callbacks: {litellm.callbacks}")
parallel_request_handler = pl.max_parallel_request_limiter
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 1
)
# normal call
response = await router.acompletion(
model="azure-model",
messages=[{"role": "user", "content": "Write me a paragraph on the moon"}],
stream=True,
metadata={"user_api_key": _api_key},
)
async for chunk in response:
continue
await asyncio.sleep(1) # success is done in a separate thread
try:
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=local_cache,
data={},
call_type="",
)
pytest.fail(f"Expected call to fail")
except Exception as e:
assert e.status_code == 429
@pytest.mark.asyncio
async def test_bad_router_call():
model_list = [
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 1440,
},
"model_info": {"id": 1},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-35-turbo",
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 2},
},
]
router = Router(
model_list=model_list,
set_verbose=False,
num_retries=3,
) # type: ignore
_api_key = "sk-12345"
user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
local_cache = DualCache()
pl = ProxyLogging(user_api_key_cache=local_cache)
pl._init_litellm_callbacks()
print(f"litellm callbacks: {litellm.callbacks}")
parallel_request_handler = pl.max_parallel_request_limiter
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 1
)
# bad streaming call
try:
response = await router.acompletion(
model="azure-model",
messages=[{"role": "user2", "content": "Hey, how's it going?"}],
stream=True,
metadata={"user_api_key": _api_key},
)
except:
pass
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 0
)
@pytest.mark.asyncio
async def test_bad_router_tpm_limit():
model_list = [
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 1440,
},
"model_info": {"id": 1},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-35-turbo",
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 2},
},
]
router = Router(
model_list=model_list,
set_verbose=False,
num_retries=3,
) # type: ignore
_api_key = "sk-12345"
user_api_key_dict = UserAPIKeyAuth(
api_key=_api_key, max_parallel_requests=10, tpm_limit=10
)
local_cache = DualCache()
pl = ProxyLogging(user_api_key_cache=local_cache)
pl._init_litellm_callbacks()
print(f"litellm callbacks: {litellm.callbacks}")
parallel_request_handler = pl.max_parallel_request_limiter
await parallel_request_handler.async_pre_call_hook(
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
)
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_requests"]
== 1
)
# bad call
try:
response = await router.acompletion(
model="azure-model",
messages=[{"role": "user2", "content": "Write me a paragraph on the moon"}],
stream=True,
metadata={"user_api_key": _api_key},
)
except:
pass
await asyncio.sleep(1) # success is done in a separate thread
assert (
parallel_request_handler.user_api_key_cache.get_cache(
key=request_count_api_key
)["current_tpm"]
== 0
)

View file

@ -58,9 +58,10 @@ def test_custom_auth(client):
headers = {"Authorization": f"Bearer {token}"} headers = {"Authorization": f"Bearer {token}"}
response = client.post("/chat/completions", json=test_data, headers=headers) response = client.post("/chat/completions", json=test_data, headers=headers)
print(f"response: {response.text}") pytest.fail("LiteLLM Proxy test failed. This request should have been rejected")
assert response.status_code == 401
result = response.json()
print(f"Received response: {result}")
except Exception as e: except Exception as e:
pytest.fail("LiteLLM Proxy test failed. Exception", e) print(vars(e))
print("got an exception")
assert e.code == 401
assert e.message == "Authentication Error, Failed custom auth"
pass

View file

@ -32,7 +32,7 @@ from litellm.proxy.proxy_server import (
) # Replace with the actual module where your FastAPI router is defined ) # Replace with the actual module where your FastAPI router is defined
# Your bearer token # Your bearer token
token = "" token = "sk-1234"
headers = {"Authorization": f"Bearer {token}"} headers = {"Authorization": f"Bearer {token}"}

View file

@ -31,7 +31,7 @@ from litellm.proxy.proxy_server import (
) # Replace with the actual module where your FastAPI router is defined ) # Replace with the actual module where your FastAPI router is defined
# Your bearer token # Your bearer token
token = "" token = "sk-1234"
headers = {"Authorization": f"Bearer {token}"} headers = {"Authorization": f"Bearer {token}"}

View file

@ -33,7 +33,7 @@ from litellm.proxy.proxy_server import (
) # Replace with the actual module where your FastAPI router is defined ) # Replace with the actual module where your FastAPI router is defined
# Your bearer token # Your bearer token
token = "" token = "sk-1234"
headers = {"Authorization": f"Bearer {token}"} headers = {"Authorization": f"Bearer {token}"}

View file

@ -942,3 +942,52 @@ def test_reading_openai_keys_os_environ():
# test_reading_openai_keys_os_environ() # test_reading_openai_keys_os_environ()
def test_router_anthropic_key_dynamic():
anthropic_api_key = os.environ.pop("ANTHROPIC_API_KEY")
model_list = [
{
"model_name": "anthropic-claude",
"litellm_params": {
"model": "claude-instant-1",
"api_key": anthropic_api_key,
},
}
]
router = Router(model_list=model_list)
messages = [{"role": "user", "content": "Hey, how's it going?"}]
router.completion(model="anthropic-claude", messages=messages)
os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key
def test_router_timeout():
litellm.set_verbose = True
from litellm._logging import verbose_logger
import logging
verbose_logger.setLevel(logging.DEBUG)
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "os.environ/OPENAI_API_KEY",
},
}
]
router = Router(model_list=model_list)
messages = [{"role": "user", "content": "Hey, how's it going?"}]
start_time = time.time()
try:
res = router.completion(
model="gpt-3.5-turbo", messages=messages, timeout=0.0001
)
print(res)
pytest.fail("this should have timed out")
except litellm.exceptions.Timeout as e:
print("got timeout exception")
print(e)
print(vars(e))
pass

View file

@ -10,15 +10,16 @@ import litellm, asyncio, logging
from litellm import Router from litellm import Router
# this tests debug logs from litellm router and litellm proxy server # this tests debug logs from litellm router and litellm proxy server
from litellm._logging import verbose_router_logger from litellm._logging import verbose_router_logger, verbose_logger, verbose_proxy_logger
verbose_router_logger.setLevel(level=logging.INFO)
# this tests debug logs from litellm router and litellm proxy server # this tests debug logs from litellm router and litellm proxy server
def test_async_fallbacks(caplog): def test_async_fallbacks(caplog):
# THIS IS A PROD TEST - DO NOT DELETE THIS. Used for testing if litellm proxy verbose logs are human readable # THIS IS A PROD TEST - DO NOT DELETE THIS. Used for testing if litellm proxy verbose logs are human readable
litellm.set_verbose = False litellm.set_verbose = False
verbose_router_logger.setLevel(level=logging.INFO)
verbose_logger.setLevel(logging.CRITICAL + 1)
verbose_proxy_logger.setLevel(logging.CRITICAL + 1)
model_list = [ model_list = [
{ {
"model_name": "azure/gpt-3.5-turbo", "model_name": "azure/gpt-3.5-turbo",
@ -69,7 +70,10 @@ def test_async_fallbacks(caplog):
# on circle ci the captured logs get some async task exception logs - filter them out # on circle ci the captured logs get some async task exception logs - filter them out
"Task exception was never retrieved" "Task exception was never retrieved"
captured_logs = [ captured_logs = [
log for log in captured_logs if "Task exception was never retrieved" not in log log
for log in captured_logs
if "Task exception was never retrieved" not in log
and "get_available_deployment" not in log
] ]
print("\n Captured caplog records - ", captured_logs) print("\n Captured caplog records - ", captured_logs)

View file

@ -698,3 +698,207 @@ async def test_async_fallbacks_max_retries_per_request():
pytest.fail(f"An exception occurred: {e}") pytest.fail(f"An exception occurred: {e}")
finally: finally:
router.reset() router.reset()
def test_usage_based_routing_fallbacks():
try:
# [Prod Test]
# It tests Usage Based Routing with fallbacks
# The request should fail on azure/gpt-4-fast, then fall back -> "azure/gpt-4-basic" -> "openai-gpt-4"
# It should succeed with "openai-gpt-4"
import os
import litellm
from litellm import Router
from dotenv import load_dotenv
load_dotenv()
# Constants for TPM and RPM allocation
AZURE_FAST_TPM = 3
AZURE_BASIC_TPM = 4
OPENAI_TPM = 400
ANTHROPIC_TPM = 100000
def get_azure_params(deployment_name: str):
params = {
"model": f"azure/{deployment_name}",
"api_key": os.environ["AZURE_API_KEY"],
"api_version": os.environ["AZURE_API_VERSION"],
"api_base": os.environ["AZURE_API_BASE"],
}
return params
def get_openai_params(model: str):
params = {
"model": model,
"api_key": os.environ["OPENAI_API_KEY"],
}
return params
def get_anthropic_params(model: str):
params = {
"model": model,
"api_key": os.environ["ANTHROPIC_API_KEY"],
}
return params
model_list = [
{
"model_name": "azure/gpt-4-fast",
"litellm_params": get_azure_params("chatgpt-v-2"),
"tpm": AZURE_FAST_TPM,
},
{
"model_name": "azure/gpt-4-basic",
"litellm_params": get_azure_params("chatgpt-v-2"),
"tpm": AZURE_BASIC_TPM,
},
{
"model_name": "openai-gpt-4",
"litellm_params": get_openai_params("gpt-3.5-turbo"),
"tpm": OPENAI_TPM,
},
{
"model_name": "anthropic-claude-instant-1.2",
"litellm_params": get_anthropic_params("claude-instant-1.2"),
"tpm": ANTHROPIC_TPM,
},
]
# litellm.set_verbose=True
fallbacks_list = [
{"azure/gpt-4-fast": ["azure/gpt-4-basic"]},
{"azure/gpt-4-basic": ["openai-gpt-4"]},
{"openai-gpt-4": ["anthropic-claude-instant-1.2"]},
]
router = Router(
model_list=model_list,
fallbacks=fallbacks_list,
set_verbose=True,
debug_level="DEBUG",
routing_strategy="usage-based-routing",
redis_host=os.environ["REDIS_HOST"],
redis_port=os.environ["REDIS_PORT"],
)
messages = [
{"content": "Tell me a joke.", "role": "user"},
]
response = router.completion(
model="azure/gpt-4-fast",
messages=messages,
timeout=5,
mock_response="very nice to meet you",
)
print("response: ", response)
print("response._hidden_params: ", response._hidden_params)
# in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass
# the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM
assert response._hidden_params["custom_llm_provider"] == "openai"
# now make 20 mock requests - by the end we expect the router to have fallen back to anthropic-claude-instant-1.2
for i in range(20):
response = router.completion(
model="azure/gpt-4-fast",
messages=messages,
timeout=5,
mock_response="very nice to meet you",
)
print("response: ", response)
print("response._hidden_params: ", response._hidden_params)
if i == 19:
# by the 20th call (i == 19) we should have hit the TPM limit for OpenAI; it should fall back to anthropic-claude-instant-1.2
assert response._hidden_params["custom_llm_provider"] == "anthropic"
except Exception as e:
pytest.fail(f"An exception occurred {e}")
def test_custom_cooldown_times():
try:
# set a custom cooldown_time; the failed model shows up in cooldown_models, and after cooldown_time elapses it is removed from cooldown_models
model_list = [
{ # list of model deployments
"model_name": "gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": "bad-key",
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
"tpm": 24000000,
},
{ # list of model deployments
"model_name": "gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
"tpm": 1,
},
]
litellm.set_verbose = False
router = Router(
model_list=model_list,
set_verbose=True,
debug_level="INFO",
cooldown_time=0.1,
redis_host=os.getenv("REDIS_HOST"),
redis_password=os.getenv("REDIS_PASSWORD"),
redis_port=int(os.getenv("REDIS_PORT")),
)
# make a request - expect it to fail
try:
response = router.completion(
model="gpt-3.5-turbo",
messages=[
{
"content": "Tell me a joke.",
"role": "user",
}
],
)
except:
pass
# expect 1 model to be in cooldown models
cooldown_deployments = router._get_cooldown_deployments()
print("cooldown_deployments after failed call: ", cooldown_deployments)
assert (
len(cooldown_deployments) == 1
), "Expected 1 model to be in cooldown models"
selected_cooldown_model = cooldown_deployments[0]
# wait for 1/2 of cooldown time
time.sleep(router.cooldown_time / 2)
# expect cooldown model to still be in cooldown models
cooldown_deployments = router._get_cooldown_deployments()
print(
"cooldown_deployments after waiting 1/2 of cooldown: ", cooldown_deployments
)
assert (
len(cooldown_deployments) == 1
), "Expected 1 model to be in cooldown models"
# wait for 1/2 of cooldown time again, now we've waited for full cooldown
time.sleep(router.cooldown_time / 2)
# expect cooldown model to be removed from cooldown models
cooldown_deployments = router._get_cooldown_deployments()
print(
"cooldown_deployments after waiting cooldown time: ", cooldown_deployments
)
assert (
len(cooldown_deployments) == 0
), "Expected 0 models to be in cooldown models"
except Exception as e:
print(e)

View file

@ -375,3 +375,76 @@ def test_model_group_aliases():
# test_model_group_aliases() # test_model_group_aliases()
def test_usage_based_routing():
"""
in this test, we have a model group with two deployments in it: model-a (chatgpt-low-tpm) and model-b (chatgpt-high-tpm).
At some point we exceed the TPM limit (set in the litellm_params) for model-a only,
while model-b stays under its limit
"""
try:
def get_azure_params(deployment_name: str):
params = {
"model": f"azure/{deployment_name}",
"api_key": os.environ["AZURE_API_KEY"],
"api_version": os.environ["AZURE_API_VERSION"],
"api_base": os.environ["AZURE_API_BASE"],
}
return params
model_list = [
{
"model_name": "azure/gpt-4",
"litellm_params": get_azure_params("chatgpt-low-tpm"),
"tpm": 100,
},
{
"model_name": "azure/gpt-4",
"litellm_params": get_azure_params("chatgpt-high-tpm"),
"tpm": 1000,
},
]
router = Router(
model_list=model_list,
set_verbose=True,
debug_level="DEBUG",
routing_strategy="usage-based-routing",
redis_host=os.environ["REDIS_HOST"],
redis_port=os.environ["REDIS_PORT"],
)
messages = [
{"content": "Tell me a joke.", "role": "user"},
]
selection_counts = defaultdict(int)
for _ in range(25):
response = router.completion(
model="azure/gpt-4",
messages=messages,
timeout=5,
mock_response="good morning",
)
# print(response)
selection_counts[response["model"]] += 1
print(selection_counts)
total_requests = sum(selection_counts.values())
# Assert that 'chatgpt-low-tpm' has more than 2 requests
assert (
selection_counts["chatgpt-low-tpm"] > 2
), f"Assertion failed: 'chatgpt-low-tpm' does not have more than 2 request in the weighted load balancer. Selection counts {selection_counts}"
# Assert that 'chatgpt-high-tpm' has at least 80% of the total requests
assert (
selection_counts["chatgpt-high-tpm"] / total_requests > 0.8
), f"Assertion failed: 'chatgpt-high-tpm' does not have about 80% of the total requests in the weighted load balancer. Selection counts {selection_counts}"
except Exception as e:
pytest.fail(f"Error occurred: {e}")

View file

@ -274,7 +274,7 @@ def test_completion_azure_stream():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
test_completion_azure_stream() # test_completion_azure_stream()
def test_completion_azure_function_calling_stream(): def test_completion_azure_function_calling_stream():
@ -398,6 +398,36 @@ def test_completion_palm_stream():
# test_completion_palm_stream() # test_completion_palm_stream()
def test_completion_gemini_stream():
try:
litellm.set_verbose = False
print("Streaming gemini response")
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
]
print("testing gemini streaming")
response = completion(model="gemini/gemini-pro", messages=messages, stream=True)
print(f"type of response at the top: {response}")
complete_response = ""
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
print(chunk)
# print(chunk.choices[0].delta)
chunk, finished = streaming_format_tests(idx, chunk)
if finished:
break
complete_response += chunk
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_mistral_api_stream(): def test_completion_mistral_api_stream():
try: try:
litellm.set_verbose = True litellm.set_verbose = True
@ -703,8 +733,15 @@ def test_completion_bedrock_claude_stream():
complete_response = "" complete_response = ""
has_finish_reason = False has_finish_reason = False
# Add any assertions here to check the response # Add any assertions here to check the response
first_chunk_id = None
for idx, chunk in enumerate(response): for idx, chunk in enumerate(response):
# print # print
if idx == 0:
first_chunk_id = chunk.id
else:
assert (
chunk.id == first_chunk_id
), f"chunk ids do not match: {chunk.id} != first chunk id{first_chunk_id}"
chunk, finished = streaming_format_tests(idx, chunk) chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished has_finish_reason = finished
complete_response += chunk complete_response += chunk
@ -769,9 +806,30 @@ def test_sagemaker_weird_response():
When the stream ends, flush any remaining holding chunks. When the stream ends, flush any remaining holding chunks.
""" """
try: try:
chunk = """<s>[INST] Hey, how's it going? [/INST] from litellm.llms.sagemaker import TokenIterator
import json
import json
from litellm.llms.sagemaker import TokenIterator
I'm doing well, thanks for asking! How about you? Is there anything you'd like to chat about or ask? I'm here to help with any questions you might have.""" chunk = """<s>[INST] Hey, how's it going? [/INST],
I'm doing well, thanks for asking! How about you? Is there anything you'd like to chat about or ask? I'm here to help with any questions you might have."""
data = "\n".join(
map(
lambda x: f"data: {json.dumps({'token': {'text': x.strip()}})}",
chunk.strip().split(","),
)
)
stream = bytes(data, encoding="utf8")
# Modify the array to be a dictionary with "PayloadPart" and "Bytes" keys.
stream_iterator = iter([{"PayloadPart": {"Bytes": stream}}])
token_iter = TokenIterator(stream_iterator)
# for token in token_iter:
# print(token)
litellm.set_verbose = True
logging_obj = litellm.Logging( logging_obj = litellm.Logging(
model="berri-benchmarking-Llama-2-70b-chat-hf-4", model="berri-benchmarking-Llama-2-70b-chat-hf-4",
@ -783,14 +841,19 @@ def test_sagemaker_weird_response():
start_time=time.time(), start_time=time.time(),
) )
response = litellm.CustomStreamWrapper( response = litellm.CustomStreamWrapper(
completion_stream=chunk, completion_stream=token_iter,
model="berri-benchmarking-Llama-2-70b-chat-hf-4", model="berri-benchmarking-Llama-2-70b-chat-hf-4",
custom_llm_provider="sagemaker", custom_llm_provider="sagemaker",
logging_obj=logging_obj, logging_obj=logging_obj,
) )
complete_response = "" complete_response = ""
for chunk in response: for idx, chunk in enumerate(response):
complete_response += chunk["choices"][0]["delta"]["content"] # print
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
complete_response += chunk
if finished:
break
assert len(complete_response) > 0 assert len(complete_response) > 0
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}") pytest.fail(f"An exception occurred - {str(e)}")
@ -813,41 +876,53 @@ async def test_sagemaker_streaming_async():
) )
# Add any assertions here to check the response # Add any assertions here to check the response
print(response)
complete_response = "" complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
idx = 0
async for chunk in response: async for chunk in response:
complete_response += chunk.choices[0].delta.content or "" # print
print(f"complete_response: {complete_response}") chunk, finished = streaming_format_tests(idx, chunk)
assert len(complete_response) > 0 has_finish_reason = finished
complete_response += chunk
if finished:
break
idx += 1
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}") pytest.fail(f"An exception occurred - {str(e)}")
# def test_completion_sagemaker_stream(): def test_completion_sagemaker_stream():
# try: try:
# response = completion( response = completion(
# model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
# messages=messages, messages=messages,
# temperature=0.2, temperature=0.2,
# max_tokens=80, max_tokens=80,
# stream=True, stream=True,
# ) )
# complete_response = "" complete_response = ""
# has_finish_reason = False has_finish_reason = False
# # Add any assertions here to check the response # Add any assertions here to check the response
# for idx, chunk in enumerate(response): for idx, chunk in enumerate(response):
# chunk, finished = streaming_format_tests(idx, chunk) chunk, finished = streaming_format_tests(idx, chunk)
# has_finish_reason = finished has_finish_reason = finished
# if finished: if finished:
# break break
# complete_response += chunk complete_response += chunk
# if has_finish_reason is False: if has_finish_reason is False:
# raise Exception("finish reason not set for last chunk") raise Exception("finish reason not set for last chunk")
# if complete_response.strip() == "": if complete_response.strip() == "":
# raise Exception("Empty response received") raise Exception("Empty response received")
# except InvalidRequestError as e: except Exception as e:
# pass pytest.fail(f"Error occurred: {e}")
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_sagemaker_stream() # test_completion_sagemaker_stream()

View file

@ -39,6 +39,8 @@ def test_timeout():
def test_hanging_request_azure(): def test_hanging_request_azure():
litellm.set_verbose = True litellm.set_verbose = True
import asyncio
try: try:
router = litellm.Router( router = litellm.Router(
model_list=[ model_list=[
@ -58,13 +60,20 @@ def test_hanging_request_azure():
) )
encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0] encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
response = router.completion(
model="azure-gpt", async def _test():
messages=[{"role": "user", "content": f"what color is red {uuid.uuid4()}"}], response = await router.acompletion(
logit_bias={encoded: 100}, model="azure-gpt",
timeout=0.01, messages=[
) {"role": "user", "content": f"what color is red {uuid.uuid4()}"}
print(response) ],
logit_bias={encoded: 100},
timeout=0.01,
)
print(response)
return response
response = asyncio.run(_test())
if response.choices[0].message.content is not None: if response.choices[0].message.content is not None:
pytest.fail("Got a response, expected a timeout") pytest.fail("Got a response, expected a timeout")

View file

@ -10,6 +10,7 @@
import sys, re, binascii, struct import sys, re, binascii, struct
import litellm import litellm
import dotenv, json, traceback, threading, base64, ast import dotenv, json, traceback, threading, base64, ast
import subprocess, os import subprocess, os
import litellm, openai import litellm, openai
import itertools import itertools
@ -36,6 +37,7 @@ os.environ[
] = filename # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071 ] = filename # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071
encoding = tiktoken.get_encoding("cl100k_base") encoding = tiktoken.get_encoding("cl100k_base")
import importlib.metadata import importlib.metadata
from ._logging import verbose_logger
from .integrations.traceloop import TraceloopLogger from .integrations.traceloop import TraceloopLogger
from .integrations.helicone import HeliconeLogger from .integrations.helicone import HeliconeLogger
from .integrations.aispend import AISpendLogger from .integrations.aispend import AISpendLogger
@ -712,6 +714,7 @@ class ImageResponse(OpenAIObject):
############################################################ ############################################################
def print_verbose(print_statement): def print_verbose(print_statement):
try: try:
verbose_logger.debug(print_statement)
if litellm.set_verbose: if litellm.set_verbose:
print(print_statement) # noqa print(print_statement) # noqa
except: except:
@ -764,6 +767,7 @@ class Logging:
self.litellm_call_id = litellm_call_id self.litellm_call_id = litellm_call_id
self.function_id = function_id self.function_id = function_id
self.streaming_chunks = [] # for generating complete stream response self.streaming_chunks = [] # for generating complete stream response
self.sync_streaming_chunks = [] # for generating complete stream response
self.model_call_details = {} self.model_call_details = {}
def update_environment_variables( def update_environment_variables(
@ -773,7 +777,7 @@ class Logging:
self.model = model self.model = model
self.user = user self.user = user
self.litellm_params = litellm_params self.litellm_params = litellm_params
self.logger_fn = litellm_params["logger_fn"] self.logger_fn = litellm_params.get("logger_fn", None)
print_verbose(f"self.optional_params: {self.optional_params}") print_verbose(f"self.optional_params: {self.optional_params}")
self.model_call_details = { self.model_call_details = {
"model": self.model, "model": self.model,
@ -827,7 +831,7 @@ class Logging:
[f"-H '{k}: {v}'" for k, v in masked_headers.items()] [f"-H '{k}: {v}'" for k, v in masked_headers.items()]
) )
print_verbose(f"PRE-API-CALL ADDITIONAL ARGS: {additional_args}") verbose_logger.debug(f"PRE-API-CALL ADDITIONAL ARGS: {additional_args}")
curl_command = "\n\nPOST Request Sent from LiteLLM:\n" curl_command = "\n\nPOST Request Sent from LiteLLM:\n"
curl_command += "curl -X POST \\\n" curl_command += "curl -X POST \\\n"
@ -842,7 +846,7 @@ class Logging:
curl_command += additional_args.get("request_str", None) curl_command += additional_args.get("request_str", None)
elif api_base == "": elif api_base == "":
curl_command = self.model_call_details curl_command = self.model_call_details
print_verbose(f"\033[92m{curl_command}\033[0m\n") verbose_logger.info(f"\033[92m{curl_command}\033[0m\n")
if self.logger_fn and callable(self.logger_fn): if self.logger_fn and callable(self.logger_fn):
try: try:
self.logger_fn( self.logger_fn(
@ -993,13 +997,10 @@ class Logging:
self.model_call_details["log_event_type"] = "post_api_call" self.model_call_details["log_event_type"] = "post_api_call"
# User Logging -> if you pass in a custom logging function # User Logging -> if you pass in a custom logging function
print_verbose( verbose_logger.debug(
f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n" f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n"
) )
print_verbose( verbose_logger.debug(
f"Logging Details Post-API Call: logger_fn - {self.logger_fn} | callable(logger_fn) - {callable(self.logger_fn)}"
)
print_verbose(
f"Logging Details Post-API Call: LiteLLM Params: {self.model_call_details}" f"Logging Details Post-API Call: LiteLLM Params: {self.model_call_details}"
) )
if self.logger_fn and callable(self.logger_fn): if self.logger_fn and callable(self.logger_fn):
@ -1065,8 +1066,38 @@ class Logging:
self.model_call_details["log_event_type"] = "successful_api_call" self.model_call_details["log_event_type"] = "successful_api_call"
self.model_call_details["end_time"] = end_time self.model_call_details["end_time"] = end_time
self.model_call_details["cache_hit"] = cache_hit self.model_call_details["cache_hit"] = cache_hit
## if model in model cost map - log the response cost
## else set cost to None
verbose_logger.debug(f"Model={self.model}; result={result}")
if (
result is not None
and (
isinstance(result, ModelResponse)
or isinstance(result, EmbeddingResponse)
)
and self.stream != True
): # handle streaming separately
try:
self.model_call_details["response_cost"] = litellm.completion_cost(
completion_response=result,
)
verbose_logger.debug(
f"Model={self.model}; cost={self.model_call_details['response_cost']}"
)
except litellm.NotFoundError as e:
verbose_logger.debug(
f"Model={self.model} not found in completion cost map."
)
self.model_call_details["response_cost"] = None
else: # streaming chunks + image gen.
self.model_call_details["response_cost"] = None
if litellm.max_budget and self.stream: if (
litellm.max_budget
and self.stream
and result is not None
and "content" in result
):
time_diff = (end_time - start_time).total_seconds() time_diff = (end_time - start_time).total_seconds()
float_diff = float(time_diff) float_diff = float(time_diff)
litellm._current_cost += litellm.completion_cost( litellm._current_cost += litellm.completion_cost(
@ -1078,50 +1109,61 @@ class Logging:
return start_time, end_time, result return start_time, end_time, result
except Exception as e: except Exception as e:
print_verbose(f"[Non-Blocking] LiteLLM.Success_Call Error: {str(e)}") raise Exception(f"[Non-Blocking] LiteLLM.Success_Call Error: {str(e)}")
def success_handler( def success_handler(
self, result=None, start_time=None, end_time=None, cache_hit=None, **kwargs self, result=None, start_time=None, end_time=None, cache_hit=None, **kwargs
): ):
print_verbose(f"Logging Details LiteLLM-Success Call") verbose_logger.debug(f"Logging Details LiteLLM-Success Call")
start_time, end_time, result = self._success_handler_helper_fn(
start_time=start_time,
end_time=end_time,
result=result,
cache_hit=cache_hit,
)
# print(f"original response in success handler: {self.model_call_details['original_response']}") # print(f"original response in success handler: {self.model_call_details['original_response']}")
try: try:
print_verbose(f"success callbacks: {litellm.success_callback}") verbose_logger.debug(f"success callbacks: {litellm.success_callback}")
## BUILD COMPLETE STREAMED RESPONSE ## BUILD COMPLETE STREAMED RESPONSE
complete_streaming_response = None complete_streaming_response = None
if ( if self.stream:
self.stream
and self.model_call_details.get("litellm_params", {}).get(
"acompletion", False
)
== False
): # only call stream chunk builder if it's not acompletion()
if ( if (
result.choices[0].finish_reason is not None result.choices[0].finish_reason is not None
): # if it's the last chunk ): # if it's the last chunk
self.streaming_chunks.append(result) self.sync_streaming_chunks.append(result)
# print_verbose(f"final set of received chunks: {self.streaming_chunks}") # print_verbose(f"final set of received chunks: {self.sync_streaming_chunks}")
try: try:
complete_streaming_response = litellm.stream_chunk_builder( complete_streaming_response = litellm.stream_chunk_builder(
self.streaming_chunks, self.sync_streaming_chunks,
messages=self.model_call_details.get("messages", None), messages=self.model_call_details.get("messages", None),
start_time=start_time,
end_time=end_time,
) )
except: except:
complete_streaming_response = None complete_streaming_response = None
else: else:
self.streaming_chunks.append(result) self.sync_streaming_chunks.append(result)
if complete_streaming_response: if complete_streaming_response is not None:
verbose_logger.debug(
f"Logging Details LiteLLM-Success Call streaming complete"
)
self.model_call_details[ self.model_call_details[
"complete_streaming_response" "complete_streaming_response"
] = complete_streaming_response ] = complete_streaming_response
try:
self.model_call_details["response_cost"] = litellm.completion_cost(
completion_response=complete_streaming_response,
)
verbose_logger.debug(
f"Model={self.model}; cost={self.model_call_details['response_cost']}"
)
except litellm.NotFoundError as e:
verbose_logger.debug(
f"Model={self.model} not found in completion cost map."
)
self.model_call_details["response_cost"] = None
start_time, end_time, result = self._success_handler_helper_fn(
start_time=start_time,
end_time=end_time,
result=result,
cache_hit=cache_hit,
)
for callback in litellm.success_callback: for callback in litellm.success_callback:
try: try:
if callback == "lite_debugger": if callback == "lite_debugger":
@ -1242,7 +1284,7 @@ class Logging:
) )
if callback == "langfuse": if callback == "langfuse":
global langFuseLogger global langFuseLogger
print_verbose("reaches langfuse for logging!") verbose_logger.debug("reaches langfuse for logging!")
kwargs = {} kwargs = {}
for k, v in self.model_call_details.items(): for k, v in self.model_call_details.items():
if ( if (
@ -1251,7 +1293,10 @@ class Logging:
kwargs[k] = v kwargs[k] = v
# this only logs streaming once, complete_streaming_response exists i.e when stream ends # this only logs streaming once, complete_streaming_response exists i.e when stream ends
if self.stream: if self.stream:
if "complete_streaming_response" not in kwargs: verbose_logger.debug(
f"is complete_streaming_response in kwargs: {kwargs.get('complete_streaming_response', None)}"
)
if complete_streaming_response is None:
break break
else: else:
print_verbose("reaches langfuse for streaming logging!") print_verbose("reaches langfuse for streaming logging!")
@ -1306,7 +1351,9 @@ class Logging:
) )
== False == False
): # custom logger class ): # custom logger class
print_verbose(f"success callbacks: Running Custom Logger Class") verbose_logger.info(
f"success callbacks: Running SYNC Custom Logger Class"
)
if self.stream and complete_streaming_response is None: if self.stream and complete_streaming_response is None:
callback.log_stream_event( callback.log_stream_event(
kwargs=self.model_call_details, kwargs=self.model_call_details,
@ -1328,7 +1375,17 @@ class Logging:
start_time=start_time, start_time=start_time,
end_time=end_time, end_time=end_time,
) )
if callable(callback): # custom logger functions elif (
callable(callback) == True
and self.model_call_details.get("litellm_params", {}).get(
"acompletion", False
)
== False
and self.model_call_details.get("litellm_params", {}).get(
"aembedding", False
)
== False
): # custom logger functions
print_verbose( print_verbose(
f"success callbacks: Running Custom Callback Function" f"success callbacks: Running Custom Callback Function"
) )
@ -1362,33 +1419,52 @@ class Logging:
""" """
Implementing async callbacks, to handle asyncio event loop issues when custom integrations need to use async functions. Implementing async callbacks, to handle asyncio event loop issues when custom integrations need to use async functions.
""" """
print_verbose(f"Async success callbacks: {litellm._async_success_callback}") verbose_logger.debug(
f"Async success callbacks: {litellm._async_success_callback}"
)
start_time, end_time, result = self._success_handler_helper_fn(
start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit
)
## BUILD COMPLETE STREAMED RESPONSE ## BUILD COMPLETE STREAMED RESPONSE
complete_streaming_response = None complete_streaming_response = None
if self.stream: if self.stream:
if result.choices[0].finish_reason is not None: # if it's the last chunk if result.choices[0].finish_reason is not None: # if it's the last chunk
self.streaming_chunks.append(result) self.streaming_chunks.append(result)
# print_verbose(f"final set of received chunks: {self.streaming_chunks}") # verbose_logger.debug(f"final set of received chunks: {self.streaming_chunks}")
try: try:
complete_streaming_response = litellm.stream_chunk_builder( complete_streaming_response = litellm.stream_chunk_builder(
self.streaming_chunks, self.streaming_chunks,
messages=self.model_call_details.get("messages", None), messages=self.model_call_details.get("messages", None),
start_time=start_time,
end_time=end_time,
) )
except Exception as e: except Exception as e:
print_verbose( verbose_logger.debug(
f"Error occurred building stream chunk: {traceback.format_exc()}" f"Error occurred building stream chunk: {traceback.format_exc()}"
) )
complete_streaming_response = None complete_streaming_response = None
else: else:
self.streaming_chunks.append(result) self.streaming_chunks.append(result)
if complete_streaming_response: if complete_streaming_response is not None:
print_verbose("Async success callbacks: Got a complete streaming response") verbose_logger.debug(
"Async success callbacks: Got a complete streaming response"
)
self.model_call_details[ self.model_call_details[
"complete_streaming_response" "complete_streaming_response"
] = complete_streaming_response ] = complete_streaming_response
start_time, end_time, result = self._success_handler_helper_fn( try:
start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit self.model_call_details["response_cost"] = litellm.completion_cost(
) completion_response=complete_streaming_response,
)
verbose_logger.debug(
f"Model={self.model}; cost={self.model_call_details['response_cost']}"
)
except litellm.NotFoundError as e:
verbose_logger.debug(
f"Model={self.model} not found in completion cost map."
)
self.model_call_details["response_cost"] = None
for callback in litellm._async_success_callback: for callback in litellm._async_success_callback:
try: try:
if callback == "cache" and litellm.cache is not None: if callback == "cache" and litellm.cache is not None:
@ -1435,15 +1511,27 @@ class Logging:
end_time=end_time, end_time=end_time,
) )
if callable(callback): # custom logger functions if callable(callback): # custom logger functions
print_verbose(f"Async success callbacks: async_log_event") if self.stream:
await customLogger.async_log_event( if "complete_streaming_response" in self.model_call_details:
kwargs=self.model_call_details, await customLogger.async_log_event(
response_obj=result, kwargs=self.model_call_details,
start_time=start_time, response_obj=self.model_call_details[
end_time=end_time, "complete_streaming_response"
print_verbose=print_verbose, ],
callback_func=callback, start_time=start_time,
) end_time=end_time,
print_verbose=print_verbose,
callback_func=callback,
)
else:
await customLogger.async_log_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
print_verbose=print_verbose,
callback_func=callback,
)
if callback == "dynamodb": if callback == "dynamodb":
global dynamoLogger global dynamoLogger
if dynamoLogger is None: if dynamoLogger is None:
@ -1864,12 +1952,6 @@ def client(original_function):
# we only support async s3 logging for acompletion/aembedding since that's used on proxy # we only support async s3 logging for acompletion/aembedding since that's used on proxy
litellm._async_success_callback.append(callback) litellm._async_success_callback.append(callback)
removed_async_items.append(index) removed_async_items.append(index)
elif callback == "langfuse" and inspect.iscoroutinefunction(
original_function
):
# use async success callback for langfuse if this is litellm.acompletion(). Streaming logging does not work otherwise
litellm._async_success_callback.append(callback)
removed_async_items.append(index)
# Pop the async items from success_callback in reverse order to avoid index issues # Pop the async items from success_callback in reverse order to avoid index issues
for index in reversed(removed_async_items): for index in reversed(removed_async_items):
@ -1947,6 +2029,16 @@ def client(original_function):
call_type=call_type, call_type=call_type,
start_time=start_time, start_time=start_time,
) )
## check if metadata is passed in
litellm_params = {}
if "metadata" in kwargs:
litellm_params["metadata"] = kwargs["metadata"]
logging_obj.update_environment_variables(
model=model,
user="",
optional_params={},
litellm_params=litellm_params,
)
return logging_obj return logging_obj
except Exception as e: except Exception as e:
import logging import logging
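
The hunk above copies any metadata passed to the wrapped call into litellm_params on the logging object before the call runs, so callbacks can see it even when the call fails during setup. A short sketch of passing metadata and reading it back; the metadata key and callback name are assumptions:

import litellm

def audit(kwargs, completion_response, start_time, end_time):
    metadata = (kwargs.get("litellm_params") or {}).get("metadata") or {}
    print("request tag:", metadata.get("request_tag"))  # assumed key

litellm.success_callback = [audit]

litellm.completion(  # assumes OPENAI_API_KEY is set
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "ping"}],
    metadata={"request_tag": "docs-example"},  # forwarded into logging via litellm_params
)
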
@ -2098,7 +2190,6 @@ def client(original_function):
result = original_function(*args, **kwargs) result = original_function(*args, **kwargs)
end_time = datetime.datetime.now() end_time = datetime.datetime.now()
if "stream" in kwargs and kwargs["stream"] == True: if "stream" in kwargs and kwargs["stream"] == True:
# TODO: Add to cache for streaming
if ( if (
"complete_response" in kwargs "complete_response" in kwargs
and kwargs["complete_response"] == True and kwargs["complete_response"] == True
@ -2130,7 +2221,7 @@ def client(original_function):
litellm.cache.add_cache(result, *args, **kwargs) litellm.cache.add_cache(result, *args, **kwargs)
# LOG SUCCESS - handle streaming success logging in the _next_ object, remove `handle_success` once it's deprecated # LOG SUCCESS - handle streaming success logging in the _next_ object, remove `handle_success` once it's deprecated
print_verbose(f"Wrapper: Completed Call, calling success_handler") verbose_logger.info(f"Wrapper: Completed Call, calling success_handler")
threading.Thread( threading.Thread(
target=logging_obj.success_handler, args=(result, start_time, end_time) target=logging_obj.success_handler, args=(result, start_time, end_time)
).start() ).start()
@ -2363,12 +2454,15 @@ def client(original_function):
threading.Thread( threading.Thread(
target=logging_obj.success_handler, args=(result, start_time, end_time) target=logging_obj.success_handler, args=(result, start_time, end_time)
).start() ).start()
# RETURN RESULT # RETURN RESULT
if hasattr(result, "_hidden_params"): if hasattr(result, "_hidden_params"):
result._hidden_params["model_id"] = kwargs.get("model_info", {}).get( result._hidden_params["model_id"] = kwargs.get("model_info", {}).get(
"id", None "id", None
) )
if isinstance(result, ModelResponse): if isinstance(result, ModelResponse) or isinstance(
result, EmbeddingResponse
):
result._response_ms = ( result._response_ms = (
end_time - start_time end_time - start_time
).total_seconds() * 1000 # return response latency in ms like openai ).total_seconds() * 1000 # return response latency in ms like openai
@ -2486,24 +2580,20 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
def _select_tokenizer(model: str): def _select_tokenizer(model: str):
# cohere from importlib import resources
import pkg_resources
if model in litellm.cohere_models: if model in litellm.cohere_models:
# cohere
tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly") tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
# anthropic # anthropic
elif model in litellm.anthropic_models: elif model in litellm.anthropic_models:
# Read the JSON file with resources.open_text(
filename = pkg_resources.resource_filename( "litellm.llms.tokenizers", "anthropic_tokenizer.json"
__name__, "llms/tokenizers/anthropic_tokenizer.json" ) as f:
)
with open(filename, "r") as f:
json_data = json.load(f) json_data = json.load(f)
# Decode the JSON data from utf-8 # Convert to str (if necessary)
json_data_decoded = json.dumps(json_data, ensure_ascii=False) json_str = json.dumps(json_data)
# Convert to str
json_str = str(json_data_decoded)
# load tokenizer # load tokenizer
tokenizer = Tokenizer.from_str(json_str) tokenizer = Tokenizer.from_str(json_str)
return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
@ -2793,15 +2883,25 @@ def token_counter(
print_verbose( print_verbose(
f"Token Counter - using generic token counter, for model={model}" f"Token Counter - using generic token counter, for model={model}"
) )
enc = tokenizer_json["tokenizer"].encode(text) num_tokens = openai_token_counter(
num_tokens = len(enc) text=text, # type: ignore
model="gpt-3.5-turbo",
messages=messages,
is_tool_call=is_tool_call,
count_response_tokens=count_response_tokens,
)
else: else:
num_tokens = len(encoding.encode(text)) # type: ignore num_tokens = len(encoding.encode(text)) # type: ignore
return num_tokens return num_tokens
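
With this change the generic fallback branch of token_counter delegates to openai_token_counter with the gpt-3.5-turbo encoding (and message-aware counting) instead of calling the selected tokenizer's encode() on the raw text. A small usage sketch of the public counter; the second model name is deliberately unmapped:

import litellm

messages = [{"role": "user", "content": "How many tokens is this?"}]

# Known model: the provider-specific tokenizer / tiktoken encoding is used.
print(litellm.token_counter(model="gpt-3.5-turbo", messages=messages))

# Unmapped model: still counted, via the generic gpt-3.5-turbo-style
# approximation rather than an exception.
print(litellm.token_counter(model="my-unmapped-model", messages=messages))
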
def cost_per_token( def cost_per_token(
model="", prompt_tokens=0, completion_tokens=0, custom_llm_provider=None model="",
prompt_tokens=0,
completion_tokens=0,
response_time_ms=None,
custom_llm_provider=None,
region_name=None,
): ):
""" """
Calculates the cost per token for a given model, prompt tokens, and completion tokens. Calculates the cost per token for a given model, prompt tokens, and completion tokens.
@ -2818,30 +2918,74 @@ def cost_per_token(
prompt_tokens_cost_usd_dollar = 0 prompt_tokens_cost_usd_dollar = 0
completion_tokens_cost_usd_dollar = 0 completion_tokens_cost_usd_dollar = 0
model_cost_ref = litellm.model_cost model_cost_ref = litellm.model_cost
model_with_provider = model
if custom_llm_provider is not None: if custom_llm_provider is not None:
model_with_provider = custom_llm_provider + "/" + model model_with_provider = custom_llm_provider + "/" + model
else: if region_name is not None:
model_with_provider = model model_with_provider_and_region = (
f"{custom_llm_provider}/{region_name}/{model}"
)
if (
model_with_provider_and_region in model_cost_ref
): # use region based pricing, if it's available
model_with_provider = model_with_provider_and_region
# see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
print_verbose(f"Looking up model={model} in model_cost_map") print_verbose(f"Looking up model={model} in model_cost_map")
if model_with_provider in model_cost_ref:
if model in model_cost_ref: print_verbose(
prompt_tokens_cost_usd_dollar = ( f"Success: model={model_with_provider} in model_cost_map - {model_cost_ref[model_with_provider]}"
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
) )
completion_tokens_cost_usd_dollar = ( print_verbose(
model_cost_ref[model]["output_cost_per_token"] * completion_tokens f"applying cost={model_cost_ref[model_with_provider].get('input_cost_per_token', None)} for prompt_tokens={prompt_tokens}"
) )
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model_with_provider in model_cost_ref:
print_verbose(f"Looking up model={model_with_provider} in model_cost_map")
prompt_tokens_cost_usd_dollar = ( prompt_tokens_cost_usd_dollar = (
model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens
) )
print_verbose(
f"calculated prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}"
)
print_verbose(
f"applying cost={model_cost_ref[model_with_provider].get('output_cost_per_token', None)} for completion_tokens={completion_tokens}"
)
completion_tokens_cost_usd_dollar = ( completion_tokens_cost_usd_dollar = (
model_cost_ref[model_with_provider]["output_cost_per_token"] model_cost_ref[model_with_provider]["output_cost_per_token"]
* completion_tokens * completion_tokens
) )
print_verbose(
f"calculated completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
if model in model_cost_ref:
print_verbose(f"Success: model={model} in model_cost_map")
print_verbose(
f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}"
)
if (
model_cost_ref[model].get("input_cost_per_token", None) is not None
and model_cost_ref[model].get("output_cost_per_token", None) is not None
):
## COST PER TOKEN ##
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
elif (
model_cost_ref[model].get("input_cost_per_second", None) is not None
and response_time_ms is not None
):
print_verbose(
f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}"
)
## COST PER SECOND ##
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000
)
completion_tokens_cost_usd_dollar = 0.0
print_verbose(
f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-3.5-turbo" in model: elif "ft:gpt-3.5-turbo" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM") print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
@ -2855,17 +2999,23 @@ def cost_per_token(
) )
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model in litellm.azure_llms: elif model in litellm.azure_llms:
print_verbose(f"Cost Tracking: {model} is an Azure LLM") verbose_logger.debug(f"Cost Tracking: {model} is an Azure LLM")
model = litellm.azure_llms[model] model = litellm.azure_llms[model]
verbose_logger.debug(
f"applying cost={model_cost_ref[model]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
)
prompt_tokens_cost_usd_dollar = ( prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
) )
verbose_logger.debug(
f"applying cost={model_cost_ref[model]['output_cost_per_token']} for completion_tokens={completion_tokens}"
)
completion_tokens_cost_usd_dollar = ( completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens model_cost_ref[model]["output_cost_per_token"] * completion_tokens
) )
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model in litellm.azure_embedding_models: elif model in litellm.azure_embedding_models:
print_verbose(f"Cost Tracking: {model} is an Azure Embedding Model") verbose_logger.debug(f"Cost Tracking: {model} is an Azure Embedding Model")
model = litellm.azure_embedding_models[model] model = litellm.azure_embedding_models[model]
prompt_tokens_cost_usd_dollar = ( prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
@ -2895,7 +3045,14 @@ def completion_cost(
prompt="", prompt="",
messages: List = [], messages: List = [],
completion="", completion="",
total_time=0.0, # used for replicate total_time=0.0, # used for replicate, sagemaker
### REGION ###
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ###
size=None,
quality=None,
n=None, # number of images
): ):
""" """
Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, any litellm supported llm. Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, any litellm supported llm.
@ -2933,15 +3090,20 @@ def completion_cost(
completion_tokens = completion_response.get("usage", {}).get( completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0 "completion_tokens", 0
) )
total_time = completion_response.get("_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
)
model = ( model = (
model or completion_response["model"] model or completion_response["model"]
) # check if user passed an override for model, if it's none check completion_response['model'] ) # check if user passed an override for model, if it's none check completion_response['model']
if completion_response is not None and hasattr( if hasattr(completion_response, "_hidden_params"):
completion_response, "_hidden_params"
):
custom_llm_provider = completion_response._hidden_params.get( custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", "" "custom_llm_provider", ""
) )
region_name = completion_response._hidden_params.get(
"region_name", region_name
)
else: else:
if len(messages) > 0: if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages) prompt_tokens = token_counter(model=model, messages=messages)
@ -2953,6 +3115,37 @@ def completion_cost(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}" f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
) )
if size is not None and n is not None:
### IMAGE GENERATION COST CALCULATION ###
image_gen_model_name = f"{size}/{model}"
image_gen_model_name_with_quality = image_gen_model_name
if quality is not None:
image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}"
size = size.split("-x-")
height = int(size[0])
width = int(size[1])
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
verbose_logger.debug(
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
)
if image_gen_model_name in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"]
* height
* width
* n
)
elif image_gen_model_name_with_quality in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name_with_quality][
"input_cost_per_pixel"
]
* height
* width
* n
)
else:
raise Exception(f"Model={model} not found in completion cost model map")
# Calculate cost based on prompt_tokens, completion_tokens # Calculate cost based on prompt_tokens, completion_tokens
if "togethercomputer" in model or "together_ai" in model: if "togethercomputer" in model or "together_ai" in model:
# together ai prices based on size of llm # together ai prices based on size of llm
@ -2970,8 +3163,14 @@ def completion_cost(
prompt_tokens=prompt_tokens, prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens, completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
response_time_ms=total_time,
region_name=region_name,
) )
return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e: except Exception as e:
raise e raise e
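
Taken together, completion_cost now folds response time, region and image parameters into the per-token calculation before returning the summed prompt and completion cost. Day-to-day usage is unchanged; a short sketch (model alias assumed, OPENAI_API_KEY expected in the environment):

import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",  # assumed alias
    messages=[{"role": "user", "content": "two sentences about caching"}],
)

# completion_cost reads usage, the model name and _hidden_params
# (provider / region) off the response object itself.
usd = litellm.completion_cost(completion_response=response)
print(f"cost: ${usd:.6f}")
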
@ -3000,9 +3199,8 @@ def register_model(model_cost: Union[str, dict]):
for key, value in loaded_model_cost.items(): for key, value in loaded_model_cost.items():
## override / add new keys to the existing model cost dictionary ## override / add new keys to the existing model cost dictionary
if key in litellm.model_cost: litellm.model_cost.setdefault(key, {}).update(value)
for k, v in loaded_model_cost[key].items(): verbose_logger.debug(f"{key} added to model cost map")
litellm.model_cost[key][k] = v
# add new model names to provider lists # add new model names to provider lists
if value.get("litellm_provider") == "openai": if value.get("litellm_provider") == "openai":
if key not in litellm.open_ai_chat_completion_models: if key not in litellm.open_ai_chat_completion_models:
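
register_model now merges entries with setdefault(...).update(...), so keys missing from the bundled map are created instead of silently skipped. A hedged sketch of registering a custom per-token price; the model name and prices are made up:

import litellm

litellm.register_model(
    {
        "my-org/house-llm": {                  # hypothetical model name
            "max_tokens": 8192,
            "input_cost_per_token": 0.0000005,
            "output_cost_per_token": 0.0000015,
            "litellm_provider": "openai",      # provider bucket the key is filed under
            "mode": "chat",
        }
    }
)

print(litellm.model_cost["my-org/house-llm"]["input_cost_per_token"])
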
@ -3138,8 +3336,10 @@ def get_optional_params_image_gen(
def get_optional_params_embeddings( def get_optional_params_embeddings(
# 2 optional params # 2 optional params
model=None,
user=None, user=None,
encoding_format=None, encoding_format=None,
dimensions=None,
custom_llm_provider="", custom_llm_provider="",
**kwargs, **kwargs,
): ):
@ -3150,7 +3350,7 @@ def get_optional_params_embeddings(
for k, v in special_params.items(): for k, v in special_params.items():
passed_params[k] = v passed_params[k] = v
default_params = {"user": None, "encoding_format": None} default_params = {"user": None, "encoding_format": None, "dimensions": None}
non_default_params = { non_default_params = {
k: v k: v
@ -3158,6 +3358,19 @@ def get_optional_params_embeddings(
if (k in default_params and v != default_params[k]) if (k in default_params and v != default_params[k])
} }
## raise exception if non-default value passed for non-openai/azure embedding calls ## raise exception if non-default value passed for non-openai/azure embedding calls
if custom_llm_provider == "openai":
# 'dimensions` is only supported in `text-embedding-3` and later models
if (
model is not None
and "text-embedding-3" not in model
and "dimensions" in non_default_params.keys()
):
raise UnsupportedParamsError(
status_code=500,
message=f"Setting dimensions is not supported for OpenAI `text-embedding-3` and later models. To drop it from the call, set `litellm.drop_params = True`.",
)
if ( if (
custom_llm_provider != "openai" custom_llm_provider != "openai"
and custom_llm_provider != "azure" and custom_llm_provider != "azure"
@ -3212,6 +3425,10 @@ def get_optional_params(
custom_llm_provider != "bedrock" and custom_llm_provider != "sagemaker" custom_llm_provider != "bedrock" and custom_llm_provider != "sagemaker"
): # allow dynamically setting boto3 init logic ): # allow dynamically setting boto3 init logic
continue continue
elif (
k.startswith("vertex_") and custom_llm_provider != "vertex_ai"
): # allow dynamically setting vertex ai init logic
continue
passed_params[k] = v passed_params[k] = v
default_params = { default_params = {
"functions": None, "functions": None,
@ -3295,16 +3512,20 @@ def get_optional_params(
) )
def _check_valid_arg(supported_params): def _check_valid_arg(supported_params):
print_verbose( verbose_logger.debug(
f"\nLiteLLM completion() model= {model}; provider = {custom_llm_provider}" f"\nLiteLLM completion() model= {model}; provider = {custom_llm_provider}"
) )
print_verbose(f"\nLiteLLM: Params passed to completion() {passed_params}") verbose_logger.debug(
print_verbose( f"\nLiteLLM: Params passed to completion() {passed_params}"
)
verbose_logger.debug(
f"\nLiteLLM: Non-Default params passed to completion() {non_default_params}" f"\nLiteLLM: Non-Default params passed to completion() {non_default_params}"
) )
unsupported_params = {} unsupported_params = {}
for k in non_default_params.keys(): for k in non_default_params.keys():
if k not in supported_params: if k not in supported_params:
if k == "user":
continue
if k == "n" and n == 1: # langchain sends n=1 as a default value if k == "n" and n == 1: # langchain sends n=1 as a default value
continue # skip this param continue # skip this param
if ( if (
@ -5143,6 +5364,8 @@ def convert_to_model_response_object(
"completion", "embedding", "image_generation" "completion", "embedding", "image_generation"
] = "completion", ] = "completion",
stream=False, stream=False,
start_time=None,
end_time=None,
): ):
try: try:
if response_type == "completion" and ( if response_type == "completion" and (
@ -5196,6 +5419,12 @@ def convert_to_model_response_object(
if "model" in response_object: if "model" in response_object:
model_response_object.model = response_object["model"] model_response_object.model = response_object["model"]
if start_time is not None and end_time is not None:
model_response_object._response_ms = ( # type: ignore
end_time - start_time
).total_seconds() * 1000
return model_response_object return model_response_object
elif response_type == "embedding" and ( elif response_type == "embedding" and (
model_response_object is None model_response_object is None
@ -5220,6 +5449,11 @@ def convert_to_model_response_object(
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
if start_time is not None and end_time is not None:
model_response_object._response_ms = ( # type: ignore
end_time - start_time
).total_seconds() * 1000 # return response latency in ms like openai
return model_response_object return model_response_object
elif response_type == "image_generation" and ( elif response_type == "image_generation" and (
model_response_object is None model_response_object is None
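
convert_to_model_response_object now accepts start/end timestamps and stamps _response_ms on both chat and embedding responses, matching what the client wrapper already computed for ModelResponse. It is a private convenience attribute, so a cautious way to read it is via getattr (sketch assumes an OPENAI_API_KEY):

import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
)

# _response_ms = (end_time - start_time).total_seconds() * 1000; it may not be
# present on every response type or version, hence the fallback.
print("latency ms:", getattr(response, "_response_ms", None))
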
@ -6959,6 +7193,8 @@ class CustomStreamWrapper:
self._hidden_params = { self._hidden_params = {
"model_id": (_model_info.get("id", None)) "model_id": (_model_info.get("id", None))
} # returned as x-litellm-model-id response header in proxy } # returned as x-litellm-model-id response header in proxy
self.response_id = None
self.logging_loop = None
def __iter__(self): def __iter__(self):
return self return self
@ -7280,6 +7516,13 @@ class CustomStreamWrapper:
if str_line.choices[0].finish_reason: if str_line.choices[0].finish_reason:
is_finished = True is_finished = True
finish_reason = str_line.choices[0].finish_reason finish_reason = str_line.choices[0].finish_reason
if finish_reason == "content_filter":
error_message = json.dumps(
str_line.choices[0].content_filter_result
)
raise litellm.AzureOpenAIError(
status_code=400, message=error_message
)
# checking for logprobs # checking for logprobs
if ( if (
@ -7290,16 +7533,6 @@ class CustomStreamWrapper:
else: else:
logprobs = None logprobs = None
if (
hasattr(str_line.choices[0], "content_filter_result")
and str_line.choices[0].content_filter_result is not None
):
error_message = json.dumps(
str_line.choices[0].content_filter_result
)
raise litellm.AzureOpenAIError(
status_code=400, message=error_message
)
return { return {
"text": text, "text": text,
"is_finished": is_finished, "is_finished": is_finished,
@ -7532,9 +7765,35 @@ class CustomStreamWrapper:
} }
return "" return ""
def handle_sagemaker_stream(self, chunk):
if "data: [DONE]" in chunk:
text = ""
is_finished = True
finish_reason = "stop"
return {
"text": text,
"is_finished": is_finished,
"finish_reason": finish_reason,
}
elif isinstance(chunk, dict):
if chunk["is_finished"] == True:
finish_reason = "stop"
else:
finish_reason = ""
return {
"text": chunk["text"],
"is_finished": chunk["is_finished"],
"finish_reason": finish_reason,
}
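
handle_sagemaker_stream normalises two chunk shapes, the literal "data: [DONE]" sentinel and already-decoded dicts, into the {text, is_finished, finish_reason} contract the rest of chunk_creator expects. A self-contained sketch of the same normalisation applied to a fake stream (the sample chunks are invented):

def normalize_sagemaker_chunk(chunk):
    # Mirrors the handler above: a DONE sentinel ends the stream; dict chunks
    # carry their own text and is_finished flags.
    if isinstance(chunk, str) and "data: [DONE]" in chunk:
        return {"text": "", "is_finished": True, "finish_reason": "stop"}
    if isinstance(chunk, dict):
        return {
            "text": chunk["text"],
            "is_finished": chunk["is_finished"],
            "finish_reason": "stop" if chunk["is_finished"] else "",
        }
    raise ValueError(f"unexpected chunk type: {type(chunk)}")

fake_stream = [
    {"text": "Hello", "is_finished": False},
    {"text": " world", "is_finished": False},
    "data: [DONE]",
]

for raw in fake_stream:
    piece = normalize_sagemaker_chunk(raw)
    print(piece["text"], end="")
    if piece["is_finished"]:
        break
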
def chunk_creator(self, chunk): def chunk_creator(self, chunk):
model_response = ModelResponse(stream=True, model=self.model) model_response = ModelResponse(stream=True, model=self.model)
if self.response_id is not None:
model_response.id = self.response_id
else:
self.response_id = model_response.id
model_response._hidden_params["custom_llm_provider"] = self.custom_llm_provider model_response._hidden_params["custom_llm_provider"] = self.custom_llm_provider
model_response._hidden_params["created_at"] = time.time()
model_response.choices = [StreamingChoices()] model_response.choices = [StreamingChoices()]
model_response.choices[0].finish_reason = None model_response.choices[0].finish_reason = None
response_obj = {} response_obj = {}
@ -7616,7 +7875,9 @@ class CustomStreamWrapper:
raise Exception("An unknown error occurred with the stream") raise Exception("An unknown error occurred with the stream")
model_response.choices[0].finish_reason = "stop" model_response.choices[0].finish_reason = "stop"
self.sent_last_chunk = True self.sent_last_chunk = True
elif self.custom_llm_provider and self.custom_llm_provider == "vertex_ai": elif self.custom_llm_provider == "gemini":
completion_obj["content"] = chunk.text
elif self.custom_llm_provider and (self.custom_llm_provider == "vertex_ai"):
try: try:
# print(chunk) # print(chunk)
if hasattr(chunk, "text"): if hasattr(chunk, "text"):
@ -7651,19 +7912,14 @@ class CustomStreamWrapper:
] ]
self.sent_last_chunk = True self.sent_last_chunk = True
elif self.custom_llm_provider == "sagemaker": elif self.custom_llm_provider == "sagemaker":
print_verbose(f"ENTERS SAGEMAKER STREAMING") verbose_logger.debug(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}")
if len(self.completion_stream) == 0: response_obj = self.handle_sagemaker_stream(chunk)
if self.sent_last_chunk: completion_obj["content"] = response_obj["text"]
raise StopIteration if response_obj["is_finished"]:
else: model_response.choices[0].finish_reason = response_obj[
model_response.choices[0].finish_reason = "stop" "finish_reason"
self.sent_last_chunk = True ]
new_chunk = self.completion_stream self.sent_last_chunk = True
print_verbose(f"sagemaker chunk: {new_chunk}")
completion_obj["content"] = new_chunk
self.completion_stream = self.completion_stream[
len(self.completion_stream) :
]
elif self.custom_llm_provider == "petals": elif self.custom_llm_provider == "petals":
if len(self.completion_stream) == 0: if len(self.completion_stream) == 0:
if self.sent_last_chunk: if self.sent_last_chunk:
@ -7782,7 +8038,7 @@ class CustomStreamWrapper:
completion_obj["role"] = "assistant" completion_obj["role"] = "assistant"
self.sent_first_chunk = True self.sent_first_chunk = True
model_response.choices[0].delta = Delta(**completion_obj) model_response.choices[0].delta = Delta(**completion_obj)
print_verbose(f"model_response: {model_response}") print_verbose(f"returning model_response: {model_response}")
return model_response return model_response
else: else:
return return
@ -7839,6 +8095,27 @@ class CustomStreamWrapper:
original_exception=e, original_exception=e,
) )
def set_logging_event_loop(self, loop):
self.logging_loop = loop
async def your_async_function(self):
# Your asynchronous code here
return "Your asynchronous code is running"
def run_success_logging_in_thread(self, processed_chunk):
# Create an event loop for the new thread
## ASYNC LOGGING
if self.logging_loop is not None:
future = asyncio.run_coroutine_threadsafe(
self.logging_obj.async_success_handler(processed_chunk),
loop=self.logging_loop,
)
result = future.result()
else:
asyncio.run(self.logging_obj.async_success_handler(processed_chunk))
## SYNC LOGGING
self.logging_obj.success_handler(processed_chunk)
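
run_success_logging_in_thread lets the synchronous iteration path reuse an already-running event loop (registered via set_logging_event_loop) for async success logging, and falls back to asyncio.run when none was registered. Stripped of the litellm specifics, the threading pattern looks like this sketch:

import asyncio
import threading

async def async_log(message: str):
    # stand-in for an async success handler
    await asyncio.sleep(0)
    print("logged:", message)

def log_from_worker(loop, message: str):
    if loop is not None and loop.is_running():
        # schedule onto the existing loop from this worker thread and wait
        future = asyncio.run_coroutine_threadsafe(async_log(message), loop)
        future.result()
    else:
        # no loop registered: spin up a temporary one (the fallback branch)
        asyncio.run(async_log(message))

async def main():
    loop = asyncio.get_running_loop()
    worker = threading.Thread(target=log_from_worker, args=(loop, "chunk processed"))
    worker.start()
    # keep the loop responsive so the threadsafe coroutine can actually run
    while worker.is_alive():
        await asyncio.sleep(0.01)

asyncio.run(main())
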
## needs to handle the empty string case (even starting chunk can be an empty string) ## needs to handle the empty string case (even starting chunk can be an empty string)
def __next__(self): def __next__(self):
try: try:
@ -7857,8 +8134,9 @@ class CustomStreamWrapper:
continue continue
## LOGGING ## LOGGING
threading.Thread( threading.Thread(
target=self.logging_obj.success_handler, args=(response,) target=self.run_success_logging_in_thread, args=(response,)
).start() # log response ).start() # log response
# RETURN RESULT # RETURN RESULT
return response return response
except StopIteration: except StopIteration:
@ -7914,13 +8192,34 @@ class CustomStreamWrapper:
raise StopAsyncIteration raise StopAsyncIteration
else: # temporary patch for non-aiohttp async calls else: # temporary patch for non-aiohttp async calls
# example - boto3 bedrock llms # example - boto3 bedrock llms
processed_chunk = next(self) while True:
asyncio.create_task( if isinstance(self.completion_stream, str) or isinstance(
self.logging_obj.async_success_handler( self.completion_stream, bytes
processed_chunk, ):
) chunk = self.completion_stream
) else:
return processed_chunk chunk = next(self.completion_stream)
if chunk is not None and chunk != b"":
print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}")
processed_chunk = self.chunk_creator(chunk=chunk)
print_verbose(
f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}"
)
if processed_chunk is None:
continue
## LOGGING
threading.Thread(
target=self.logging_obj.success_handler,
args=(processed_chunk,),
).start() # log processed_chunk
asyncio.create_task(
self.logging_obj.async_success_handler(
processed_chunk,
)
)
# RETURN RESULT
return processed_chunk
except StopAsyncIteration: except StopAsyncIteration:
raise raise
except StopIteration: except StopIteration:
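
The fallback branch of __anext__ now loops over the underlying synchronous stream itself (the boto3/bedrock-style iterators that are not async), skipping empty chunks and logging each processed one, instead of delegating a single step to __next__. A stripped-down sketch of consuming a blocking iterator from async code in that style (the fake stream stands in for a provider SDK response):

import asyncio

def fake_provider_stream():
    # stand-in for a provider SDK that only exposes a blocking iterator
    yield b""
    yield b"hel"
    yield b"lo"

async def consume():
    stream = fake_provider_stream()
    while True:
        try:
            chunk = next(stream)  # blocking next(), as in the fallback branch
        except StopIteration:
            break
        if not chunk:             # skip empty chunks instead of yielding them
            continue
        print(chunk.decode(), end="")
    print()

asyncio.run(consume())
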


@ -62,6 +62,15 @@
"litellm_provider": "openai", "litellm_provider": "openai",
"mode": "chat" "mode": "chat"
}, },
"gpt-4-0125-preview": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00001,
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat"
},
"gpt-4-vision-preview": { "gpt-4-vision-preview": {
"max_tokens": 128000, "max_tokens": 128000,
"max_input_tokens": 128000, "max_input_tokens": 128000,
@ -143,6 +152,20 @@
"litellm_provider": "openai", "litellm_provider": "openai",
"mode": "chat" "mode": "chat"
}, },
"text-embedding-3-large": {
"max_tokens": 8191,
"input_cost_per_token": 0.00000013,
"output_cost_per_token": 0.000000,
"litellm_provider": "openai",
"mode": "embedding"
},
"text-embedding-3-small": {
"max_tokens": 8191,
"input_cost_per_token": 0.00000002,
"output_cost_per_token": 0.000000,
"litellm_provider": "openai",
"mode": "embedding"
},
"text-embedding-ada-002": { "text-embedding-ada-002": {
"max_tokens": 8191, "max_tokens": 8191,
"input_cost_per_token": 0.0000001, "input_cost_per_token": 0.0000001,
@ -906,6 +929,14 @@
"litellm_provider": "bedrock", "litellm_provider": "bedrock",
"mode": "chat" "mode": "chat"
}, },
"amazon.titan-embed-text-v1": {
"max_tokens": 8192,
"output_vector_size": 1536,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0,
"litellm_provider": "bedrock",
"mode": "embedding"
},
"anthropic.claude-v1": { "anthropic.claude-v1": {
"max_tokens": 100000, "max_tokens": 100000,
"max_output_tokens": 8191, "max_output_tokens": 8191,

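
The cost-map additions above (gpt-4-0125-preview, the text-embedding-3 pair and amazon.titan-embed-text-v1) surface through litellm.model_cost once a build containing this JSON is installed. A small lookup sketch; the printed values depend on the installed map, so treat them as indicative:

import litellm

for name in ("gpt-4-0125-preview", "text-embedding-3-small", "amazon.titan-embed-text-v1"):
    entry = litellm.model_cost.get(name)
    if entry is None:
        print(f"{name}: not in this litellm version's cost map")
        continue
    print(name, entry.get("mode"), entry.get("input_cost_per_token"), entry.get("output_cost_per_token"))
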
438
poetry.lock generated
View file

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. # This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]] [[package]]
name = "aiohttp" name = "aiohttp"
@ -169,6 +169,34 @@ doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-
test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
trio = ["trio (<0.22)"] trio = ["trio (<0.22)"]
[[package]]
name = "apscheduler"
version = "3.10.4"
description = "In-process task scheduler with Cron-like capabilities"
optional = true
python-versions = ">=3.6"
files = [
{file = "APScheduler-3.10.4-py3-none-any.whl", hash = "sha256:fb91e8a768632a4756a585f79ec834e0e27aad5860bac7eaa523d9ccefd87661"},
{file = "APScheduler-3.10.4.tar.gz", hash = "sha256:e6df071b27d9be898e486bc7940a7be50b4af2e9da7c08f0744a96d4bd4cef4a"},
]
[package.dependencies]
pytz = "*"
six = ">=1.4.0"
tzlocal = ">=2.0,<3.dev0 || >=4.dev0"
[package.extras]
doc = ["sphinx", "sphinx-rtd-theme"]
gevent = ["gevent"]
mongodb = ["pymongo (>=3.0)"]
redis = ["redis (>=3.0)"]
rethinkdb = ["rethinkdb (>=2.4.0)"]
sqlalchemy = ["sqlalchemy (>=1.4)"]
testing = ["pytest", "pytest-asyncio", "pytest-cov", "pytest-tornado5"]
tornado = ["tornado (>=4.3)"]
twisted = ["twisted"]
zookeeper = ["kazoo"]
[[package]] [[package]]
name = "async-timeout" name = "async-timeout"
version = "4.0.3" version = "4.0.3"
@ -655,20 +683,20 @@ smmap = ">=3.0.1,<6"
[[package]] [[package]]
name = "gitpython" name = "gitpython"
version = "3.1.40" version = "3.1.41"
description = "GitPython is a Python library used to interact with Git repositories" description = "GitPython is a Python library used to interact with Git repositories"
optional = true optional = true
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "GitPython-3.1.40-py3-none-any.whl", hash = "sha256:cf14627d5a8049ffbf49915732e5eddbe8134c3bdb9d476e6182b676fc573f8a"}, {file = "GitPython-3.1.41-py3-none-any.whl", hash = "sha256:c36b6634d069b3f719610175020a9aed919421c87552185b085e04fbbdb10b7c"},
{file = "GitPython-3.1.40.tar.gz", hash = "sha256:22b126e9ffb671fdd0c129796343a02bf67bf2994b35449ffc9321aa755e18a4"}, {file = "GitPython-3.1.41.tar.gz", hash = "sha256:ed66e624884f76df22c8e16066d567aaa5a37d5b5fa19db2c6df6f7156db9048"},
] ]
[package.dependencies] [package.dependencies]
gitdb = ">=4.0.1,<5" gitdb = ">=4.0.1,<5"
[package.extras] [package.extras]
test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest", "pytest-cov", "pytest-instafail", "pytest-subtests", "pytest-sugar"] test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "sumtypes"]
[[package]] [[package]]
name = "gunicorn" name = "gunicorn"
@ -748,13 +776,13 @@ socks = ["socksio (==1.*)"]
[[package]] [[package]]
name = "huggingface-hub" name = "huggingface-hub"
version = "0.20.1" version = "0.20.2"
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
optional = false optional = false
python-versions = ">=3.8.0" python-versions = ">=3.8.0"
files = [ files = [
{file = "huggingface_hub-0.20.1-py3-none-any.whl", hash = "sha256:ecfdea395a8bc68cd160106c5bd857f7e010768d95f9e1862a779010cc304831"}, {file = "huggingface_hub-0.20.2-py3-none-any.whl", hash = "sha256:53752eda2239d30a470c307a61cf9adcf136bc77b0a734338c7d04941af560d8"},
{file = "huggingface_hub-0.20.1.tar.gz", hash = "sha256:8c88c4c3c8853e22f2dfb4d84c3d493f4e1af52fb3856a90e1eeddcf191ddbb1"}, {file = "huggingface_hub-0.20.2.tar.gz", hash = "sha256:215c5fceff631030c7a3d19ba7b588921c908b3f21eef31d160ebc245b200ff6"},
] ]
[package.dependencies] [package.dependencies]
@ -791,13 +819,13 @@ files = [
[[package]] [[package]]
name = "importlib-metadata" name = "importlib-metadata"
version = "6.11.0" version = "7.0.1"
description = "Read metadata from Python packages" description = "Read metadata from Python packages"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "importlib_metadata-6.11.0-py3-none-any.whl", hash = "sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b"}, {file = "importlib_metadata-7.0.1-py3-none-any.whl", hash = "sha256:4805911c3a4ec7c3966410053e9ec6a1fecd629117df5adee56dfc9432a1081e"},
{file = "importlib_metadata-6.11.0.tar.gz", hash = "sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443"}, {file = "importlib_metadata-7.0.1.tar.gz", hash = "sha256:f238736bb06590ae52ac1fab06a3a9ef1d8dce2b7a35b5ab329371d6c8f5d2cc"},
] ]
[package.dependencies] [package.dependencies]
@ -839,13 +867,13 @@ files = [
[[package]] [[package]]
name = "jinja2" name = "jinja2"
version = "3.1.2" version = "3.1.3"
description = "A very fast and expressive template engine." description = "A very fast and expressive template engine."
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"},
{file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"},
] ]
[package.dependencies] [package.dependencies]
@ -856,13 +884,13 @@ i18n = ["Babel (>=2.7)"]
[[package]] [[package]]
name = "jsonschema" name = "jsonschema"
version = "4.20.0" version = "4.21.0"
description = "An implementation of JSON Schema validation for Python" description = "An implementation of JSON Schema validation for Python"
optional = true optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "jsonschema-4.20.0-py3-none-any.whl", hash = "sha256:ed6231f0429ecf966f5bc8dfef245998220549cbbcf140f913b7464c52c3b6b3"}, {file = "jsonschema-4.21.0-py3-none-any.whl", hash = "sha256:70a09719d375c0a2874571b363c8a24be7df8071b80c9aa76bc4551e7297c63c"},
{file = "jsonschema-4.20.0.tar.gz", hash = "sha256:4f614fd46d8d61258610998997743ec5492a648b33cf478c1ddc23ed4598a5fa"}, {file = "jsonschema-4.21.0.tar.gz", hash = "sha256:3ba18e27f7491ea4a1b22edce00fb820eec968d397feb3f9cb61d5894bb38167"},
] ]
[package.dependencies] [package.dependencies]
@ -1130,13 +1158,13 @@ files = [
[[package]] [[package]]
name = "openai" name = "openai"
version = "1.6.1" version = "1.10.0"
description = "The official Python library for the openai API" description = "The official Python library for the openai API"
optional = false optional = false
python-versions = ">=3.7.1" python-versions = ">=3.7.1"
files = [ files = [
{file = "openai-1.6.1-py3-none-any.whl", hash = "sha256:bc9f774838d67ac29fb24cdeb2d58faf57de8b311085dcd1348f7aa02a96c7ee"}, {file = "openai-1.10.0-py3-none-any.whl", hash = "sha256:aa69e97d0223ace9835fbf9c997abe9ee95318f684fd2de6d02c870700c71ebc"},
{file = "openai-1.6.1.tar.gz", hash = "sha256:d553ca9dbf9486b08e75b09e8671e4f638462aaadccfced632bf490fc3d75fa2"}, {file = "openai-1.10.0.tar.gz", hash = "sha256:208886cb501b930dc63f48d51db9c15e5380380f80516d07332adad67c9f1053"},
] ]
[package.dependencies] [package.dependencies]
@ -1301,70 +1329,88 @@ files = [
[[package]] [[package]]
name = "pillow" name = "pillow"
version = "10.1.0" version = "10.2.0"
description = "Python Imaging Library (Fork)" description = "Python Imaging Library (Fork)"
optional = true optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "Pillow-10.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1ab05f3db77e98f93964697c8efc49c7954b08dd61cff526b7f2531a22410106"}, {file = "pillow-10.2.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:7823bdd049099efa16e4246bdf15e5a13dbb18a51b68fa06d6c1d4d8b99a796e"},
{file = "Pillow-10.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6932a7652464746fcb484f7fc3618e6503d2066d853f68a4bd97193a3996e273"}, {file = "pillow-10.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:83b2021f2ade7d1ed556bc50a399127d7fb245e725aa0113ebd05cfe88aaf588"},
{file = "Pillow-10.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f63b5a68daedc54c7c3464508d8c12075e56dcfbd42f8c1bf40169061ae666"}, {file = "pillow-10.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fad5ff2f13d69b7e74ce5b4ecd12cc0ec530fcee76356cac6742785ff71c452"},
{file = "Pillow-10.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0949b55eb607898e28eaccb525ab104b2d86542a85c74baf3a6dc24002edec2"}, {file = "pillow-10.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da2b52b37dad6d9ec64e653637a096905b258d2fc2b984c41ae7d08b938a67e4"},
{file = "Pillow-10.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ae88931f93214777c7a3aa0a8f92a683f83ecde27f65a45f95f22d289a69e593"}, {file = "pillow-10.2.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:47c0995fc4e7f79b5cfcab1fc437ff2890b770440f7696a3ba065ee0fd496563"},
{file = "Pillow-10.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b0eb01ca85b2361b09480784a7931fc648ed8b7836f01fb9241141b968feb1db"}, {file = "pillow-10.2.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:322bdf3c9b556e9ffb18f93462e5f749d3444ce081290352c6070d014c93feb2"},
{file = "Pillow-10.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d27b5997bdd2eb9fb199982bb7eb6164db0426904020dc38c10203187ae2ff2f"}, {file = "pillow-10.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:51f1a1bffc50e2e9492e87d8e09a17c5eea8409cda8d3f277eb6edc82813c17c"},
{file = "Pillow-10.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7df5608bc38bd37ef585ae9c38c9cd46d7c81498f086915b0f97255ea60c2818"}, {file = "pillow-10.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:69ffdd6120a4737710a9eee73e1d2e37db89b620f702754b8f6e62594471dee0"},
{file = "Pillow-10.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:41f67248d92a5e0a2076d3517d8d4b1e41a97e2df10eb8f93106c89107f38b57"}, {file = "pillow-10.2.0-cp310-cp310-win32.whl", hash = "sha256:c6dafac9e0f2b3c78df97e79af707cdc5ef8e88208d686a4847bab8266870023"},
{file = "Pillow-10.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1fb29c07478e6c06a46b867e43b0bcdb241b44cc52be9bc25ce5944eed4648e7"}, {file = "pillow-10.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:aebb6044806f2e16ecc07b2a2637ee1ef67a11840a66752751714a0d924adf72"},
{file = "Pillow-10.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2cdc65a46e74514ce742c2013cd4a2d12e8553e3a2563c64879f7c7e4d28bce7"}, {file = "pillow-10.2.0-cp310-cp310-win_arm64.whl", hash = "sha256:7049e301399273a0136ff39b84c3678e314f2158f50f517bc50285fb5ec847ad"},
{file = "Pillow-10.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50d08cd0a2ecd2a8657bd3d82c71efd5a58edb04d9308185d66c3a5a5bed9610"}, {file = "pillow-10.2.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:35bb52c37f256f662abdfa49d2dfa6ce5d93281d323a9af377a120e89a9eafb5"},
{file = "Pillow-10.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:062a1610e3bc258bff2328ec43f34244fcec972ee0717200cb1425214fe5b839"}, {file = "pillow-10.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9c23f307202661071d94b5e384e1e1dc7dfb972a28a2310e4ee16103e66ddb67"},
{file = "Pillow-10.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:61f1a9d247317fa08a308daaa8ee7b3f760ab1809ca2da14ecc88ae4257d6172"}, {file = "pillow-10.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:773efe0603db30c281521a7c0214cad7836c03b8ccff897beae9b47c0b657d61"},
{file = "Pillow-10.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a646e48de237d860c36e0db37ecaecaa3619e6f3e9d5319e527ccbc8151df061"}, {file = "pillow-10.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11fa2e5984b949b0dd6d7a94d967743d87c577ff0b83392f17cb3990d0d2fd6e"},
{file = "Pillow-10.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:47e5bf85b80abc03be7455c95b6d6e4896a62f6541c1f2ce77a7d2bb832af262"}, {file = "pillow-10.2.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:716d30ed977be8b37d3ef185fecb9e5a1d62d110dfbdcd1e2a122ab46fddb03f"},
{file = "Pillow-10.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a92386125e9ee90381c3369f57a2a50fa9e6aa8b1cf1d9c4b200d41a7dd8e992"}, {file = "pillow-10.2.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a086c2af425c5f62a65e12fbf385f7c9fcb8f107d0849dba5839461a129cf311"},
{file = "Pillow-10.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:0f7c276c05a9767e877a0b4c5050c8bee6a6d960d7f0c11ebda6b99746068c2a"}, {file = "pillow-10.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c8de2789052ed501dd829e9cae8d3dcce7acb4777ea4a479c14521c942d395b1"},
{file = "Pillow-10.1.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:a89b8312d51715b510a4fe9fc13686283f376cfd5abca8cd1c65e4c76e21081b"}, {file = "pillow-10.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:609448742444d9290fd687940ac0b57fb35e6fd92bdb65386e08e99af60bf757"},
{file = "Pillow-10.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:00f438bb841382b15d7deb9a05cc946ee0f2c352653c7aa659e75e592f6fa17d"}, {file = "pillow-10.2.0-cp311-cp311-win32.whl", hash = "sha256:823ef7a27cf86df6597fa0671066c1b596f69eba53efa3d1e1cb8b30f3533068"},
{file = "Pillow-10.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d929a19f5469b3f4df33a3df2983db070ebb2088a1e145e18facbc28cae5b27"}, {file = "pillow-10.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:1da3b2703afd040cf65ec97efea81cfba59cdbed9c11d8efc5ab09df9509fc56"},
{file = "Pillow-10.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a92109192b360634a4489c0c756364c0c3a2992906752165ecb50544c251312"}, {file = "pillow-10.2.0-cp311-cp311-win_arm64.whl", hash = "sha256:edca80cbfb2b68d7b56930b84a0e45ae1694aeba0541f798e908a49d66b837f1"},
{file = "Pillow-10.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:0248f86b3ea061e67817c47ecbe82c23f9dd5d5226200eb9090b3873d3ca32de"}, {file = "pillow-10.2.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:1b5e1b74d1bd1b78bc3477528919414874748dd363e6272efd5abf7654e68bef"},
{file = "Pillow-10.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9882a7451c680c12f232a422730f986a1fcd808da0fd428f08b671237237d651"}, {file = "pillow-10.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0eae2073305f451d8ecacb5474997c08569fb4eb4ac231ffa4ad7d342fdc25ac"},
{file = "Pillow-10.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1c3ac5423c8c1da5928aa12c6e258921956757d976405e9467c5f39d1d577a4b"}, {file = "pillow-10.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7c2286c23cd350b80d2fc9d424fc797575fb16f854b831d16fd47ceec078f2c"},
{file = "Pillow-10.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:806abdd8249ba3953c33742506fe414880bad78ac25cc9a9b1c6ae97bedd573f"}, {file = "pillow-10.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e23412b5c41e58cec602f1135c57dfcf15482013ce6e5f093a86db69646a5aa"},
{file = "Pillow-10.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:eaed6977fa73408b7b8a24e8b14e59e1668cfc0f4c40193ea7ced8e210adf996"}, {file = "pillow-10.2.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:52a50aa3fb3acb9cf7213573ef55d31d6eca37f5709c69e6858fe3bc04a5c2a2"},
{file = "Pillow-10.1.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:fe1e26e1ffc38be097f0ba1d0d07fcade2bcfd1d023cda5b29935ae8052bd793"}, {file = "pillow-10.2.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:127cee571038f252a552760076407f9cff79761c3d436a12af6000cd182a9d04"},
{file = "Pillow-10.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7a7e3daa202beb61821c06d2517428e8e7c1aab08943e92ec9e5755c2fc9ba5e"}, {file = "pillow-10.2.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:8d12251f02d69d8310b046e82572ed486685c38f02176bd08baf216746eb947f"},
{file = "Pillow-10.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24fadc71218ad2b8ffe437b54876c9382b4a29e030a05a9879f615091f42ffc2"}, {file = "pillow-10.2.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:54f1852cd531aa981bc0965b7d609f5f6cc8ce8c41b1139f6ed6b3c54ab82bfb"},
{file = "Pillow-10.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa1d323703cfdac2036af05191b969b910d8f115cf53093125e4058f62012c9a"}, {file = "pillow-10.2.0-cp312-cp312-win32.whl", hash = "sha256:257d8788df5ca62c980314053197f4d46eefedf4e6175bc9412f14412ec4ea2f"},
{file = "Pillow-10.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:912e3812a1dbbc834da2b32299b124b5ddcb664ed354916fd1ed6f193f0e2d01"}, {file = "pillow-10.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:154e939c5f0053a383de4fd3d3da48d9427a7e985f58af8e94d0b3c9fcfcf4f9"},
{file = "Pillow-10.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:7dbaa3c7de82ef37e7708521be41db5565004258ca76945ad74a8e998c30af8d"}, {file = "pillow-10.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:f379abd2f1e3dddb2b61bc67977a6b5a0a3f7485538bcc6f39ec76163891ee48"},
{file = "Pillow-10.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9d7bc666bd8c5a4225e7ac71f2f9d12466ec555e89092728ea0f5c0c2422ea80"}, {file = "pillow-10.2.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:8373c6c251f7ef8bda6675dd6d2b3a0fcc31edf1201266b5cf608b62a37407f9"},
{file = "Pillow-10.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baada14941c83079bf84c037e2d8b7506ce201e92e3d2fa0d1303507a8538212"}, {file = "pillow-10.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:870ea1ada0899fd0b79643990809323b389d4d1d46c192f97342eeb6ee0b8483"},
{file = "Pillow-10.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:2ef6721c97894a7aa77723740a09547197533146fba8355e86d6d9a4a1056b14"}, {file = "pillow-10.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4b6b1e20608493548b1f32bce8cca185bf0480983890403d3b8753e44077129"},
{file = "Pillow-10.1.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0a026c188be3b443916179f5d04548092e253beb0c3e2ee0a4e2cdad72f66099"}, {file = "pillow-10.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3031709084b6e7852d00479fd1d310b07d0ba82765f973b543c8af5061cf990e"},
{file = "Pillow-10.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:04f6f6149f266a100374ca3cc368b67fb27c4af9f1cc8cb6306d849dcdf12616"}, {file = "pillow-10.2.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:3ff074fc97dd4e80543a3e91f69d58889baf2002b6be64347ea8cf5533188213"},
{file = "Pillow-10.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb40c011447712d2e19cc261c82655f75f32cb724788df315ed992a4d65696bb"}, {file = "pillow-10.2.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:cb4c38abeef13c61d6916f264d4845fab99d7b711be96c326b84df9e3e0ff62d"},
{file = "Pillow-10.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a8413794b4ad9719346cd9306118450b7b00d9a15846451549314a58ac42219"}, {file = "pillow-10.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b1b3020d90c2d8e1dae29cf3ce54f8094f7938460fb5ce8bc5c01450b01fbaf6"},
{file = "Pillow-10.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c9aeea7b63edb7884b031a35305629a7593272b54f429a9869a4f63a1bf04c34"}, {file = "pillow-10.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:170aeb00224ab3dc54230c797f8404507240dd868cf52066f66a41b33169bdbe"},
{file = "Pillow-10.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b4005fee46ed9be0b8fb42be0c20e79411533d1fd58edabebc0dd24626882cfd"}, {file = "pillow-10.2.0-cp38-cp38-win32.whl", hash = "sha256:c4225f5220f46b2fde568c74fca27ae9771536c2e29d7c04f4fb62c83275ac4e"},
{file = "Pillow-10.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4d0152565c6aa6ebbfb1e5d8624140a440f2b99bf7afaafbdbf6430426497f28"}, {file = "pillow-10.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:0689b5a8c5288bc0504d9fcee48f61a6a586b9b98514d7d29b840143d6734f39"},
{file = "Pillow-10.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d921bc90b1defa55c9917ca6b6b71430e4286fc9e44c55ead78ca1a9f9eba5f2"}, {file = "pillow-10.2.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:b792a349405fbc0163190fde0dc7b3fef3c9268292586cf5645598b48e63dc67"},
{file = "Pillow-10.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cfe96560c6ce2f4c07d6647af2d0f3c54cc33289894ebd88cfbb3bcd5391e256"}, {file = "pillow-10.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c570f24be1e468e3f0ce7ef56a89a60f0e05b30a3669a459e419c6eac2c35364"},
{file = "Pillow-10.1.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:937bdc5a7f5343d1c97dc98149a0be7eb9704e937fe3dc7140e229ae4fc572a7"}, {file = "pillow-10.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8ecd059fdaf60c1963c58ceb8997b32e9dc1b911f5da5307aab614f1ce5c2fb"},
{file = "Pillow-10.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1c25762197144e211efb5f4e8ad656f36c8d214d390585d1d21281f46d556ba"}, {file = "pillow-10.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c365fd1703040de1ec284b176d6af5abe21b427cb3a5ff68e0759e1e313a5e7e"},
{file = "Pillow-10.1.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:afc8eef765d948543a4775f00b7b8c079b3321d6b675dde0d02afa2ee23000b4"}, {file = "pillow-10.2.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:70c61d4c475835a19b3a5aa42492409878bbca7438554a1f89d20d58a7c75c01"},
{file = "Pillow-10.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:883f216eac8712b83a63f41b76ddfb7b2afab1b74abbb413c5df6680f071a6b9"}, {file = "pillow-10.2.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b6f491cdf80ae540738859d9766783e3b3c8e5bd37f5dfa0b76abdecc5081f13"},
{file = "Pillow-10.1.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:b920e4d028f6442bea9a75b7491c063f0b9a3972520731ed26c83e254302eb1e"}, {file = "pillow-10.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d189550615b4948f45252d7f005e53c2040cea1af5b60d6f79491a6e147eef7"},
{file = "Pillow-10.1.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c41d960babf951e01a49c9746f92c5a7e0d939d1652d7ba30f6b3090f27e412"}, {file = "pillow-10.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:49d9ba1ed0ef3e061088cd1e7538a0759aab559e2e0a80a36f9fd9d8c0c21591"},
{file = "Pillow-10.1.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1fafabe50a6977ac70dfe829b2d5735fd54e190ab55259ec8aea4aaea412fa0b"}, {file = "pillow-10.2.0-cp39-cp39-win32.whl", hash = "sha256:babf5acfede515f176833ed6028754cbcd0d206f7f614ea3447d67c33be12516"},
{file = "Pillow-10.1.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3b834f4b16173e5b92ab6566f0473bfb09f939ba14b23b8da1f54fa63e4b623f"}, {file = "pillow-10.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:0304004f8067386b477d20a518b50f3fa658a28d44e4116970abfcd94fac34a8"},
{file = "Pillow-10.1.0.tar.gz", hash = "sha256:e6bf8de6c36ed96c86ea3b6e1d5273c53f46ef518a062464cd7ef5dd2cf92e38"}, {file = "pillow-10.2.0-cp39-cp39-win_arm64.whl", hash = "sha256:0fb3e7fc88a14eacd303e90481ad983fd5b69c761e9e6ef94c983f91025da869"},
{file = "pillow-10.2.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:322209c642aabdd6207517e9739c704dc9f9db943015535783239022002f054a"},
{file = "pillow-10.2.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3eedd52442c0a5ff4f887fab0c1c0bb164d8635b32c894bc1faf4c618dd89df2"},
{file = "pillow-10.2.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb28c753fd5eb3dd859b4ee95de66cc62af91bcff5db5f2571d32a520baf1f04"},
{file = "pillow-10.2.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:33870dc4653c5017bf4c8873e5488d8f8d5f8935e2f1fb9a2208c47cdd66efd2"},
{file = "pillow-10.2.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:3c31822339516fb3c82d03f30e22b1d038da87ef27b6a78c9549888f8ceda39a"},
{file = "pillow-10.2.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a2b56ba36e05f973d450582fb015594aaa78834fefe8dfb8fcd79b93e64ba4c6"},
{file = "pillow-10.2.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:d8e6aeb9201e655354b3ad049cb77d19813ad4ece0df1249d3c793de3774f8c7"},
{file = "pillow-10.2.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:2247178effb34a77c11c0e8ac355c7a741ceca0a732b27bf11e747bbc950722f"},
{file = "pillow-10.2.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15587643b9e5eb26c48e49a7b33659790d28f190fc514a322d55da2fb5c2950e"},
{file = "pillow-10.2.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753cd8f2086b2b80180d9b3010dd4ed147efc167c90d3bf593fe2af21265e5a5"},
{file = "pillow-10.2.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7c8f97e8e7a9009bcacbe3766a36175056c12f9a44e6e6f2d5caad06dcfbf03b"},
{file = "pillow-10.2.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d1b35bcd6c5543b9cb547dee3150c93008f8dd0f1fef78fc0cd2b141c5baf58a"},
{file = "pillow-10.2.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fe4c15f6c9285dc54ce6553a3ce908ed37c8f3825b5a51a15c91442bb955b868"},
{file = "pillow-10.2.0.tar.gz", hash = "sha256:e87f0b2c78157e12d7686b27d63c070fd65d994e8ddae6f328e0dcf4a0cd007e"},
] ]
[package.extras] [package.extras]
docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"]
fpx = ["olefile"]
mic = ["olefile"]
tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"]
typing = ["typing-extensions"]
xmp = ["defusedxml"]
[[package]] [[package]]
name = "pkgutil-resolve-name" name = "pkgutil-resolve-name"
@ -1409,22 +1455,22 @@ testing = ["pytest", "pytest-benchmark"]
[[package]] [[package]]
name = "protobuf" name = "protobuf"
version = "4.25.1" version = "4.25.2"
description = "" description = ""
optional = true optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "protobuf-4.25.1-cp310-abi3-win32.whl", hash = "sha256:193f50a6ab78a970c9b4f148e7c750cfde64f59815e86f686c22e26b4fe01ce7"}, {file = "protobuf-4.25.2-cp310-abi3-win32.whl", hash = "sha256:b50c949608682b12efb0b2717f53256f03636af5f60ac0c1d900df6213910fd6"},
{file = "protobuf-4.25.1-cp310-abi3-win_amd64.whl", hash = "sha256:3497c1af9f2526962f09329fd61a36566305e6c72da2590ae0d7d1322818843b"}, {file = "protobuf-4.25.2-cp310-abi3-win_amd64.whl", hash = "sha256:8f62574857ee1de9f770baf04dde4165e30b15ad97ba03ceac65f760ff018ac9"},
{file = "protobuf-4.25.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:0bf384e75b92c42830c0a679b0cd4d6e2b36ae0cf3dbb1e1dfdda48a244f4bcd"}, {file = "protobuf-4.25.2-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:2db9f8fa64fbdcdc93767d3cf81e0f2aef176284071507e3ede160811502fd3d"},
{file = "protobuf-4.25.1-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:0f881b589ff449bf0b931a711926e9ddaad3b35089cc039ce1af50b21a4ae8cb"}, {file = "protobuf-4.25.2-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:10894a2885b7175d3984f2be8d9850712c57d5e7587a2410720af8be56cdaf62"},
{file = "protobuf-4.25.1-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:ca37bf6a6d0046272c152eea90d2e4ef34593aaa32e8873fc14c16440f22d4b7"}, {file = "protobuf-4.25.2-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:fc381d1dd0516343f1440019cedf08a7405f791cd49eef4ae1ea06520bc1c020"},
{file = "protobuf-4.25.1-cp38-cp38-win32.whl", hash = "sha256:abc0525ae2689a8000837729eef7883b9391cd6aa7950249dcf5a4ede230d5dd"}, {file = "protobuf-4.25.2-cp38-cp38-win32.whl", hash = "sha256:33a1aeef4b1927431d1be780e87b641e322b88d654203a9e9d93f218ee359e61"},
{file = "protobuf-4.25.1-cp38-cp38-win_amd64.whl", hash = "sha256:1484f9e692091450e7edf418c939e15bfc8fc68856e36ce399aed6889dae8bb0"}, {file = "protobuf-4.25.2-cp38-cp38-win_amd64.whl", hash = "sha256:47f3de503fe7c1245f6f03bea7e8d3ec11c6c4a2ea9ef910e3221c8a15516d62"},
{file = "protobuf-4.25.1-cp39-cp39-win32.whl", hash = "sha256:8bdbeaddaac52d15c6dce38c71b03038ef7772b977847eb6d374fc86636fa510"}, {file = "protobuf-4.25.2-cp39-cp39-win32.whl", hash = "sha256:5e5c933b4c30a988b52e0b7c02641760a5ba046edc5e43d3b94a74c9fc57c1b3"},
{file = "protobuf-4.25.1-cp39-cp39-win_amd64.whl", hash = "sha256:becc576b7e6b553d22cbdf418686ee4daa443d7217999125c045ad56322dda10"}, {file = "protobuf-4.25.2-cp39-cp39-win_amd64.whl", hash = "sha256:d66a769b8d687df9024f2985d5137a337f957a0916cf5464d1513eee96a63ff0"},
{file = "protobuf-4.25.1-py3-none-any.whl", hash = "sha256:a19731d5e83ae4737bb2a089605e636077ac001d18781b3cf489b9546c7c80d6"}, {file = "protobuf-4.25.2-py3-none-any.whl", hash = "sha256:a8b7a98d4ce823303145bf3c1a8bdb0f2f4642a414b196f04ad9853ed0c8f830"},
{file = "protobuf-4.25.1.tar.gz", hash = "sha256:57d65074b4f5baa4ab5da1605c02be90ac20c8b40fb137d6a8df9f416b0d0ce2"}, {file = "protobuf-4.25.2.tar.gz", hash = "sha256:fe599e175cb347efc8ee524bcd4b902d11f7262c0e569ececcb89995c15f0a5e"},
] ]
[[package]] [[package]]
@@ -1807,13 +1853,13 @@ ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)"
[[package]] [[package]]
name = "referencing" name = "referencing"
version = "0.32.0" version = "0.32.1"
description = "JSON Referencing + Python" description = "JSON Referencing + Python"
optional = true optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "referencing-0.32.0-py3-none-any.whl", hash = "sha256:bdcd3efb936f82ff86f993093f6da7435c7de69a3b3a5a06678a6050184bee99"}, {file = "referencing-0.32.1-py3-none-any.whl", hash = "sha256:7e4dc12271d8e15612bfe35792f5ea1c40970dadf8624602e33db2758f7ee554"},
{file = "referencing-0.32.0.tar.gz", hash = "sha256:689e64fe121843dcfd57b71933318ef1f91188ffb45367332700a86ac8fd6161"}, {file = "referencing-0.32.1.tar.gz", hash = "sha256:3c57da0513e9563eb7e203ebe9bb3a1b509b042016433bd1e45a2853466c3dd3"},
] ]
[package.dependencies] [package.dependencies]
@@ -1964,110 +2010,110 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"]
[[package]] [[package]]
name = "rpds-py" name = "rpds-py"
version = "0.16.2" version = "0.17.1"
description = "Python bindings to Rust's persistent data structures (rpds)" description = "Python bindings to Rust's persistent data structures (rpds)"
optional = true optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "rpds_py-0.16.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:509b617ac787cd1149600e731db9274ebbef094503ca25158e6f23edaba1ca8f"}, {file = "rpds_py-0.17.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:4128980a14ed805e1b91a7ed551250282a8ddf8201a4e9f8f5b7e6225f54170d"},
{file = "rpds_py-0.16.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:413b9c17388bbd0d87a329d8e30c1a4c6e44e2bb25457f43725a8e6fe4161e9e"}, {file = "rpds_py-0.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ff1dcb8e8bc2261a088821b2595ef031c91d499a0c1b031c152d43fe0a6ecec8"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2946b120718eba9af2b4dd103affc1164a87b9e9ebff8c3e4c05d7b7a7e274e2"}, {file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d65e6b4f1443048eb7e833c2accb4fa7ee67cc7d54f31b4f0555b474758bee55"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:35ae5ece284cf36464eb160880018cf6088a9ac5ddc72292a6092b6ef3f4da53"}, {file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a71169d505af63bb4d20d23a8fbd4c6ce272e7bce6cc31f617152aa784436f29"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc6a7620ba7639a3db6213da61312cb4aa9ac0ca6e00dc1cbbdc21c2aa6eb57"}, {file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:436474f17733c7dca0fbf096d36ae65277e8645039df12a0fa52445ca494729d"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8cb6fe8ecdfffa0e711a75c931fb39f4ba382b4b3ccedeca43f18693864fe850"}, {file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10162fe3f5f47c37ebf6d8ff5a2368508fe22007e3077bf25b9c7d803454d921"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dace7b26a13353e24613417ce2239491b40a6ad44e5776a18eaff7733488b44"}, {file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:720215373a280f78a1814becb1312d4e4d1077b1202a56d2b0815e95ccb99ce9"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1bdbc5fcb04a7309074de6b67fa9bc4b418ab3fc435fec1f2779a0eced688d04"}, {file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:70fcc6c2906cfa5c6a552ba7ae2ce64b6c32f437d8f3f8eea49925b278a61453"},
{file = "rpds_py-0.16.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f42e25c016927e2a6b1ce748112c3ab134261fc2ddc867e92d02006103e1b1b7"}, {file = "rpds_py-0.17.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:91e5a8200e65aaac342a791272c564dffcf1281abd635d304d6c4e6b495f29dc"},
{file = "rpds_py-0.16.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:eab36eae3f3e8e24b05748ec9acc66286662f5d25c52ad70cadab544e034536b"}, {file = "rpds_py-0.17.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:99f567dae93e10be2daaa896e07513dd4bf9c2ecf0576e0533ac36ba3b1d5394"},
{file = "rpds_py-0.16.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0474df4ade9a3b4af96c3d36eb81856cb9462e4c6657d4caecfd840d2a13f3c9"}, {file = "rpds_py-0.17.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:24e4900a6643f87058a27320f81336d527ccfe503984528edde4bb660c8c8d59"},
{file = "rpds_py-0.16.2-cp310-none-win32.whl", hash = "sha256:84c5a4d1f9dd7e2d2c44097fb09fffe728629bad31eb56caf97719e55575aa82"}, {file = "rpds_py-0.17.1-cp310-none-win32.whl", hash = "sha256:0bfb09bf41fe7c51413f563373e5f537eaa653d7adc4830399d4e9bdc199959d"},
{file = "rpds_py-0.16.2-cp310-none-win_amd64.whl", hash = "sha256:2bd82db36cd70b3628c0c57d81d2438e8dd4b7b32a6a9f25f24ab0e657cb6c4e"}, {file = "rpds_py-0.17.1-cp310-none-win_amd64.whl", hash = "sha256:20de7b7179e2031a04042e85dc463a93a82bc177eeba5ddd13ff746325558aa6"},
{file = "rpds_py-0.16.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:adc0c3d6fc6ae35fee3e4917628983f6ce630d513cbaad575b4517d47e81b4bb"}, {file = "rpds_py-0.17.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:65dcf105c1943cba45d19207ef51b8bc46d232a381e94dd38719d52d3980015b"},
{file = "rpds_py-0.16.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec23fcad480e77ede06cf4127a25fc440f7489922e17fc058f426b5256ee0edb"}, {file = "rpds_py-0.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:01f58a7306b64e0a4fe042047dd2b7d411ee82e54240284bab63e325762c1147"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07aab64e2808c3ebac2a44f67e9dc0543812b715126dfd6fe4264df527556cb6"}, {file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:071bc28c589b86bc6351a339114fb7a029f5cddbaca34103aa573eba7b482382"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a4ebb8b20bd09c5ce7884c8f0388801100f5e75e7f733b1b6613c713371feefc"}, {file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ae35e8e6801c5ab071b992cb2da958eee76340e6926ec693b5ff7d6381441745"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a3d7e2ea25d3517c6d7e5a1cc3702cffa6bd18d9ef8d08d9af6717fc1c700eed"}, {file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:149c5cd24f729e3567b56e1795f74577aa3126c14c11e457bec1b1c90d212e38"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f28ac0e8e7242d140f99402a903a2c596ab71550272ae9247ad78f9a932b5698"}, {file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e796051f2070f47230c745d0a77a91088fbee2cc0502e9b796b9c6471983718c"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19f00f57fdd38db4bb5ad09f9ead1b535332dbf624200e9029a45f1f35527ebb"}, {file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60e820ee1004327609b28db8307acc27f5f2e9a0b185b2064c5f23e815f248f8"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3da5a4c56953bdbf6d04447c3410309616c54433146ccdb4a277b9cb499bc10e"}, {file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1957a2ab607f9added64478a6982742eb29f109d89d065fa44e01691a20fc20a"},
{file = "rpds_py-0.16.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ec2e1cf025b2c0f48ec17ff3e642661da7ee332d326f2e6619366ce8e221f018"}, {file = "rpds_py-0.17.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8587fd64c2a91c33cdc39d0cebdaf30e79491cc029a37fcd458ba863f8815383"},
{file = "rpds_py-0.16.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e0441fb4fdd39a230477b2ca9be90868af64425bfe7b122b57e61e45737a653b"}, {file = "rpds_py-0.17.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4dc889a9d8a34758d0fcc9ac86adb97bab3fb7f0c4d29794357eb147536483fd"},
{file = "rpds_py-0.16.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9f0350ef2fba5f34eb0c9000ea328e51b9572b403d2f7f3b19f24085f6f598e8"}, {file = "rpds_py-0.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2953937f83820376b5979318840f3ee47477d94c17b940fe31d9458d79ae7eea"},
{file = "rpds_py-0.16.2-cp311-none-win32.whl", hash = "sha256:5a80e2f83391ad0808b4646732af2a7b67550b98f0cae056cb3b40622a83dbb3"}, {file = "rpds_py-0.17.1-cp311-none-win32.whl", hash = "sha256:1bfcad3109c1e5ba3cbe2f421614e70439f72897515a96c462ea657261b96518"},
{file = "rpds_py-0.16.2-cp311-none-win_amd64.whl", hash = "sha256:e04e56b4ca7a770593633556e8e9e46579d66ec2ada846b401252a2bdcf70a6d"}, {file = "rpds_py-0.17.1-cp311-none-win_amd64.whl", hash = "sha256:99da0a4686ada4ed0f778120a0ea8d066de1a0a92ab0d13ae68492a437db78bf"},
{file = "rpds_py-0.16.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:5e6caa3809e50690bd92fa490f5c38caa86082c8c3315aa438bce43786d5e90d"}, {file = "rpds_py-0.17.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1dc29db3900cb1bb40353772417800f29c3d078dbc8024fd64655a04ee3c4bdf"},
{file = "rpds_py-0.16.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e53b9b25cac9065328901713a7e9e3b12e4f57ef4280b370fbbf6fef2052eef"}, {file = "rpds_py-0.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82ada4a8ed9e82e443fcef87e22a3eed3654dd3adf6e3b3a0deb70f03e86142a"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af27423662f32d7501a00c5e7342f7dbd1e4a718aea7a239781357d15d437133"}, {file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d36b2b59e8cc6e576f8f7b671e32f2ff43153f0ad6d0201250a7c07f25d570e"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:43d4dd5fb16eb3825742bad8339d454054261ab59fed2fbac84e1d84d5aae7ba"}, {file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3677fcca7fb728c86a78660c7fb1b07b69b281964673f486ae72860e13f512ad"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e061de3b745fe611e23cd7318aec2c8b0e4153939c25c9202a5811ca911fd733"}, {file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:516fb8c77805159e97a689e2f1c80655c7658f5af601c34ffdb916605598cda2"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b811d182ad17ea294f2ec63c0621e7be92a1141e1012383461872cead87468f"}, {file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df3b6f45ba4515632c5064e35ca7f31d51d13d1479673185ba8f9fefbbed58b9"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5552f328eaef1a75ff129d4d0c437bf44e43f9436d3996e8eab623ea0f5fcf73"}, {file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a967dd6afda7715d911c25a6ba1517975acd8d1092b2f326718725461a3d33f9"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dcbe1f8dd179e4d69b70b1f1d9bb6fd1e7e1bdc9c9aad345cdeb332e29d40748"}, {file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dbbb95e6fc91ea3102505d111b327004d1c4ce98d56a4a02e82cd451f9f57140"},
{file = "rpds_py-0.16.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8aad80645a011abae487d356e0ceb359f4938dfb6f7bcc410027ed7ae4f7bb8b"}, {file = "rpds_py-0.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:02866e060219514940342a1f84303a1ef7a1dad0ac311792fbbe19b521b489d2"},
{file = "rpds_py-0.16.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b6f5549d6ed1da9bfe3631ca9483ae906f21410be2445b73443fa9f017601c6f"}, {file = "rpds_py-0.17.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:2528ff96d09f12e638695f3a2e0c609c7b84c6df7c5ae9bfeb9252b6fa686253"},
{file = "rpds_py-0.16.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d452817e0d9c749c431a1121d56a777bd7099b720b3d1c820f1725cb40928f58"}, {file = "rpds_py-0.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bd345a13ce06e94c753dab52f8e71e5252aec1e4f8022d24d56decd31e1b9b23"},
{file = "rpds_py-0.16.2-cp312-none-win32.whl", hash = "sha256:888a97002e986eca10d8546e3c8b97da1d47ad8b69726dcfeb3e56348ebb28a3"}, {file = "rpds_py-0.17.1-cp312-none-win32.whl", hash = "sha256:2a792b2e1d3038daa83fa474d559acfd6dc1e3650ee93b2662ddc17dbff20ad1"},
{file = "rpds_py-0.16.2-cp312-none-win_amd64.whl", hash = "sha256:d8dda2a806dfa4a9b795950c4f5cc56d6d6159f7d68080aedaff3bdc9b5032f5"}, {file = "rpds_py-0.17.1-cp312-none-win_amd64.whl", hash = "sha256:292f7344a3301802e7c25c53792fae7d1593cb0e50964e7bcdcc5cf533d634e3"},
{file = "rpds_py-0.16.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:071980663c273bf3d388fe5c794c547e6f35ba3335477072c713a3176bf14a60"}, {file = "rpds_py-0.17.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:8ffe53e1d8ef2520ebcf0c9fec15bb721da59e8ef283b6ff3079613b1e30513d"},
{file = "rpds_py-0.16.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:726ac36e8a3bb8daef2fd482534cabc5e17334052447008405daca7ca04a3108"}, {file = "rpds_py-0.17.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4341bd7579611cf50e7b20bb8c2e23512a3dc79de987a1f411cb458ab670eb90"},
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9e557db6a177470316c82f023e5d571811c9a4422b5ea084c85da9aa3c035fc"}, {file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f4eb548daf4836e3b2c662033bfbfc551db58d30fd8fe660314f86bf8510b93"},
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:90123853fc8b1747f80b0d354be3d122b4365a93e50fc3aacc9fb4c2488845d6"}, {file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b686f25377f9c006acbac63f61614416a6317133ab7fafe5de5f7dc8a06d42eb"},
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a61f659665a39a4d17d699ab3593d7116d66e1e2e3f03ef3fb8f484e91908808"}, {file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4e21b76075c01d65d0f0f34302b5a7457d95721d5e0667aea65e5bb3ab415c25"},
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc97f0640e91d7776530f06e6836c546c1c752a52de158720c4224c9e8053cad"}, {file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b86b21b348f7e5485fae740d845c65a880f5d1eda1e063bc59bef92d1f7d0c55"},
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a54e99a2b9693a37ebf245937fd6e9228b4cbd64b9cc961e1f3391ec6c7391"}, {file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f175e95a197f6a4059b50757a3dca33b32b61691bdbd22c29e8a8d21d3914cae"},
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bd4b677d929cf1f6bac07ad76e0f2d5de367e6373351c01a9c0a39f6b21b4a8b"}, {file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1701fc54460ae2e5efc1dd6350eafd7a760f516df8dbe51d4a1c79d69472fbd4"},
{file = "rpds_py-0.16.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:5ef00873303d678aaf8b0627e111fd434925ca01c657dbb2641410f1cdaef261"}, {file = "rpds_py-0.17.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:9051e3d2af8f55b42061603e29e744724cb5f65b128a491446cc029b3e2ea896"},
{file = "rpds_py-0.16.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:349cb40897fd529ca15317c22c0eab67f5ac5178b5bd2c6adc86172045210acc"}, {file = "rpds_py-0.17.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:7450dbd659fed6dd41d1a7d47ed767e893ba402af8ae664c157c255ec6067fde"},
{file = "rpds_py-0.16.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:2ddef620e70eaffebed5932ce754d539c0930f676aae6212f8e16cd9743dd365"}, {file = "rpds_py-0.17.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:5a024fa96d541fd7edaa0e9d904601c6445e95a729a2900c5aec6555fe921ed6"},
{file = "rpds_py-0.16.2-cp38-none-win32.whl", hash = "sha256:882ce6e25e585949c3d9f9abd29202367175e0aab3aba0c58c9abbb37d4982ff"}, {file = "rpds_py-0.17.1-cp38-none-win32.whl", hash = "sha256:da1ead63368c04a9bded7904757dfcae01eba0e0f9bc41d3d7f57ebf1c04015a"},
{file = "rpds_py-0.16.2-cp38-none-win_amd64.whl", hash = "sha256:f4bd4578e44f26997e9e56c96dedc5f1af43cc9d16c4daa29c771a00b2a26851"}, {file = "rpds_py-0.17.1-cp38-none-win_amd64.whl", hash = "sha256:841320e1841bb53fada91c9725e766bb25009cfd4144e92298db296fb6c894fb"},
{file = "rpds_py-0.16.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:69ac7ea9897ec201ce68b48582f3eb34a3f9924488a5432a93f177bf76a82a7e"}, {file = "rpds_py-0.17.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:f6c43b6f97209e370124baf2bf40bb1e8edc25311a158867eb1c3a5d449ebc7a"},
{file = "rpds_py-0.16.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a9880b4656efe36ccad41edc66789e191e5ee19a1ea8811e0aed6f69851a82f4"}, {file = "rpds_py-0.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5e7d63ec01fe7c76c2dbb7e972fece45acbb8836e72682bde138e7e039906e2c"},
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee94cb58c0ba2c62ee108c2b7c9131b2c66a29e82746e8fa3aa1a1effbd3dcf1"}, {file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81038ff87a4e04c22e1d81f947c6ac46f122e0c80460b9006e6517c4d842a6ec"},
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:24f7a2eb3866a9e91f4599851e0c8d39878a470044875c49bd528d2b9b88361c"}, {file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:810685321f4a304b2b55577c915bece4c4a06dfe38f6e62d9cc1d6ca8ee86b99"},
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ca57468da2d9a660bcf8961637c85f2fbb2aa64d9bc3f9484e30c3f9f67b1dd7"}, {file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:25f071737dae674ca8937a73d0f43f5a52e92c2d178330b4c0bb6ab05586ffa6"},
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ccd4e400309e1f34a5095bf9249d371f0fd60f8a3a5c4a791cad7b99ce1fd38d"}, {file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aa5bfb13f1e89151ade0eb812f7b0d7a4d643406caaad65ce1cbabe0a66d695f"},
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80443fe2f7b3ea3934c5d75fb0e04a5dbb4a8e943e5ff2de0dec059202b70a8b"}, {file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dfe07308b311a8293a0d5ef4e61411c5c20f682db6b5e73de6c7c8824272c256"},
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4d6a9f052e72d493efd92a77f861e45bab2f6be63e37fa8ecf0c6fd1a58fedb0"}, {file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a000133a90eea274a6f28adc3084643263b1e7c1a5a66eb0a0a7a36aa757ed74"},
{file = "rpds_py-0.16.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:35953f4f2b3216421af86fd236b7c0c65935936a94ea83ddbd4904ba60757773"}, {file = "rpds_py-0.17.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d0e8a6434a3fbf77d11448c9c25b2f25244226cfbec1a5159947cac5b8c5fa4"},
{file = "rpds_py-0.16.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:981d135c7cdaf6cd8eadae1c950de43b976de8f09d8e800feed307140d3d6d00"}, {file = "rpds_py-0.17.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:efa767c220d94aa4ac3a6dd3aeb986e9f229eaf5bce92d8b1b3018d06bed3772"},
{file = "rpds_py-0.16.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d0dd7ed2f16df2e129496e7fbe59a34bc2d7fc8db443a606644d069eb69cbd45"}, {file = "rpds_py-0.17.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:dbc56680ecf585a384fbd93cd42bc82668b77cb525343170a2d86dafaed2a84b"},
{file = "rpds_py-0.16.2-cp39-none-win32.whl", hash = "sha256:703d95c75a72e902544fda08e965885525e297578317989fd15a6ce58414b41d"}, {file = "rpds_py-0.17.1-cp39-none-win32.whl", hash = "sha256:270987bc22e7e5a962b1094953ae901395e8c1e1e83ad016c5cfcfff75a15a3f"},
{file = "rpds_py-0.16.2-cp39-none-win_amd64.whl", hash = "sha256:e93ec1b300acf89730cf27975ef574396bc04edecc358e9bd116fb387a123239"}, {file = "rpds_py-0.17.1-cp39-none-win_amd64.whl", hash = "sha256:2a7b2f2f56a16a6d62e55354dd329d929560442bd92e87397b7a9586a32e3e76"},
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:44627b6ca7308680a70766454db5249105fa6344853af6762eaad4158a2feebe"}, {file = "rpds_py-0.17.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a3264e3e858de4fc601741498215835ff324ff2482fd4e4af61b46512dd7fc83"},
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:3f91df8e6dbb7360e176d1affd5fb0246d2b88d16aa5ebc7db94fd66b68b61da"}, {file = "rpds_py-0.17.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:f2f3b28b40fddcb6c1f1f6c88c6f3769cd933fa493ceb79da45968a21dccc920"},
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d904c5693e08bad240f16d79305edba78276be87061c872a4a15e2c301fa2c0"}, {file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9584f8f52010295a4a417221861df9bea4c72d9632562b6e59b3c7b87a1522b7"},
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:290a81cfbe4673285cdf140ec5cd1658ffbf63ab359f2b352ebe172e7cfa5bf0"}, {file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c64602e8be701c6cfe42064b71c84ce62ce66ddc6422c15463fd8127db3d8066"},
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b634c5ec0103c5cbebc24ebac4872b045cccb9456fc59efdcf6fe39775365bd2"}, {file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:060f412230d5f19fc8c8b75f315931b408d8ebf56aec33ef4168d1b9e54200b1"},
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a297a4d08cc67c7466c873c78039d87840fb50d05473db0ec1b7b03d179bf322"}, {file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b9412abdf0ba70faa6e2ee6c0cc62a8defb772e78860cef419865917d86c7342"},
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2e75e17bd0bb66ee34a707da677e47c14ee51ccef78ed6a263a4cc965a072a1"}, {file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9737bdaa0ad33d34c0efc718741abaafce62fadae72c8b251df9b0c823c63b22"},
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f1b9d9260e06ea017feb7172976ab261e011c1dc2f8883c7c274f6b2aabfe01a"}, {file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9f0e4dc0f17dcea4ab9d13ac5c666b6b5337042b4d8f27e01b70fae41dd65c57"},
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:162d7cd9cd311c1b0ff1c55a024b8f38bd8aad1876b648821da08adc40e95734"}, {file = "rpds_py-0.17.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:1db228102ab9d1ff4c64148c96320d0be7044fa28bd865a9ce628ce98da5973d"},
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:9b32f742ce5b57201305f19c2ef7a184b52f6f9ba6871cc042c2a61f0d6b49b8"}, {file = "rpds_py-0.17.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:d8bbd8e56f3ba25a7d0cf980fc42b34028848a53a0e36c9918550e0280b9d0b6"},
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac08472f41ea77cd6a5dae36ae7d4ed3951d6602833af87532b556c1b4601d63"}, {file = "rpds_py-0.17.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:be22ae34d68544df293152b7e50895ba70d2a833ad9566932d750d3625918b82"},
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:495a14b72bbe217f2695dcd9b5ab14d4f8066a00f5d209ed94f0aca307f85f6e"}, {file = "rpds_py-0.17.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf046179d011e6114daf12a534d874958b039342b347348a78b7cdf0dd9d6041"},
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:8d6b6937ae9eac6d6c0ca3c42774d89fa311f55adff3970fb364b34abde6ed3d"}, {file = "rpds_py-0.17.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:1a746a6d49665058a5896000e8d9d2f1a6acba8a03b389c1e4c06e11e0b7f40d"},
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a61226465bda9283686db8f17d02569a98e4b13c637be5a26d44aa1f1e361c2"}, {file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0b8bf5b8db49d8fd40f54772a1dcf262e8be0ad2ab0206b5a2ec109c176c0a4"},
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5cf6af100ffb5c195beec11ffaa8cf8523057f123afa2944e6571d54da84cdc9"}, {file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f7f4cb1f173385e8a39c29510dd11a78bf44e360fb75610594973f5ea141028b"},
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6df15846ee3fb2e6397fe25d7ca6624af9f89587f3f259d177b556fed6bebe2c"}, {file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7fbd70cb8b54fe745301921b0816c08b6d917593429dfc437fd024b5ba713c58"},
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1be2f033df1b8be8c3167ba3c29d5dca425592ee31e35eac52050623afba5772"}, {file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bdf1303df671179eaf2cb41e8515a07fc78d9d00f111eadbe3e14262f59c3d0"},
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96f957d6ab25a78b9e7fc9749d754b98eac825a112b4e666525ce89afcbd9ed5"}, {file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fad059a4bd14c45776600d223ec194e77db6c20255578bb5bcdd7c18fd169361"},
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:088396c7c70e59872f67462fcac3ecbded5233385797021976a09ebd55961dfe"}, {file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3664d126d3388a887db44c2e293f87d500c4184ec43d5d14d2d2babdb4c64cad"},
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:4c46ad6356e1561f2a54f08367d1d2e70a0a1bb2db2282d2c1972c1d38eafc3b"}, {file = "rpds_py-0.17.1-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:698ea95a60c8b16b58be9d854c9f993c639f5c214cf9ba782eca53a8789d6b19"},
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:47713dc4fce213f5c74ca8a1f6a59b622fc1b90868deb8e8e4d993e421b4b39d"}, {file = "rpds_py-0.17.1-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:c3d2010656999b63e628a3c694f23020322b4178c450dc478558a2b6ef3cb9bb"},
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:f811771019f063bbd0aa7bb72c8a934bc13ebacb4672d712fc1639cfd314cccc"}, {file = "rpds_py-0.17.1-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:938eab7323a736533f015e6069a7d53ef2dcc841e4e533b782c2bfb9fb12d84b"},
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f19afcfc0dd0dca35694df441e9b0f95bc231b512f51bded3c3d8ca32153ec19"}, {file = "rpds_py-0.17.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:1e626b365293a2142a62b9a614e1f8e331b28f3ca57b9f05ebbf4cf2a0f0bdc5"},
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a4b682c5775d6a3d21e314c10124599976809455ee67020e8e72df1769b87bc3"}, {file = "rpds_py-0.17.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:380e0df2e9d5d5d339803cfc6d183a5442ad7ab3c63c2a0982e8c824566c5ccc"},
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c647ca87fc0ebe808a41de912e9a1bfef9acb85257e5d63691364ac16b81c1f0"}, {file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b760a56e080a826c2e5af09002c1a037382ed21d03134eb6294812dda268c811"},
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:302bd4983bbd47063e452c38be66153760112f6d3635c7eeefc094299fa400a9"}, {file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5576ee2f3a309d2bb403ec292d5958ce03953b0e57a11d224c1f134feaf8c40f"},
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf721ede3eb7b829e4a9b8142bd55db0bdc82902720548a703f7e601ee13bdc3"}, {file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f3c3461ebb4c4f1bbc70b15d20b565759f97a5aaf13af811fcefc892e9197ba"},
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:358dafc89ce3894c7f486c615ba914609f38277ef67f566abc4c854d23b997fa"}, {file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:637b802f3f069a64436d432117a7e58fab414b4e27a7e81049817ae94de45d8d"},
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cad0f59ee3dc35526039f4bc23642d52d5f6616b5f687d846bfc6d0d6d486db0"}, {file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffee088ea9b593cc6160518ba9bd319b5475e5f3e578e4552d63818773c6f56a"},
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cffa76b385dfe1e38527662a302b19ffb0e7f5cf7dd5e89186d2c94a22dd9d0c"}, {file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3ac732390d529d8469b831949c78085b034bff67f584559340008d0f6041a049"},
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:83640a5d7cd3bff694747d50436b8b541b5b9b9782b0c8c1688931d6ee1a1f2d"}, {file = "rpds_py-0.17.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:93432e747fb07fa567ad9cc7aaadd6e29710e515aabf939dfbed8046041346c6"},
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:ed99b4f7179d2111702020fd7d156e88acd533f5a7d3971353e568b6051d5c97"}, {file = "rpds_py-0.17.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:7b7d9ca34542099b4e185b3c2a2b2eda2e318a7dbde0b0d83357a6d4421b5296"},
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:4022b9dc620e14f30201a8a73898a873c8e910cb642bcd2f3411123bc527f6ac"}, {file = "rpds_py-0.17.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:0387ce69ba06e43df54e43968090f3626e231e4bc9150e4c3246947567695f68"},
{file = "rpds_py-0.16.2.tar.gz", hash = "sha256:781ef8bfc091b19960fc0142a23aedadafa826bc32b433fdfe6fd7f964d7ef44"}, {file = "rpds_py-0.17.1.tar.gz", hash = "sha256:0210b2668f24c078307260bf88bdac9d6f1093635df5123789bfee4d8d7fc8e7"},
] ]
[[package]] [[package]]
@@ -2138,13 +2184,13 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyam
[[package]] [[package]]
name = "streamlit" name = "streamlit"
version = "1.29.0" version = "1.30.0"
description = "A faster way to build and share data apps" description = "A faster way to build and share data apps"
optional = true optional = true
python-versions = ">=3.8, !=3.9.7" python-versions = ">=3.8, !=3.9.7"
files = [ files = [
{file = "streamlit-1.29.0-py2.py3-none-any.whl", hash = "sha256:753510edb5bb831af0e3bdacd353c879ad5b4f0211e7efa0ec378809464868b4"}, {file = "streamlit-1.30.0-py2.py3-none-any.whl", hash = "sha256:536494a4edfe9b66ed70c437176cfd6c7e36b1d99d0587b0be64245fa89c241b"},
{file = "streamlit-1.29.0.tar.gz", hash = "sha256:b6dfff9c5e132e5518c92150efcd452980db492a45fafeac3d4688d2334efa07"}, {file = "streamlit-1.30.0.tar.gz", hash = "sha256:90333915d9df8ce3b06de31b8a5bbab51e8cf0982dc6c32da9d6b1f2b4a9fa78"},
] ]
[package.dependencies] [package.dependencies]
@@ -2153,7 +2199,7 @@ blinker = ">=1.0.0,<2"
 cachetools = ">=4.0,<6"
 click = ">=7.0,<9"
 gitpython = ">=3.0.7,<3.1.19 || >3.1.19,<4"
-importlib-metadata = ">=1.4,<7"
+importlib-metadata = ">=1.4,<8"
 numpy = ">=1.19.3,<2"
 packaging = ">=16.8,<24"
 pandas = ">=1.3.0,<3"
@@ -2684,9 +2730,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [extras]
 extra-proxy = ["streamlit"]
-proxy = ["backoff", "fastapi", "gunicorn", "orjson", "pyyaml", "rq", "uvicorn"]
+proxy = ["apscheduler", "backoff", "fastapi", "gunicorn", "orjson", "pyyaml", "rq", "uvicorn"]

 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<3.9.7 || >3.9.7"
-content-hash = "f4d60cb3f552af0d2a4e4ef5c6f55696fd6e546b75ff7b4ec362c3549a63c92a"
+content-hash = "19f79f119f1760d3406b446fa3664b82c0d0859b3912dcb7ba7c8edf1d786096"
@@ -1,10 +1,20 @@
 model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/chatgpt-v-2
+      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
+      api_version: "2023-05-15"
+      api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
   - model_name: gpt-4
     litellm_params:
       model: azure/chatgpt-v-2
       api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
       api_version: "2023-05-15"
       api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
+  - model_name: sagemaker-completion-model
+    litellm_params:
+      model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
+      input_cost_per_second: 0.000420
   - model_name: gpt-4
     litellm_params:
       model: azure/gpt-turbo
@@ -17,11 +27,26 @@ model_list:
       api_key: os.environ/AZURE_EUROPE_API_KEY
       api_base: https://my-endpoint-europe-berri-992.openai.azure.com
       rpm: 10
+  - model_name: text-embedding-ada-002
+    litellm_params:
+      model: azure/azure-embedding-model
+      api_key: os.environ/AZURE_API_KEY
+      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
+      api_version: "2023-05-15"
+    model_info:
+      mode: embedding
+      base_model: text-embedding-ada-002
+  - model_name: dall-e-2
+    litellm_params:
+      model: azure/
+      api_version: 2023-06-01-preview
+      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
+      api_key: os.environ/AZURE_API_KEY

 litellm_settings:
   drop_params: True
-  set_verbose: True
+  max_budget: 100
+  budget_duration: 30d

 general_settings:
   master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
   # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy
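
The config hunks above register a gpt-3.5-turbo deployment, a SageMaker completion model, an Azure embedding model, and a dall-e-2 image model, and they replace set_verbose with a proxy-wide budget (max_budget: 100, budget_duration: 30d). A minimal client-side sketch of how these model names could be exercised through the proxy: the base URL is an assumption, while the model names and the sk-1234 master key come from the config shown here; this uses the standard openai Python client, not anything defined in this diff.

# Hypothetical usage sketch; proxy address is assumed, model names and
# master key are taken from the proxy config above.
from openai import OpenAI

client = OpenAI(
    api_key="sk-1234",                 # general_settings.master_key
    base_url="http://localhost:4000",  # assumed proxy address
)

# Routed to the azure/chatgpt-v-2 deployment declared under gpt-3.5-turbo.
chat = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "ping"}],
)
print(chat.choices[0].message.content)

# Routed to azure/azure-embedding-model (model_info.mode: embedding).
emb = client.embeddings.create(
    model="text-embedding-ada-002",
    input=["hello world"],
)
print(len(emb.data[0].embedding))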
@@ -1,9 +1,9 @@
 [tool.poetry]
 name = "litellm"
-version = "1.18.0"
+version = "1.20.5"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
-license = "MIT License"
+license = "MIT"
 readme = "README.md"

 [tool.poetry.dependencies]
@@ -22,9 +22,10 @@ uvicorn = {version = "^0.22.0", optional = true}
 gunicorn = {version = "^21.2.0", optional = true}
 fastapi = {version = "^0.104.1", optional = true}
 backoff = {version = "*", optional = true}
-pyyaml = {version = "^6.0", optional = true}
+pyyaml = {version = "^6.0.1", optional = true}
 rq = {version = "*", optional = true}
 orjson = {version = "^3.9.7", optional = true}
+apscheduler = {version = "^3.10.4", optional = true}
 streamlit = {version = "^1.29.0", optional = true}

 [tool.poetry.extras]
@@ -36,6 +37,7 @@ proxy = [
     "pyyaml",
     "rq",
     "orjson",
+    "apscheduler"
 ]

 extra_proxy = [
@@ -61,7 +63,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.18.0"
+version = "1.20.5"
 version_files = [
     "pyproject.toml:^version"
 ]
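
The pyproject hunks above add apscheduler (^3.10.4) as an optional dependency of the proxy extra, mirroring the proxy list in the lock file and the new max_budget / budget_duration settings in the proxy config. A minimal sketch of the kind of recurring background job APScheduler 3.x supports; reset_budgets() and the 30-day cadence are illustrative assumptions, not litellm's actual scheduler code.

# Hypothetical sketch of a recurring job with APScheduler 3.x; the job body
# and interval are stand-ins chosen to match the budget_duration idea above.
import asyncio
from apscheduler.schedulers.asyncio import AsyncIOScheduler


async def reset_budgets() -> None:
    # Placeholder for periodic maintenance (e.g. re-zeroing spend counters
    # once a budget window such as 30d rolls over).
    print("budget window rolled over")


async def main() -> None:
    scheduler = AsyncIOScheduler()
    # An "interval" trigger re-runs the coroutine on a fixed cadence.
    scheduler.add_job(reset_budgets, "interval", days=30)
    scheduler.start()
    await asyncio.sleep(3600)  # keep the event loop alive; a real server runs indefinitely


if __name__ == "__main__":
    asyncio.run(main())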
Some files were not shown because too many files have changed in this diff.