Merge branch 'main' into litellm_http_proxy_support

Krish Dholakia 2024-02-01 09:18:50 -08:00 committed by GitHub
commit 058813da76
199 changed files with 18866 additions and 1341 deletions


@ -42,6 +42,7 @@ jobs:
pip install "anyio==3.7.1" pip install "anyio==3.7.1"
pip install "aiodynamo==23.10.1" pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3" pip install "asyncio==3.4.3"
pip install "apscheduler==3.10.4"
pip install "PyGithub==1.59.1" pip install "PyGithub==1.59.1"
- save_cache: - save_cache:
paths: paths:
@ -97,6 +98,43 @@ jobs:
command: |
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
- run:
name: Install Python 3.9
command: |
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda init bash
source ~/.bashrc
conda create -n myenv python=3.9 -y
conda activate myenv
python --version
- run:
name: Install Dependencies
command: |
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
pip install openai
python -m pip install --upgrade pip
python -m pip install -r .circleci/requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install mypy
pip install "google-generativeai>=0.3.2"
pip install "google-cloud-aiplatform>=1.38.0"
pip install "boto3>=1.28.57"
pip install langchain
pip install "langfuse>=2.0.0"
pip install numpydoc
pip install prisma
pip install "httpx==0.24.1"
pip install "gunicorn==21.2.0"
pip install "anyio==3.7.1"
pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
# Run pytest and generate JUnit XML report
- run:
name: Build Docker image
command: docker build -t my-app:latest -f Dockerfile.database .
@ -106,15 +144,20 @@ jobs:
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
--e AZURE_API_KEY=$AZURE_FRANCE_API_KEY \
+-e AZURE_API_KEY=$AZURE_API_KEY \
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e AWS_REGION_NAME=$AWS_REGION_NAME \
--name my-app \
-v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
---num_workers 8
+--num_workers 8 \
--detailed_debug \
--run_gunicorn \
- run:
name: Install curl and dockerize
command: |
@ -125,63 +168,22 @@ jobs:
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
- run:
name: Start outputting logs
-command: |
-while true; do
-docker logs my-app
-sleep 10
-done
+command: docker logs -f my-app
background: true
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 1m
- run:
-name: Test the application
+name: Run tests
command: |
-mkdir -p /tmp/responses
-for i in {1..10}; do
-status_file="/tmp/responses/status_${i}.txt"
-response_file="/tmp/responses/response_${i}.json"
-(curl --location --request POST 'http://0.0.0.0:4000/key/generate' \
---header 'Authorization: Bearer sk-1234' \
---header 'Content-Type: application/json' \
---data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}' \
---silent --output "${response_file}" --write-out '%{http_code}' > "${status_file}") &
-# Capture PIDs of background processes
-pids[${i}]=$!
-done
-# Wait for all background processes to finish
-for pid in ${pids[*]}; do
-wait $pid
-done
-# Check all responses and status codes
-fail=false
-for i in {1..10}; do
-status=$(cat "/tmp/responses/status_${i}.txt")
-# Here, we need to set the correct response file path for each iteration
-response_file="/tmp/responses/response_${i}.json" # This was missing in the provided script
-response=$(cat "${response_file}")
-echo "Response ${i} (Status code: ${status}):"
-echo "${response}" # Use echo here to print the contents
-echo # Additional newline for readability
-if [ "$status" -ne 200 ]; then
-echo "A request did not return a 200 status code: $status"
-fail=true
-fi
-done
-# If any request did not return status code 200, fail the job
-if [ "$fail" = true ]; then
-exit 1
-fi
-echo "All requests returned a 200 status code."
+pwd
+ls
+python -m pytest -vv tests/ -x --junitxml=test-results/junit.xml --durations=5
+no_output_timeout: 120m
+# Store test results
+- store_test_results:
+path: test-results
publish_to_pypi:
docker:


@ -41,6 +41,7 @@ jobs:
push: true
file: Dockerfile.database
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
build-and-push-image:
runs-on: ubuntu-latest
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
@ -74,7 +75,9 @@ jobs:
push: true
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-latest # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
labels: ${{ steps.meta.outputs.labels }}
-build-and-push-image-alpine:
+platform: local, linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-image-ui:
runs-on: ubuntu-latest
permissions:
contents: read
@ -90,20 +93,21 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
-- name: Extract metadata (tags, labels) for Alpine Dockerfile
+- name: Extract metadata (tags, labels) for UI Dockerfile
-id: meta-alpine
+id: meta-ui
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with:
-images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-alpine
+images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-ui
-- name: Build and push Alpine Docker image
+- name: Build and push UI Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with:
-context: .
+context: ui/
-file: Dockerfile.alpine
+file: ui/Dockerfile
push: true
-tags: ${{ steps.meta-alpine.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-alpine.outputs.tags }}-latest
+tags: ${{ steps.meta-ui.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-ui.outputs.tags }}-latest
-labels: ${{ steps.meta-alpine.outputs.labels }}
+labels: ${{ steps.meta-ui.outputs.labels }}
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-image-database:
runs-on: ubuntu-latest
permissions:
@ -168,3 +172,14 @@ jobs:
} catch (error) {
core.setFailed(error.message);
}
- name: Github Releases To Discord
uses: SethCohen/github-releases-to-discord@v1.13.1
with:
webhook_url: ${{ secrets.WEBHOOK_URL }}
color: "2105893"
username: "Release Changelog"
avatar_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
content: "||@everyone||"
footer_title: "Changelog"
footer_icon_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
footer_timestamp: true

.gitignore vendored

@ -35,3 +35,8 @@ hosted_config.yaml
litellm/proxy/tests/node_modules
litellm/proxy/tests/package.json
litellm/proxy/tests/package-lock.json
ui/litellm-dashboard/.next
ui/litellm-dashboard/node_modules
ui/litellm-dashboard/next-env.d.ts
ui/litellm-dashboard/package.json
ui/litellm-dashboard/package-lock.json


@ -52,4 +52,4 @@ RUN chmod +x entrypoint.sh
EXPOSE 4000/tcp
ENTRYPOINT ["litellm"]
-CMD ["--port", "4000"]
+CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug", "--run_gunicorn"]


@ -56,4 +56,4 @@ EXPOSE 4000/tcp
# # Set your entrypoint and command
ENTRYPOINT ["litellm"]
-CMD ["--port", "4000"]
+CMD ["--port", "4000", "--run_gunicorn"]


@ -0,0 +1,34 @@
import os
from openai import OpenAI
from dotenv import load_dotenv
import httpx
import concurrent.futures
load_dotenv()
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
)
def create_chat_completion():
return client.chat.completions.create(
messages=[
{
"role": "user",
"content": "Say this is a test. Respond in 20 lines",
}
],
model="gpt-3.5-turbo",
)
with concurrent.futures.ThreadPoolExecutor() as executor:
# Enforce a timeout on the request (0.00001s here, deliberately tiny so the timeout path triggers)
future = executor.submit(create_chat_completion)
try:
chat_completion = future.result(timeout=0.00001)
print(chat_completion)
except concurrent.futures.TimeoutError:
print("Operation timed out.")


@ -0,0 +1,61 @@
# Notes - on how to do sagemaker streaming using boto3
import json
import boto3
import sys, os
import traceback
from dotenv import load_dotenv
load_dotenv()
import os, io
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
import litellm
import io
import json
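# TokenIterator buffers the streamed "PayloadPart" bytes from the SageMaker response
# and yields the "token.text" field of each complete `data: {...}` line as it arrives.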
class TokenIterator:
def __init__(self, stream):
self.byte_iterator = iter(stream)
self.buffer = io.BytesIO()
self.read_pos = 0
def __iter__(self):
return self
def __next__(self):
while True:
self.buffer.seek(self.read_pos)
line = self.buffer.readline()
if line and line[-1] == ord("\n"):
self.read_pos += len(line) + 1
full_line = line[:-1].decode("utf-8")
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
return line_data["token"]["text"]
chunk = next(self.byte_iterator)
self.buffer.seek(0, io.SEEK_END)
self.buffer.write(chunk["PayloadPart"]["Bytes"])
payload = {
"inputs": "How do I build a website?",
"parameters": {"max_new_tokens": 256},
"stream": True,
}
import boto3
client = boto3.client("sagemaker-runtime", region_name="us-west-2")
response = client.invoke_endpoint_with_response_stream(
EndpointName="berri-benchmarking-Llama-2-70b-chat-hf-4",
Body=json.dumps(payload),
ContentType="application/json",
)
# for token in TokenIterator(response["Body"]):
# print(token)
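# The same endpoint can also be streamed through litellm (a sketch; assumes AWS
# credentials are set in the environment and reuses the litellm import above):
# response = litellm.completion(
#     model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
#     messages=[{"role": "user", "content": "How do I build a website?"}],
#     max_tokens=256,
#     stream=True,
# )
# for chunk in response:
#     print(chunk.choices[0].delta.content or "", end="")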


@ -1,12 +0,0 @@
version: "3.9"
services:
litellm:
image: ghcr.io/berriai/litellm:main
ports:
- "8000:8000" # Map the container port to the host, change the host port if necessary
volumes:
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
# You can change the port or number of workers as per your requirements or pass any new supported CLI augument. Make sure the port passed here matches with the container port defined above in `ports` value
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
# ...rest of your docker-compose config if any

docker-compose.yml Normal file

@ -0,0 +1,15 @@
version: "3.9"
services:
litellm:
image: ghcr.io/berriai/litellm:main-latest
volumes:
- ./proxy_server_config.yaml:/app/proxy_server_config.yaml # mount your litellm config.yaml
ports:
- "4000:4000"
environment:
- AZURE_API_KEY=sk-123
litellm-ui:
image: ghcr.io/berriai/litellm-ui:main-latest


@ -13,8 +13,8 @@ response = embedding(model='text-embedding-ada-002', input=["good morning from l
- `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`
-- `input`: *array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
+- `input`: *string or array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
```python
input=["good morning from litellm"]
```
@ -22,7 +22,11 @@ input=["good morning from litellm"]
- `user`: *string (optional)* A unique identifier representing your end-user,
-- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
+- `dimensions`: *integer (Optional)* The number of dimensions the resulting output embeddings should have. Only supported in OpenAI/Azure text-embedding-3 and later models.
- `encoding_format`: *string (Optional)* The format to return the embeddings in. Can be either `"float"` or `"base64"`. Defaults to `encoding_format="float"`
- `timeout`: *integer (Optional)* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
- `api_base`: *string (optional)* - The api endpoint you want to call the model with
@ -66,11 +70,18 @@ input=["good morning from litellm"]
from litellm import embedding
import os
os.environ['OPENAI_API_KEY'] = ""
-response = embedding('text-embedding-ada-002', input=["good morning from litellm"])
+response = embedding(
model="text-embedding-3-small",
input=["good morning from litellm", "this is another item"],
metadata={"anything": "good day"},
dimensions=5 # Only supported in text-embedding-3 and later models.
)
```
| Model Name | Function Call | Required OS Variables |
|----------------------|---------------------------------------------|--------------------------------------|
| text-embedding-3-small | `embedding('text-embedding-3-small', input)` | `os.environ['OPENAI_API_KEY']` |
| text-embedding-3-large | `embedding('text-embedding-3-large', input)` | `os.environ['OPENAI_API_KEY']` |
| text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` |
## Azure OpenAI Embedding Models


@ -28,6 +28,8 @@ import litellm
import os
os.environ["LANGSMITH_API_KEY"] = ""
os.environ["LANGSMITH_PROJECT"] = "" # defaults to litellm-completion
os.environ["LANGSMITH_DEFAULT_RUN_NAME"] = "" # defaults to LLMRun
# LLM API Keys
os.environ['OPENAI_API_KEY']=""


@ -6,7 +6,7 @@
# Gemini-Pro
## Sample Usage
```python
-import litellm
+from litellm import completion
import os
os.environ['GEMINI_API_KEY'] = ""
@ -24,7 +24,7 @@ LiteLLM Supports the following image types passed in `url`
## Sample Usage
```python
import os
import litellm
from dotenv import load_dotenv
# Load the environment variables from .env file


@ -34,6 +34,7 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` |


@ -5,7 +5,7 @@
## Sample Usage
```python
-import litellm
+from litellm import completion
import os
os.environ['PALM_API_KEY'] = ""
@ -17,7 +17,7 @@ response = completion(
## Sample Usage - Streaming
```python
-import litellm
+from litellm import completion
import os
os.environ['PALM_API_KEY'] = ""


@ -17,7 +17,28 @@ import litellm
litellm.vertex_project = "hardy-device-38811" # Your Project ID
litellm.vertex_location = "us-central1" # proj location
-response = completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
+response = litellm.completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
```
## OpenAI Proxy Usage
1. Modify the config.yaml
```yaml
litellm_settings:
vertex_project: "hardy-device-38811" # Your Project ID
vertex_location: "us-central1" # proj location
model_list:
- model_name: team1-gemini-pro
litellm_params:
model: gemini-pro
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
```
## Set Vertex Project & Vertex Location


@ -11,7 +11,7 @@ pip install litellm vllm
```python
import litellm
-response = completion(
+response = litellm.completion(
model="vllm/facebook/opt-125m", # add a vllm prefix so litellm knows the custom_llm_provider==vllm
messages=messages,
temperature=0.2,
@ -29,7 +29,7 @@ In order to use litellm to call a hosted vllm server add the following to your c
```python
import litellm
-response = completion(
+response = litellm.completion(
model="openai/facebook/opt-125m", # pass the vllm model name
messages=messages,
api_base="https://hosted-vllm-api.co",


@ -1,6 +1,13 @@
# Slack Alerting
-Get alerts for failed db read/writes, hanging api calls, failed api calls.
+Get alerts for:
- hanging LLM api calls
- failed LLM api calls
- slow LLM api calls
- budget Tracking per key/user:
- When a User/Key crosses their Budget
- When a User/Key is 15% away from crossing their Budget
- failed db read/writes
## Quick Start


@ -1,4 +1,4 @@
-# Modify Incoming Data
+# Modify / Reject Incoming Requests
Modify data just before making litellm completion calls call on proxy


@ -22,18 +22,22 @@ Set a model alias for your deployments.
In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment.
-In the config below requests with:
+In the config below:
- `model_name`: the name to pass TO litellm from the external client
- `litellm_params.model`: the model string passed to the litellm.completion() function
E.g.:
- `model=vllm-models` will route to `openai/facebook/opt-125m`.
- `model=gpt-3.5-turbo` will load balance between `azure/gpt-turbo-small-eu` and `azure/gpt-turbo-small-ca`
```yaml
model_list:
-- model_name: gpt-3.5-turbo # user-facing model alias
+- model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ###
litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
-model: azure/gpt-turbo-small-eu
+model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ###
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
-rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
+rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
- model_name: bedrock-claude-v1
litellm_params:
model: bedrock/anthropic.claude-instant-v1
@ -43,6 +47,11 @@ model_list:
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_CA"
rpm: 6
- model_name: anthropic-claude
litellm_params:
model="bedrock/anthropic.claude-instant-v1"
### [OPTIONAL] SET AWS REGION ###
aws_region_name="us-east-1"
- model_name: vllm-models
litellm_params:
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
@ -58,6 +67,11 @@ litellm_settings: # module level litellm settings - https://github.com/BerriAI/l
general_settings:
master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
```
:::info
For more provider-specific info, [go here](../providers/)
:::
#### Step 2: Start Proxy with config
@ -188,7 +202,7 @@ print(response)
</Tabs>
-## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Headers etc.)
+## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
@ -210,6 +224,12 @@ model_list:
api_key: sk-123
api_base: https://openai-gpt-4-test-v-2.openai.azure.com/
temperature: 0.2
- model_name: openai-gpt-3.5
litellm_params:
model: openai/gpt-3.5-turbo
api_key: sk-123
organization: org-ikDc4ex8NB
temperature: 0.2
- model_name: mistral-7b
litellm_params:
model: ollama/mistral
@ -483,3 +503,55 @@ general_settings:
max_parallel_requests: 100 # max parallel requests for a user = 100
```
## All settings
```python
{
"environment_variables": {},
"model_list": [
{
"model_name": "string",
"litellm_params": {},
"model_info": {
"id": "string",
"mode": "embedding",
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"max_tokens": 2048,
"base_model": "gpt-4-1106-preview",
"additionalProp1": {}
}
}
],
"litellm_settings": {}, # ALL (https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py)
"general_settings": {
"completion_model": "string",
"key_management_system": "google_kms", # either google_kms or azure_kms
"master_key": "string",
"database_url": "string",
"database_type": "dynamo_db",
"database_args": {
"billing_mode": "PROVISIONED_THROUGHPUT",
"read_capacity_units": 0,
"write_capacity_units": 0,
"ssl_verify": true,
"region_name": "string",
"user_table_name": "LiteLLM_UserTable",
"key_table_name": "LiteLLM_VerificationToken",
"config_table_name": "LiteLLM_Config",
"spend_table_name": "LiteLLM_SpendLogs"
},
"otel": true,
"custom_auth": "string",
"max_parallel_requests": 0,
"infer_model_from_keys": true,
"background_health_checks": true,
"health_check_interval": 300,
"alerting": [
"string"
],
"alerting_threshold": 0
}
}
```
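A config file can be sanity-checked against the top-level sections above before starting the proxy; a minimal sketch, assuming PyYAML is installed and the file is saved as `proxy_server_config.yaml`:
```python
import yaml

EXPECTED_TOP_LEVEL = {"environment_variables", "model_list", "litellm_settings", "general_settings"}

with open("proxy_server_config.yaml") as f:
    config = yaml.safe_load(f) or {}

unknown = set(config) - EXPECTED_TOP_LEVEL
if unknown:
    print(f"Unexpected top-level keys: {unknown}")

for entry in config.get("model_list", []):
    # every deployment needs a user-facing alias and its litellm.completion() params
    assert "model_name" in entry and "litellm_params" in entry
```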


@ -0,0 +1,115 @@
import Image from '@theme/IdealImage';
# Custom Pricing - Sagemaker, etc.
Use this to register custom pricing for models.
There's 2 ways to track cost:
- cost per token
- cost per second
By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async). [**Learn More**](../observability/custom_callback.md)
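For example, a success callback can read that value at runtime; a minimal sketch built on litellm's custom callback hook (the endpoint name mirrors the Quick Start below and is only an example):
```python
import litellm
from litellm import completion

def track_cost_callback(kwargs, completion_response, start_time, end_time):
    # populated by litellm from the pricing registered on the call/config
    print("response_cost:", kwargs.get("response_cost"))

litellm.success_callback = [track_cost_callback]

response = completion(
    model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    input_cost_per_second=0.000420,  # custom pricing, as in the Quick Start below
)
```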
## Quick Start
Register custom pricing for sagemaker completion model.
For cost per second pricing, you **just** need to register `input_cost_per_second`.
```python
# !pip install boto3
import os
from litellm import completion, completion_cost
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
def test_completion_sagemaker():
try:
print("testing sagemaker")
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
input_cost_per_second=0.000420,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
```
### Usage with OpenAI Proxy Server
**Step 1: Add pricing to config.yaml**
```yaml
model_list:
- model_name: sagemaker-completion-model
litellm_params:
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
input_cost_per_second: 0.000420
- model_name: sagemaker-embedding-model
litellm_params:
model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
input_cost_per_second: 0.000420
```
**Step 2: Start proxy**
```bash
litellm /path/to/config.yaml
```
**Step 3: View Spend Logs**
<Image img={require('../../img/spend_logs_table.png')} />
## Cost Per Token (e.g. Azure)
```python
# !pip install boto3
import os
from litellm import completion, completion_cost
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
def test_completion_azure_model():
try:
print("testing azure custom pricing")
# azure call
response = completion(
model = "azure/<your_deployment_name>",
messages = [{ "content": "Hello, how are you?","role": "user"}]
input_cost_per_token=0.005,
output_cost_per_token=1,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
test_completion_azure_model()
```
### Usage with OpenAI Proxy Server
```yaml
model_list:
- model_name: azure-model
litellm_params:
model: azure/<your_deployment_name>
api_key: os.environ/AZURE_API_KEY
api_base: os.environ/AZURE_API_BASE
api_version: os.environ/AZURE_API_VERSION
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
```
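For intuition, per-token pricing resolves to `prompt_tokens * input_cost_per_token + completion_tokens * output_cost_per_token`; a small sketch with hypothetical token counts and the example prices above (litellm's `completion_cost()` does the equivalent bookkeeping from the response's usage block):
```python
input_cost_per_token = 0.000421
output_cost_per_token = 0.000520

prompt_tokens, completion_tokens = 120, 250  # hypothetical usage for one request
cost = prompt_tokens * input_cost_per_token + completion_tokens * output_cost_per_token
print(f"${cost:.6f}")  # -> $0.180520
```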


@ -0,0 +1,34 @@
# Debugging
2 levels of debugging supported.
- debug (prints info logs)
- detailed debug (prints debug logs)
## `debug`
**via cli**
```bash
$ litellm --debug
```
**via env**
```python
os.environ["LITELLM_LOG"] = "INFO"
```
## `detailed debug`
**via cli**
```bash
$ litellm --detailed_debug
```
**via env**
```python
os.environ["LITELLM_LOG"] = "DEBUG"
```


@ -5,8 +5,10 @@ Use this to health check all LLMs defined in your config.yaml
The proxy exposes:
* a /health endpoint which returns the health of the LLM APIs
-* a /test endpoint which makes a ping to the litellm server
+* a /health/readiness endpoint for returning if the proxy is ready to accept requests
* a /health/liveliness endpoint for returning if the proxy is alive
## `/health`
#### Request
Make a GET Request to `/health` on the proxy
```shell
@ -39,7 +41,7 @@ litellm --health
}
```
-## Background Health Checks
+### Background Health Checks
You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`.
@ -61,7 +63,7 @@ $ litellm /path/to/config.yaml
curl --location 'http://0.0.0.0:8000/health'
```
-## Embedding Models
+### Embedding Models
We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check
@ -77,7 +79,7 @@ model_list:
mode: embedding # 👈 ADD THIS
```
-## Text Completion Models
+### Text Completion Models
We need some way to know if the model is a text completion model when running checks, if you have this in your config, specifying mode it makes an embedding health check
@ -92,3 +94,54 @@ model_list:
model_info:
mode: completion # 👈 ADD THIS
```
## `/health/readiness`
Unprotected endpoint for checking if proxy is ready to accept requests
Example Request:
```bash
curl --location 'http://0.0.0.0:8000/health/readiness'
```
Example Response:
*If proxy connected to a database*
```json
{
"status": "healthy",
"db": "connected",
"litellm_version":"1.19.2",
}
```
*If proxy not connected to a database*
```json
{
"status": "healthy",
"db": "Not connected",
"litellm_version":"1.19.2",
}
```
## `/health/liveliness`
Unprotected endpoint for checking if proxy is alive
Example Request:
```
curl -X 'GET' \
'http://0.0.0.0:8000/health/liveliness' \
-H 'accept: application/json'
```
Example Response:
```json
"I'm alive!"
```


@ -1,5 +1,4 @@
# Multiple Instances of 1 model
# Load Balancing - Multiple Instances of 1 model
Load balance multiple instances of the same model
The proxy will handle routing requests (using LiteLLM's Router). **Set `rpm` in the config if you want maximize throughput**


@ -40,115 +40,6 @@ litellm --test
This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints.
### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
<TabItem value="langchain-embedding" label="Langchain Embeddings">
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"TITAN EMBEDDINGS")
print(query_result[:5])
```
</TabItem>
</Tabs>
### Supported LLMs
All LiteLLM supported LLMs are supported on the Proxy. See all [supported llms](https://docs.litellm.ai/docs/providers)
<Tabs>
@ -331,6 +222,113 @@ $ litellm --model command-nightly
</Tabs>
### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
<TabItem value="langchain-embedding" label="Langchain Embeddings">
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"TITAN EMBEDDINGS")
print(query_result[:5])
```
</TabItem>
</Tabs>
## Quick Start - LiteLLM Proxy + Config.yaml


@ -1,8 +1,8 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-# [BETA] Self-serve UI
+# [BETA] Admin UI
Allow your users to create their own keys through a UI
:::info
@ -10,40 +10,94 @@ This is in beta, so things may change. If you have feedback, [let us know](https
:::
Allow your users to create, view their own keys through a UI
<Image img={require('../../img/admin_ui_2.png')} />
## Quick Start
-Requirements:
-- Need to a SMTP server connection to send emails (e.g. [Resend](https://resend.com/docs/send-with-smtp))
-[**See code**](https://github.com/BerriAI/litellm/blob/61cd800b9ffbb02c286481d2056b65c7fb5447bf/litellm/proxy/proxy_server.py#L1782)
-### Step 1. Save SMTP server credentials
-```env
-export SMTP_HOST="my-smtp-host"
-export SMTP_USERNAME="my-smtp-password"
-export SMTP_PASSWORD="my-smtp-password"
-export SMTP_SENDER_EMAIL="krrish@berri.ai"
-```
-### Step 2. Enable user auth
-In your config.yaml,
-```yaml
-general_settings:
-# other changes
-allow_user_auth: true
-```
-This will enable:
-* Users to create keys via `/key/generate` (by default, only admin can create keys)
-* The `/user/auth` endpoint to send user's emails with their login credentials (key + user id)
-### Step 3. Connect to UI
-You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
+## 1. Setup SSO/Auth for UI
+<Tabs>
+<TabItem value="google" label="Google SSO">
+- Create a new Oauth 2.0 Client on https://console.cloud.google.com/
+**Required .env variables on your Proxy**
+```shell
+PROXY_BASE_URL="<your deployed proxy endpoint>" example PROXY_BASE_URL=https://litellm-production-7002.up.railway.app/
+# for Google SSO Login
+GOOGLE_CLIENT_ID=
+GOOGLE_CLIENT_SECRET=
+```
+- Set Redirect URL on your Oauth 2.0 Client on https://console.cloud.google.com/
+- Set a redirect url = `<your proxy base url>/sso/callback`
+```shell
+https://litellm-production-7002.up.railway.app/sso/callback
+```
+</TabItem>
+<TabItem value="msft" label="Microsoft SSO">
+- Create a new App Registration on https://portal.azure.com/
+- Create a client Secret for your App Registration
+**Required .env variables on your Proxy**
+```shell
+PROXY_BASE_URL="<your deployed proxy endpoint>" example PROXY_BASE_URL=https://litellm-production-7002.up.railway.app/
+MICROSOFT_CLIENT_ID="84583a4d-"
+MICROSOFT_CLIENT_SECRET="nbk8Q~"
+MICROSOFT_TENANT="5a39737
+```
+- Set Redirect URI on your App Registration on https://portal.azure.com/
+- Set a redirect url = `<your proxy base url>/sso/callback`
+```shell
+http://localhost:4000/sso/callback
+```
+</TabItem>
+<TabItem value="username" label="Quick Start - Username, Password">
+Set the following in your .env on the Proxy
+```shell
+PROXY_BASE_URL="<your deployed proxy endpoint>" example PROXY_BASE_URL=https://litellm-production-7002.up.railway.app/
+UI_USERNAME=ishaan-litellm
+UI_PASSWORD=langchain
+```
+On accessing the LiteLLM UI, you will be prompted to enter your username, password
+</TabItem>
+</Tabs>
## 2. Start Proxy Server
```shell
litellm --config proxy_config.yaml --port 4000
# start proxy on port 4000
```
## 3. Get Admin UI Link to you on Swagger
Your Proxy Swagger is available on the root of the Proxy: `http://localhost:4000/`
<Image img={require('../../img/ui_link.png')} />
<!-- You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
If you self-host, you need to save the UI url in your proxy environment as `LITELLM_HOSTED_UI`.
@ -62,4 +116,13 @@ Connect your proxy to your UI, by entering:
### Create Keys
<Image img={require('../../img/user_create_key_screen.png')} />
### Spend Per Key
<Image img={require('../../img/spend_per_api_key.png')} />
### Spend Per User
<Image img={require('../../img/spend_per_user.png')} /> -->


@ -1,4 +1,7 @@
-# 💰 Budgets, Rate Limits per user
+import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 💰 Budgets, Rate Limits
Requirements:
@ -6,17 +9,74 @@ Requirements:
## Set Budgets
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
-This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
+You can set budgets at 3 levels:
- For the proxy
- For a user
- For a key
<Tabs>
<TabItem value="proxy" label="For Proxy">
Apply a budget across all calls on the proxy
**Step 1. Modify config.yaml**
```yaml
general_settings:
master_key: sk-1234
litellm_settings:
# other litellm settings
max_budget: 0 # (float) sets max budget as $0 USD
budget_duration: 30d # (str) frequency of reset - You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
**Step 2. Start proxy**
```bash
litellm /path/to/config.yaml
```
**Step 3. Send test call**
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}'
```
</TabItem>
<TabItem value="per-user" label="For User">
Apply a budget across multiple keys.
LiteLLM exposes a `/user/new` endpoint to create budgets for this.
You can:
- Add budgets to users [**Jump**](#add-budgets-to-users)
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-users)
By default the `max_budget` is set to `null` and is not checked for keys
### **Add budgets to users**
```shell
curl --location 'http://localhost:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
**Sample Response**
@ -29,18 +89,163 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
}
```
### **Add budget duration to users**
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
curl 'http://0.0.0.0:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_id": "core-infra", # [OPTIONAL]
"max_budget": 10,
"budget_duration": 10s,
}'
```
### Create new keys for existing user
Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai) and:
- **Budget Check**: krrish3@berri.ai's budget (i.e. $10) will be checked for this key
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
```
</TabItem>
<TabItem value="per-key" label="For Key">
Apply a budget on a key.
You can:
- Add budgets to keys [**Jump**](#add-budgets-to-keys)
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-keys)
**Expected Behaviour**
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
- After the key crosses its `max_budget`, requests fail
- If duration set, spend is reset at the end of the duration
By default the `max_budget` is set to `null` and is not checked for keys
### **Add budgets to keys**
```bash
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_id": "core-infra", # [OPTIONAL]
"max_budget": 10,
}'
```
Example Request to `/chat/completions` when key has crossed budget
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer <generated-key>' \
--data ' {
"model": "azure-gpt-3.5",
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
"messages": [
{
"role": "user",
"content": "respond in 50 lines"
}
],
}'
```
Expected Response from `/chat/completions` when key has crossed budget
```shell
{
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
}
```
### **Add budget duration to keys**
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_id": "core-infra", # [OPTIONAL]
"max_budget": 10,
"budget_duration": 10s,
}'
```
</TabItem>
</Tabs>
## Set Rate Limits
-Set max parallel requests a user can make, when you create user keys - `/key/generate`.
+You can set:
- max parallel requests
- tpm limits
- rpm limits
<Tabs>
<TabItem value="per-user" label="Per User">
Use `/user/new`, to persist rate limits across multiple keys.
```shell
curl --location 'http://0.0.0.0:8000/user/new' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
```
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
**Expected Response**
```json
{
"key": "sk-sA7VDkyhlQ7m8Gt77Mbt3Q",
"expires": "2024-01-19T01:21:12.816168",
"user_id": "krrish@berri.ai",
}
```
</TabItem>
<TabItem value="per-key" label="Per Key">
Use `/key/generate`, if you want them for just that key.
```shell
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
---data '{"duration": "20m", "max_parallel_requests": 1}' # 👈 max parallel requests = 1
+--data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
```
**Expected Response**
```json
{
"key": "sk-ulGNRXWtv7M0lFnnsQk0wQ",
"expires": "2024-01-18T20:48:44.297973",
"user_id": "78c2c8fc-c233-43b9-b0c3-eb931da27b84" // 👈 auto-generated
}
```
</TabItem>
</Tabs>
## Grant Access to new model
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.).


@ -1,4 +1,4 @@
-# Key Management
+# Virtual Keys
Track Spend, Set budgets and create virtual keys for the proxy
Grant others temporary access to your proxy, with keys that expire after a set duration.
@ -12,7 +12,7 @@ Grant other's temporary access to your proxy, with keys that expire after a set
:::
-## Quick Start
+## Setup
Requirements:
@ -58,36 +58,53 @@ litellm --config /path/to/config.yaml
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
---data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}'
+--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
```
- `models`: *list or null (optional)* - Specify the models a token has access too. If null, then token has access to all models on server.
-- `duration`: *str or null (optional)* Specify the length of time the token is valid for. If null, default is set to 1 hour. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
+## /key/generate
-- `metadata`: *dict or null (optional)* Pass metadata for the created token. If null defaults to {}
+### Request
```shell
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"duration": "20m",
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra",
"max_budget": 10,
}'
```
Expected response:
Request Params:
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
- `key_alias`: *Optional[str]* - User defined key alias
- `team_id`: *Optional[str]* - The team id of the user
- `models`: *Optional[list]* - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
### Response
```python
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
"expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
...
}
```
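The same request from Python; a sketch that assumes the proxy is reachable on `0.0.0.0:8000` and that `sk-1234` is the master key (both example values):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:8000/key/generate",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={
        "models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
        "duration": "20m",
        "metadata": {"user": "ishaan@berri.ai"},
        "team_id": "core-infra",
        "max_budget": 10,
    },
    timeout=30,
)
key_data = resp.json()
print(key_data["key"], key_data["expires"])
```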
-## Keys that don't expire
+### Upgrade/Downgrade Models
Just set duration to None.
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
```
## Upgrade/Downgrade Models
If a user is expected to use a given model (i.e. gpt3-5), and you want to:
@ -137,7 +154,7 @@ curl -X POST "https://0.0.0.0:8000/key/generate" \
- **How are routing between diff keys/api bases done?** litellm handles this by shuffling between different models in the model list with the same model_name. [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
-## Grant Access to new model
+### Grant Access to new model
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.)
@ -165,6 +182,188 @@ curl --location 'http://localhost:8000/key/generate' \
"max_budget": 0,}' "max_budget": 0,}'
``` ```
## /key/info
### Request
```shell
curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
-H "Authorization: Bearer sk-1234"
```
Request Params:
- key: str - The key you want the info for
### Response
`token` is the hashed key (The DB stores the hashed key for security)
```json
{
"key": "sk-02Wr4IAlN3NvPXvL5JVvDA",
"info": {
"token": "80321a12d03412c527f2bd9db5fabd746abead2e1d50b435a534432fbaca9ef5",
"spend": 0.0,
"expires": "2024-01-18T23:52:09.125000+00:00",
"models": ["azure-gpt-3.5", "azure-embedding-model"],
"aliases": {},
"config": {},
"user_id": "ishaan2@berri.ai",
"team_id": "None",
"max_parallel_requests": null,
"metadata": {}
}
}
```
## /key/update
### Request
```shell
curl 'http://0.0.0.0:8000/key/update' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra"
}'
```
Request Params:
- key: str - The key that needs to be updated.
- models: list or null (optional) - Specify the models a token has access to. If null, then the token has access to all models on the server.
- metadata: dict or null (optional) - Pass metadata for the updated token. If null, defaults to an empty dictionary.
- team_id: str or null (optional) - Specify the team_id for the associated key.
### Response
```json
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"metadata": {
"user": "ishaan@berri.ai"
}
}
```
## /key/delete
### Request
```shell
curl 'http://0.0.0.0:8000/key/delete' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
}'
```
Request Params:
- keys: List[str] - List of keys to delete
### Response
```json
{
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
}
```
## Default /key/generate params
Use this, if you need to control the default `max_budget` or any `key/generate` param per key.
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
Set `litellm_settings:default_key_generate_params`:
```yaml
litellm_settings:
default_key_generate_params:
max_budget: 1.5000
models: ["azure-gpt-3.5"]
duration: # blank means `null`
metadata: {"setting":"default"}
team_id: "core-infra"
```
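To sanity-check the defaults, here's a hedged sketch of what you'd expect when generating a key with an empty body against a proxy started with the config above (the asserted values come from the YAML defaults shown, and the response field names are assumptions):
```python
import requests

PROXY_BASE = "http://0.0.0.0:8000"   # assumption: proxy started with the config above
MASTER_KEY = "<your-master-key>"     # assumption

# no max_budget / models / team_id in the request body
new_key = requests.post(
    f"{PROXY_BASE}/key/generate",
    headers={"Authorization": f"Bearer {MASTER_KEY}"},
    json={},
).json()

# the defaults from litellm_settings.default_key_generate_params should apply
print(new_key.get("max_budget"))  # expected: 1.5
print(new_key.get("models"))      # expected: ["azure-gpt-3.5"]
```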
## Set Budgets - Per Key
Set the `max_budget` param (in USD) in the `/key/generate` request. By default, `max_budget` is set to `null` and is not checked for keys.
```shell
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra",
"max_budget": 10,
}'
```
#### Expected Behaviour
- Costs per key are auto-populated in the `LiteLLM_VerificationToken` table
- After the key crosses its `max_budget`, requests fail
Example Request to `/chat/completions` when key has crossed budget
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
--data ' {
"model": "azure-gpt-3.5",
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
"messages": [
{
"role": "user",
"content": "respond in 50 lines"
}
]
}'
```
Expected Response from `/chat/completions` when key has crossed budget
```json
{
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
}
```
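If you call the proxy through the OpenAI Python SDK instead of curl, the same failure surfaces as an exception you can catch - a sketch, assuming an OpenAI v1 client pointed at the proxy (the key and model name are the placeholders used above):
```python
import openai

client = openai.OpenAI(
    api_key="sk-ULl_IKCVFy2EZRzQB16RUA",  # a key that has crossed its max_budget
    base_url="http://0.0.0.0:8000",       # the LiteLLM proxy
)

try:
    client.chat.completions.create(
        model="azure-gpt-3.5",
        messages=[{"role": "user", "content": "respond in 50 lines"}],
    )
except openai.OpenAIError as e:
    # the proxy returns the ExceededTokenBudget detail shown above
    print("Budget exceeded:", e)
```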
## Set Budgets - Per User
LiteLLM exposes a `/user/new` endpoint to create budgets for users that persist across multiple keys.
This is documented in the Swagger docs (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
```shell
curl --location 'http://localhost:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
**Sample Response**
```json
{
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
"expires": "2023-12-22T09:53:13.861000Z",
"user_id": "krrish3@berri.ai",
"max_budget": 0.0
}
```
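Since the user budget persists across keys, one workflow is to create the user once and then issue additional keys for that `user_id`; a rough Python sketch (URLs and ids are placeholders):
```python
import requests

PROXY_BASE = "http://localhost:8000"                      # assumption: proxy running locally
HEADERS = {"Authorization": "Bearer <your-master-key>"}   # assumption

# 1. create the user with a budget that persists across keys
user = requests.post(
    f"{PROXY_BASE}/user/new",
    headers=HEADERS,
    json={"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"},
).json()

# 2. issue another key for the same user_id - its spend is tracked
#    against the same user-level budget
extra_key = requests.post(
    f"{PROXY_BASE}/key/generate",
    headers=HEADERS,
    json={"models": ["azure-models"], "user_id": "krrish3@berri.ai"},
).json()

print(user["key"], extra_key["key"])
```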
## Tracking Spend
You can get spend for a key by using the `/key/info` endpoint.
@ -200,32 +399,6 @@ This is automatically updated (in USD) when calls are made to /completions, /cha
``` ```
## Set Budgets
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
```shell
curl --location 'http://localhost:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
**Sample Response**
```shell
{
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
"expires": "2023-12-22T09:53:13.861000Z",
"user_id": "krrish3@berri.ai",
"max_budget": 0.0
}
```
## Custom Auth
You can now override the default API key auth.
@ -275,6 +448,97 @@ general_settings:
$ litellm --config /path/to/config.yaml $ litellm --config /path/to/config.yaml
``` ```
## Custom /key/generate
If you need to add custom logic before generating a Proxy API Key (for example, validating the `team_id`):
### 1. Write a custom `custom_generate_key_fn`
The input to the custom_generate_key_fn function is a single parameter: `data` [(Type: GenerateKeyRequest)](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/_types.py#L125)
The output of your `custom_generate_key_fn` should be a dictionary with the following structure
```python
{
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
```
- decision (Type: bool): A boolean value indicating whether the key generation is allowed (True) or not (False).
- message (Type: str, Optional): An optional message providing additional information about the decision. This field is included when the decision is False.
```python
from litellm.proxy._types import GenerateKeyRequest


async def custom_generate_key_fn(data: GenerateKeyRequest) -> dict:
"""
Asynchronous function for generating a key based on the input data.
Args:
data (GenerateKeyRequest): The input data for key generation.
Returns:
dict: A dictionary containing the decision and an optional message.
{
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
"""
# decide if a key should be generated or not
print("using custom auth function!")
data_json = data.json() # type: ignore
# Unpacking variables
team_id = data_json.get("team_id")
duration = data_json.get("duration")
models = data_json.get("models")
aliases = data_json.get("aliases")
config = data_json.get("config")
spend = data_json.get("spend")
user_id = data_json.get("user_id")
max_parallel_requests = data_json.get("max_parallel_requests")
metadata = data_json.get("metadata")
tpm_limit = data_json.get("tpm_limit")
rpm_limit = data_json.get("rpm_limit")
if team_id is not None and team_id == "litellm-core-infra@gmail.com":
# only team_id="litellm-core-infra@gmail.com" can make keys
return {
"decision": True,
}
else:
print("Failed custom auth")
return {
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
```
### 2. Pass the filepath (relative to the config.yaml)
Pass the filepath of the file containing your `custom_generate_key_fn` in the config.yaml.
e.g. if they're both in the same dir - `./config.yaml` and `./custom_auth.py`, this is what it looks like:
```yaml
model_list:
- model_name: "openai-model"
litellm_params:
model: "gpt-3.5-turbo"
litellm_settings:
drop_params: True
set_verbose: True
general_settings:
custom_key_generate: custom_auth.custom_generate_key_fn
```
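Before starting the proxy, you can sanity-check the hook by calling it directly; a minimal sketch, assuming the function above lives in `./custom_auth.py`:
```python
# quick local check - not part of the proxy itself
import asyncio

from litellm.proxy._types import GenerateKeyRequest
from custom_auth import custom_generate_key_fn

async def main():
    allowed = await custom_generate_key_fn(
        GenerateKeyRequest(team_id="litellm-core-infra@gmail.com")
    )
    denied = await custom_generate_key_fn(GenerateKeyRequest(team_id="another-team"))
    print(allowed)  # {"decision": True}
    print(denied)   # {"decision": False, "message": "This violates ..."}

asyncio.run(main())
```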
## [BETA] Dynamo DB

View file

@ -302,6 +302,7 @@ asyncio.run(router_acompletion())
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
**Global Timeouts**
```python ```python
from litellm import Router from litellm import Router
@ -313,6 +314,36 @@ router = Router(model_list=model_list,
print(response) print(response)
``` ```
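For reference, a complete global-timeout setup looks roughly like this (the deployment details are placeholders and the `timeout` value is illustrative):
```python
import os
from litellm import Router

model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {
        "model": "azure/chatgpt-v-2",
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE"),
    },
}]

# applies to the entire length of each call and is passed
# down to the underlying completion() call as well
router = Router(model_list=model_list, timeout=30)

response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response)
```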
**Timeouts per model**
```python
import os
import asyncio

from litellm import Router
model_list = [{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"timeout": 300 # sets a 5 minute timeout
"stream_timeout": 30 # sets a 30s timeout for streaming calls
}
}]
# init router
router = Router(model_list=model_list, routing_strategy="least-busy")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
return response
asyncio.run(router_acompletion())
```
### Cooldowns
Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute.
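A hedged sketch using the `allowed_fails` and `cooldown_time` parameters from the Router init params further below (the threshold and cooldown values are illustrative):
```python
import os
from litellm import Router

model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {"model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY")},
}]

# after 1 failure within a minute, the deployment is taken out of the
# rotation ("cooled down") for cooldown_time seconds
router = Router(model_list=model_list, allowed_fails=1, cooldown_time=60)
```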
@ -574,6 +605,49 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
print(f"response: {response}") print(f"response: {response}")
``` ```
## Custom Callbacks - Track API Key, API Endpoint, Model Used
If you need to track the `api_key`, API endpoint, model, and `custom_llm_provider` used for each completion call, you can set up a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback)
### Usage
```python
import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger
class MyCustomHandler(CustomLogger):
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Success")
print("kwargs=", kwargs)
litellm_params= kwargs.get("litellm_params")
api_key = litellm_params.get("api_key")
api_base = litellm_params.get("api_base")
custom_llm_provider= litellm_params.get("custom_llm_provider")
response_cost = kwargs.get("response_cost")
# print the values
print("api_key=", api_key)
print("api_base=", api_base)
print("custom_llm_provider=", custom_llm_provider)
print("response_cost=", response_cost)
    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        print("On Failure")
        print("kwargs=", kwargs)
customHandler = MyCustomHandler()
litellm.callbacks = [customHandler]
# Init Router
router = Router(model_list=model_list, routing_strategy="simple-shuffle")
# router completion call
response = router.completion(
model="gpt-3.5-turbo",
messages=[{ "role": "user", "content": "Hi who are you"}]
)
```
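If you mostly use `router.acompletion`, the async hooks on `CustomLogger` can capture the same fields; a sketch, assuming your litellm version exposes `async_log_success_event`:
```python
import litellm
from litellm.integrations.custom_logger import CustomLogger

class MyAsyncHandler(CustomLogger):
    # assumption: async_log_success_event is available in your litellm version
    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        litellm_params = kwargs.get("litellm_params", {}) or {}
        print("api_key=", litellm_params.get("api_key"))
        print("api_base=", litellm_params.get("api_base"))
        print("response_cost=", kwargs.get("response_cost"))

litellm.callbacks = [MyAsyncHandler()]
```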
## Deploy Router ## Deploy Router
@ -602,17 +676,63 @@ def __init__(
num_retries: int = 0, num_retries: int = 0,
timeout: Optional[float] = None, timeout: Optional[float] = None,
default_litellm_params={}, # default params for Router.chat.completion.create default_litellm_params={}, # default params for Router.chat.completion.create
set_verbose: bool = False,
fallbacks: List = [], fallbacks: List = [],
allowed_fails: Optional[int] = None, allowed_fails: Optional[int] = None, # Number of times a deployment can fail before being added to cooldown
cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
context_window_fallbacks: List = [], context_window_fallbacks: List = [],
model_group_alias: Optional[dict] = {}, model_group_alias: Optional[dict] = {},
retry_after: int = 0, # min time to wait before retrying a failed request retry_after: int = 0, # (min) time to wait before retrying a failed request
routing_strategy: Literal[ routing_strategy: Literal[
"simple-shuffle", "simple-shuffle",
"least-busy", "least-busy",
"usage-based-routing", "usage-based-routing",
"latency-based-routing", "latency-based-routing",
] = "simple-shuffle", ] = "simple-shuffle",
## DEBUGGING ##
set_verbose: bool = False, # set this to True for seeing logs
debug_level: Literal["DEBUG", "INFO"] = "INFO", # set this to "DEBUG" for detailed debugging
): ):
``` ```
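Putting a few of these knobs together - a hedged example configuration (the values are illustrative, not recommendations):
```python
import os
from litellm import Router

model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {"model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY")},
}]

router = Router(
    model_list=model_list,
    num_retries=2,                          # retry a failed request twice
    timeout=30,                             # overall per-call timeout (seconds)
    allowed_fails=1,                        # failures per minute before cooldown
    cooldown_time=60,                       # seconds a deployment stays cooled down
    retry_after=5,                          # minimum wait before retrying a failed request
    routing_strategy="usage-based-routing",
    set_verbose=True,
    debug_level="INFO",
)
```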
## Debugging Router
### Basic Debugging
Set `Router(set_verbose=True)`
```python
from litellm import Router
router = Router(
model_list=model_list,
set_verbose=True
)
```
### Detailed Debugging
Set `Router(set_verbose=True, debug_level="DEBUG")`
```python
from litellm import Router
router = Router(
model_list=model_list,
set_verbose=True,
debug_level="DEBUG" # defaults to INFO
)
```
### Very Detailed Debugging
Set `litellm.set_verbose=True` and `Router(set_verbose=True, debug_level="DEBUG")`
```python
from litellm import Router
import litellm
litellm.set_verbose = True
router = Router(
model_list=model_list,
set_verbose=True,
debug_level="DEBUG" # defaults to INFO
)
```

(7 new binary image files added - 159 KiB, 351 KiB, 297 KiB, 189 KiB, 468 KiB, 249 KiB, 69 KiB - content not shown)
View file

@ -104,24 +104,49 @@ const sidebars = {
items: [ items: [
"proxy/quick_start", "proxy/quick_start",
"proxy/configs", "proxy/configs",
{
type: 'link',
label: '📖 All Endpoints',
href: 'https://litellm-api.up.railway.app/',
},
"proxy/user_keys", "proxy/user_keys",
"proxy/load_balancing",
"proxy/virtual_keys", "proxy/virtual_keys",
"proxy/users", "proxy/users",
"proxy/ui", "proxy/ui",
"proxy/model_management", "proxy/model_management",
"proxy/reliability",
"proxy/caching",
"proxy/logging",
"proxy/health", "proxy/health",
"proxy/call_hooks", "proxy/debugging",
"proxy/rules", {
"proxy/alerting", "type": "category",
"proxy/streaming_logging", "label": "🔥 Load Balancing",
"items": [
"proxy/load_balancing",
"proxy/reliability",
]
},
{
"type": "category",
"label": "Logging, Alerting, Caching",
"items": [
"proxy/logging",
"proxy/alerting",
"proxy/streaming_logging",
"proxy/caching",
]
},
{
"type": "category",
"label": "Admin Controls",
"items": [
"proxy/call_hooks",
"proxy/rules",
]
},
"proxy/deploy", "proxy/deploy",
"proxy/cli", "proxy/cli",
] ]
}, },
"proxy/custom_pricing",
"routing", "routing",
"rules", "rules",
"set_keys", "set_keys",

View file

@ -2,10 +2,14 @@
import threading, requests import threading, requests
from typing import Callable, List, Optional, Dict, Union, Any from typing import Callable, List, Optional, Dict, Union, Any
from litellm.caching import Cache from litellm.caching import Cache
from litellm._logging import set_verbose from litellm._logging import set_verbose, _turn_on_debug
from litellm.proxy._types import KeyManagementSystem from litellm.proxy._types import KeyManagementSystem
import httpx import httpx
#############################################
if set_verbose == True:
_turn_on_debug()
#############################################
input_callback: List[Union[str, Callable]] = [] input_callback: List[Union[str, Callable]] = []
success_callback: List[Union[str, Callable]] = [] success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = [] failure_callback: List[Union[str, Callable]] = []
@ -58,6 +62,9 @@ cache: Optional[
model_alias_map: Dict[str, str] = {} model_alias_map: Dict[str, str] = {}
model_group_alias_map: Dict[str, str] = {} model_group_alias_map: Dict[str, str] = {}
max_budget: float = 0.0 # set the max budget across all providers max_budget: float = 0.0 # set the max budget across all providers
budget_duration: Optional[
str
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
_openai_completion_params = [ _openai_completion_params = [
"functions", "functions",
"function_call", "function_call",
@ -136,6 +143,7 @@ model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/mai
suppress_debug_info = False suppress_debug_info = False
dynamodb_table_name: Optional[str] = None dynamodb_table_name: Optional[str] = None
s3_callback_params: Optional[Dict] = None s3_callback_params: Optional[Dict] = None
default_key_generate_params: Optional[Dict] = None
#### RELIABILITY #### #### RELIABILITY ####
request_timeout: Optional[float] = 6000 request_timeout: Optional[float] = 6000
num_retries: Optional[int] = None # per model endpoint num_retries: Optional[int] = None # per model endpoint

View file

@ -7,20 +7,14 @@ handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG) handler.setLevel(logging.DEBUG)
# Create a formatter and set it for the handler # Create a formatter and set it for the handler
formatter = logging.Formatter(
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(message)s",
datefmt="%H:%M:%S",
)
formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
handler.setFormatter(formatter) handler.setFormatter(formatter)
def print_verbose(print_statement):
try:
if set_verbose:
print(print_statement) # noqa
except:
pass
verbose_proxy_logger = logging.getLogger("LiteLLM Proxy") verbose_proxy_logger = logging.getLogger("LiteLLM Proxy")
verbose_router_logger = logging.getLogger("LiteLLM Router") verbose_router_logger = logging.getLogger("LiteLLM Router")
verbose_logger = logging.getLogger("LiteLLM") verbose_logger = logging.getLogger("LiteLLM")
@ -28,3 +22,18 @@ verbose_logger = logging.getLogger("LiteLLM")
# Add the handler to the logger # Add the handler to the logger
verbose_router_logger.addHandler(handler) verbose_router_logger.addHandler(handler)
verbose_proxy_logger.addHandler(handler) verbose_proxy_logger.addHandler(handler)
verbose_logger.addHandler(handler)
def _turn_on_debug():
verbose_logger.setLevel(level=logging.DEBUG) # set package log to debug
verbose_router_logger.setLevel(level=logging.DEBUG) # set router logs to debug
verbose_proxy_logger.setLevel(level=logging.DEBUG) # set proxy logs to debug
def print_verbose(print_statement):
try:
if set_verbose:
print(print_statement) # noqa
except:
pass

View file

@ -1,3 +1,12 @@
# +-----------------------------------------------+
# | |
# | NOT PROXY BUDGET MANAGER |
# | proxy budget manager is in proxy_server.py |
# | |
# +-----------------------------------------------+
#
# Thank you users! We ❤️ you! - Krrish & Ishaan
import os, json, time import os, json, time
import litellm import litellm
from litellm.utils import ModelResponse from litellm.utils import ModelResponse
@ -11,10 +20,12 @@ class BudgetManager:
project_name: str, project_name: str,
client_type: str = "local", client_type: str = "local",
api_base: Optional[str] = None, api_base: Optional[str] = None,
headers: Optional[dict] = None,
): ):
self.client_type = client_type self.client_type = client_type
self.project_name = project_name self.project_name = project_name
self.api_base = api_base or "https://api.litellm.ai" self.api_base = api_base or "https://api.litellm.ai"
self.headers = headers or {"Content-Type": "application/json"}
## load the data or init the initial dictionaries ## load the data or init the initial dictionaries
self.load_data() self.load_data()
@ -43,7 +54,7 @@ class BudgetManager:
url = self.api_base + "/get_budget" url = self.api_base + "/get_budget"
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
data = {"project_name": self.project_name} data = {"project_name": self.project_name}
response = requests.post(url, headers=headers, json=data) response = requests.post(url, headers=self.headers, json=data)
response = response.json() response = response.json()
if response["status"] == "error": if response["status"] == "error":
self.user_dict = ( self.user_dict = (
@ -201,6 +212,6 @@ class BudgetManager:
url = self.api_base + "/set_budget" url = self.api_base + "/set_budget"
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
data = {"project_name": self.project_name, "user_dict": self.user_dict} data = {"project_name": self.project_name, "user_dict": self.user_dict}
response = requests.post(url, headers=headers, json=data) response = requests.post(url, headers=self.headers, json=data)
response = response.json() response = response.json()
return response return response

View file

@ -12,10 +12,12 @@ import time, logging
import json, traceback, ast, hashlib import json, traceback, ast, hashlib
from typing import Optional, Literal, List, Union, Any from typing import Optional, Literal, List, Union, Any
from openai._models import BaseModel as OpenAIObject from openai._models import BaseModel as OpenAIObject
from litellm._logging import verbose_logger
def print_verbose(print_statement): def print_verbose(print_statement):
try: try:
verbose_logger.debug(print_statement)
if litellm.set_verbose: if litellm.set_verbose:
print(print_statement) # noqa print(print_statement) # noqa
except: except:
@ -129,11 +131,13 @@ class S3Cache(BaseCache):
s3_aws_secret_access_key=None, s3_aws_secret_access_key=None,
s3_aws_session_token=None, s3_aws_session_token=None,
s3_config=None, s3_config=None,
s3_path=None,
**kwargs, **kwargs,
): ):
import boto3 import boto3
self.bucket_name = s3_bucket_name self.bucket_name = s3_bucket_name
self.key_prefix = s3_path.rstrip("/") + "/" if s3_path else ""
# Create an S3 client with custom endpoint URL # Create an S3 client with custom endpoint URL
self.s3_client = boto3.client( self.s3_client = boto3.client(
"s3", "s3",
@ -155,6 +159,8 @@ class S3Cache(BaseCache):
ttl = kwargs.get("ttl", None) ttl = kwargs.get("ttl", None)
# Convert value to JSON before storing in S3 # Convert value to JSON before storing in S3
serialized_value = json.dumps(value) serialized_value = json.dumps(value)
key = self.key_prefix + key
if ttl is not None: if ttl is not None:
cache_control = f"immutable, max-age={ttl}, s-maxage={ttl}" cache_control = f"immutable, max-age={ttl}, s-maxage={ttl}"
import datetime import datetime
@ -171,7 +177,7 @@ class S3Cache(BaseCache):
CacheControl=cache_control, CacheControl=cache_control,
ContentType="application/json", ContentType="application/json",
ContentLanguage="en", ContentLanguage="en",
ContentDisposition=f"inline; filename=\"{key}.json\"" ContentDisposition=f'inline; filename="{key}.json"',
) )
else: else:
cache_control = "immutable, max-age=31536000, s-maxage=31536000" cache_control = "immutable, max-age=31536000, s-maxage=31536000"
@ -183,7 +189,7 @@ class S3Cache(BaseCache):
CacheControl=cache_control, CacheControl=cache_control,
ContentType="application/json", ContentType="application/json",
ContentLanguage="en", ContentLanguage="en",
ContentDisposition=f"inline; filename=\"{key}.json\"" ContentDisposition=f'inline; filename="{key}.json"',
) )
except Exception as e: except Exception as e:
# NON blocking - notify users S3 is throwing an exception # NON blocking - notify users S3 is throwing an exception
@ -193,6 +199,8 @@ class S3Cache(BaseCache):
import boto3, botocore import boto3, botocore
try: try:
key = self.key_prefix + key
print_verbose(f"Get S3 Cache: key: {key}") print_verbose(f"Get S3 Cache: key: {key}")
# Download the data from S3 # Download the data from S3
cached_response = self.s3_client.get_object( cached_response = self.s3_client.get_object(

View file

@ -8,6 +8,8 @@ from datetime import datetime
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv
import traceback import traceback
from packaging.version import Version from packaging.version import Version
from litellm._logging import verbose_logger
import litellm
class LangFuseLogger: class LangFuseLogger:
@ -33,6 +35,26 @@ class LangFuseLogger:
debug=self.langfuse_debug, debug=self.langfuse_debug,
) )
if os.getenv("UPSTREAM_LANGFUSE_SECRET_KEY") is not None:
self.upstream_langfuse_secret_key = os.getenv(
"UPSTREAM_LANGFUSE_SECRET_KEY"
)
self.upstream_langfuse_public_key = os.getenv(
"UPSTREAM_LANGFUSE_PUBLIC_KEY"
)
self.upstream_langfuse_host = os.getenv("UPSTREAM_LANGFUSE_HOST")
self.upstream_langfuse_release = os.getenv("UPSTREAM_LANGFUSE_RELEASE")
self.upstream_langfuse_debug = os.getenv("UPSTREAM_LANGFUSE_DEBUG")
self.upstream_langfuse = Langfuse(
public_key=self.upstream_langfuse_public_key,
secret_key=self.upstream_langfuse_secret_key,
host=self.upstream_langfuse_host,
release=self.upstream_langfuse_release,
debug=self.upstream_langfuse_debug,
)
else:
self.upstream_langfuse = None
def log_event( def log_event(
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
): ):
@ -62,11 +84,15 @@ class LangFuseLogger:
pass pass
# end of processing langfuse ######################## # end of processing langfuse ########################
input = prompt if kwargs.get("call_type", None) == "embedding" or isinstance(
output = response_obj["choices"][0]["message"].json() response_obj, litellm.EmbeddingResponse
print_verbose( ):
f"OUTPUT IN LANGFUSE: {output}; original: {response_obj['choices'][0]['message']}" input = prompt
) output = response_obj["data"]
else:
input = prompt
output = response_obj["choices"][0]["message"].json()
print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
self._log_langfuse_v2( self._log_langfuse_v2(
user_id, user_id,
metadata, metadata,
@ -77,6 +103,7 @@ class LangFuseLogger:
optional_params, optional_params,
input, input,
response_obj, response_obj,
print_verbose,
) if self._is_langfuse_v2() else self._log_langfuse_v1( ) if self._is_langfuse_v2() else self._log_langfuse_v1(
user_id, user_id,
metadata, metadata,
@ -93,6 +120,7 @@ class LangFuseLogger:
print_verbose( print_verbose(
f"Langfuse Layer Logging - final response object: {response_obj}" f"Langfuse Layer Logging - final response object: {response_obj}"
) )
verbose_logger.info(f"Langfuse Layer Logging - logging success")
except: except:
traceback.print_exc() traceback.print_exc()
print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}") print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}")
@ -165,28 +193,39 @@ class LangFuseLogger:
optional_params, optional_params,
input, input,
response_obj, response_obj,
print_verbose,
): ):
import langfuse import langfuse
tags = [] tags = []
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3") supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")
print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")
generation_name = metadata.get("generation_name", None)
if generation_name is None:
# just log `litellm-{call_type}` as the generation name
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
trace_params = { trace_params = {
"name": metadata.get("generation_name", "litellm-completion"), "name": generation_name,
"input": input, "input": input,
"output": output, "output": output,
"user_id": metadata.get("trace_user_id", user_id), "user_id": metadata.get("trace_user_id", user_id),
"id": metadata.get("trace_id", None), "id": metadata.get("trace_id", None),
} }
cost = kwargs["response_cost"]
print_verbose(f"trace: {cost}")
if supports_tags: if supports_tags:
for key, value in metadata.items(): for key, value in metadata.items():
tags.append(f"{key}:{value}") tags.append(f"{key}:{value}")
if "cache_hit" in kwargs:
tags.append(f"cache_hit:{kwargs['cache_hit']}")
trace_params.update({"tags": tags}) trace_params.update({"tags": tags})
trace = self.Langfuse.trace(**trace_params) trace = self.Langfuse.trace(**trace_params)
trace.generation( trace.generation(
name=metadata.get("generation_name", "litellm-completion"), name=generation_name,
id=metadata.get("generation_id", None), id=metadata.get("generation_id", None),
startTime=start_time, startTime=start_time,
endTime=end_time, endTime=end_time,
@ -197,6 +236,30 @@ class LangFuseLogger:
usage={ usage={
"prompt_tokens": response_obj["usage"]["prompt_tokens"], "prompt_tokens": response_obj["usage"]["prompt_tokens"],
"completion_tokens": response_obj["usage"]["completion_tokens"], "completion_tokens": response_obj["usage"]["completion_tokens"],
"total_cost": cost if supports_costs else None,
}, },
metadata=metadata, metadata=metadata,
) )
if self.upstream_langfuse:
# user wants to log RAW LLM API call in 2nd langfuse project
# key change - model=response_obj["model"], instead of input model used
# this is useful for litellm proxy, where users need to see analytics on their LLM Endpoints
trace = self.upstream_langfuse.trace(**trace_params)
trace.generation(
name=generation_name,
id=metadata.get("generation_id", None),
startTime=start_time,
endTime=end_time,
model=response_obj["model"],
modelParameters=optional_params,
input=input,
output=output,
usage={
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
"completion_tokens": response_obj["usage"]["completion_tokens"],
},
metadata=metadata,
)

View file

@ -13,19 +13,22 @@ class LangsmithLogger:
# Class variables or attributes # Class variables or attributes
def __init__(self): def __init__(self):
self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY") self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
self.langsmith_project = os.getenv("LANGSMITH_PROJECT", "litellm-completion")
self.langsmith_default_run_name = os.getenv(
"LANGSMITH_DEFAULT_RUN_NAME", "LLMRun"
)
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose): def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
# Method definition # Method definition
# inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb # inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb
metadata = {} metadata = kwargs.get('litellm_params', {}).get("metadata", {}) or {} # if metadata is None
if "litellm_params" in kwargs:
metadata = kwargs["litellm_params"].get("metadata", {})
# set project name and run_name for langsmith logging # set project name and run_name for langsmith logging
# users can pass project_name and run name to litellm.completion() # users can pass project_name and run name to litellm.completion()
# Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"}) # Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"})
# if not set litellm will use default project_name = litellm-completion, run_name = LLMRun # if not set litellm will fallback to the environment variable LANGSMITH_PROJECT, then to the default project_name = litellm-completion, run_name = LLMRun
project_name = metadata.get("project_name", "litellm-completion") project_name = metadata.get("project_name", self.langsmith_project)
run_name = metadata.get("run_name", "LLMRun") run_name = metadata.get("run_name", self.langsmith_default_run_name)
print_verbose( print_verbose(
f"Langsmith Logging - project_name: {project_name}, run_name {run_name}" f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
) )

View file

@ -8,7 +8,7 @@ dotenv.load_dotenv() # Loading env variables using dotenv
import traceback import traceback
import datetime, subprocess, sys import datetime, subprocess, sys
import litellm, uuid import litellm, uuid
from litellm._logging import print_verbose from litellm._logging import print_verbose, verbose_logger
class S3Logger: class S3Logger:
@ -31,7 +31,9 @@ class S3Logger:
import boto3 import boto3
try: try:
print_verbose("in init s3 logger") verbose_logger.debug(
f"in init s3 logger - s3_callback_params {litellm.s3_callback_params}"
)
if litellm.s3_callback_params is not None: if litellm.s3_callback_params is not None:
# read in .env variables - example os.environ/AWS_BUCKET_NAME # read in .env variables - example os.environ/AWS_BUCKET_NAME
@ -42,7 +44,7 @@ class S3Logger:
s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name") s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name")
s3_region_name = litellm.s3_callback_params.get("s3_region_name") s3_region_name = litellm.s3_callback_params.get("s3_region_name")
s3_api_version = litellm.s3_callback_params.get("s3_api_version") s3_api_version = litellm.s3_callback_params.get("s3_api_version")
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl") s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl", True)
s3_verify = litellm.s3_callback_params.get("s3_verify") s3_verify = litellm.s3_callback_params.get("s3_verify")
s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url") s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url")
s3_aws_access_key_id = litellm.s3_callback_params.get( s3_aws_access_key_id = litellm.s3_callback_params.get(
@ -59,6 +61,7 @@ class S3Logger:
self.bucket_name = s3_bucket_name self.bucket_name = s3_bucket_name
self.s3_path = s3_path self.s3_path = s3_path
verbose_logger.debug(f"s3 logger using endpoint url {s3_endpoint_url}")
# Create an S3 client with custom endpoint URL # Create an S3 client with custom endpoint URL
self.s3_client = boto3.client( self.s3_client = boto3.client(
"s3", "s3",
@ -84,7 +87,9 @@ class S3Logger:
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose): def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
try: try:
print_verbose(f"s3 Logging - Enters logging function for model {kwargs}") verbose_logger.debug(
f"s3 Logging - Enters logging function for model {kwargs}"
)
# construct payload to send to s3 # construct payload to send to s3
# follows the same params as langfuse.py # follows the same params as langfuse.py
@ -129,6 +134,7 @@ class S3Logger:
+ "-time=" + "-time="
+ str(start_time) + str(start_time)
) # we need the s3 key to include the time, so we log cache hits too ) # we need the s3 key to include the time, so we log cache hits too
s3_object_key += ".json"
import json import json
@ -151,5 +157,5 @@ class S3Logger:
return response return response
except Exception as e: except Exception as e:
traceback.print_exc() traceback.print_exc()
print_verbose(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}") verbose_logger.debug(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
pass pass

View file

@ -78,7 +78,7 @@ class AnthropicConfig:
# makes headers for API call # makes headers for API call
def validate_environment(api_key): def validate_environment(api_key, user_headers):
if api_key is None: if api_key is None:
raise ValueError( raise ValueError(
"Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params" "Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params"
@ -89,6 +89,8 @@ def validate_environment(api_key):
"content-type": "application/json", "content-type": "application/json",
"x-api-key": api_key, "x-api-key": api_key,
} }
if user_headers is not None and isinstance(user_headers, dict):
headers = {**headers, **user_headers}
return headers return headers
@ -105,8 +107,9 @@ def completion(
optional_params=None, optional_params=None,
litellm_params=None, litellm_params=None,
logger_fn=None, logger_fn=None,
headers={},
): ):
headers = validate_environment(api_key) headers = validate_environment(api_key, headers)
if model in custom_prompt_dict: if model in custom_prompt_dict:
# check if the model has a registered custom prompt # check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model] model_prompt_details = custom_prompt_dict[model]
@ -139,7 +142,11 @@ def completion(
logging_obj.pre_call( logging_obj.pre_call(
input=prompt, input=prompt,
api_key=api_key, api_key=api_key,
additional_args={"complete_input_dict": data, "api_base": api_base}, additional_args={
"complete_input_dict": data,
"api_base": api_base,
"headers": headers,
},
) )
## COMPLETION CALL ## COMPLETION CALL

View file

@ -629,12 +629,23 @@ class AzureChatCompletion(BaseLLM):
client_session = litellm.aclient_session or httpx.AsyncClient( client_session = litellm.aclient_session or httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(),
) )
openai_aclient = AsyncAzureOpenAI( azure_client = AsyncAzureOpenAI(
http_client=client_session, **azure_client_params http_client=client_session, **azure_client_params
) )
else: else:
openai_aclient = client azure_client = client
response = await openai_aclient.images.generate(**data, timeout=timeout) ## LOGGING
logging_obj.pre_call(
input=data["prompt"],
api_key=azure_client.api_key,
additional_args={
"headers": {"api_key": azure_client.api_key},
"api_base": azure_client._base_url._uri_reference,
"acompletion": True,
"complete_input_dict": data,
},
)
response = await azure_client.images.generate(**data, timeout=timeout)
stringified_response = response.model_dump() stringified_response = response.model_dump()
## LOGGING ## LOGGING
logging_obj.post_call( logging_obj.post_call(
@ -719,7 +730,7 @@ class AzureChatCompletion(BaseLLM):
input=prompt, input=prompt,
api_key=azure_client.api_key, api_key=azure_client.api_key,
additional_args={ additional_args={
"headers": {"Authorization": f"Bearer {azure_client.api_key}"}, "headers": {"api_key": azure_client.api_key},
"api_base": azure_client._base_url._uri_reference, "api_base": azure_client._base_url._uri_reference,
"acompletion": False, "acompletion": False,
"complete_input_dict": data, "complete_input_dict": data,

View file

@ -659,9 +659,16 @@ def completion(
) )
## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here. ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
prompt_tokens = len(encoding.encode(prompt)) prompt_tokens = response_metadata.get(
completion_tokens = len( "x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
encoding.encode(model_response["choices"][0]["message"].get("content", "")) )
completion_tokens = response_metadata.get(
"x-amzn-bedrock-output-token-count",
len(
encoding.encode(
model_response["choices"][0]["message"].get("content", "")
)
),
) )
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
@ -672,6 +679,8 @@ def completion(
total_tokens=prompt_tokens + completion_tokens, total_tokens=prompt_tokens + completion_tokens,
) )
model_response.usage = usage model_response.usage = usage
model_response._hidden_params["region_name"] = client.meta.region_name
print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
return model_response return model_response
except BedrockError as e: except BedrockError as e:
exception_mapping_worked = True exception_mapping_worked = True
@ -693,6 +702,11 @@ def _embedding_func_single(
encoding=None, encoding=None,
logging_obj=None, logging_obj=None,
): ):
if isinstance(input, str) is False:
raise BedrockError(
message="Bedrock Embedding API input must be type str | List[str]",
status_code=400,
)
# logic for parsing in - calling - parsing out model embedding calls # logic for parsing in - calling - parsing out model embedding calls
## FORMAT EMBEDDING INPUT ## ## FORMAT EMBEDDING INPUT ##
provider = model.split(".")[0] provider = model.split(".")[0]
@ -786,7 +800,8 @@ def embedding(
aws_role_name=aws_role_name, aws_role_name=aws_role_name,
aws_session_name=aws_session_name, aws_session_name=aws_session_name,
) )
if type(input) == str: if isinstance(input, str):
## Embedding Call
embeddings = [ embeddings = [
_embedding_func_single( _embedding_func_single(
model, model,
@ -796,8 +811,8 @@ def embedding(
logging_obj=logging_obj, logging_obj=logging_obj,
) )
] ]
else: elif isinstance(input, list):
## Embedding Call ## Embedding Call - assuming this is a List[str]
embeddings = [ embeddings = [
_embedding_func_single( _embedding_func_single(
model, model,
@ -808,6 +823,12 @@ def embedding(
) )
for i in input for i in input
] # [TODO]: make these parallel calls ] # [TODO]: make these parallel calls
else:
# enters this branch if input = int, ex. input=2
raise BedrockError(
message="Bedrock Embedding API input must be type str | List[str]",
status_code=400,
)
## Populate OpenAI compliant dictionary ## Populate OpenAI compliant dictionary
embedding_response = [] embedding_response = []

View file

@ -43,7 +43,7 @@ class AsyncCustomHTTPTransport(httpx.AsyncHTTPTransport):
request=request, request=request,
) )
time.sleep(int(response.headers.get("retry-after")) or 10) await asyncio.sleep(int(response.headers.get("retry-after") or 10))
response = await super().handle_async_request(request) response = await super().handle_async_request(request)
await response.aread() await response.aread()
@ -95,7 +95,6 @@ class CustomHTTPTransport(httpx.HTTPTransport):
request.method = "GET" request.method = "GET"
response = super().handle_request(request) response = super().handle_request(request)
response.read() response.read()
timeout_secs: int = 120 timeout_secs: int = 120
start_time = time.time() start_time = time.time()
while response.json()["status"] not in ["succeeded", "failed"]: while response.json()["status"] not in ["succeeded", "failed"]:
@ -112,11 +111,9 @@ class CustomHTTPTransport(httpx.HTTPTransport):
content=json.dumps(timeout).encode("utf-8"), content=json.dumps(timeout).encode("utf-8"),
request=request, request=request,
) )
time.sleep(int(response.headers.get("retry-after", None) or 10))
time.sleep(int(response.headers.get("retry-after")) or 10)
response = super().handle_request(request) response = super().handle_request(request)
response.read() response.read()
if response.json()["status"] == "failed": if response.json()["status"] == "failed":
error_data = response.json() error_data = response.json()
return httpx.Response( return httpx.Response(

View file

@ -120,9 +120,7 @@ def completion(
## Load Config ## Load Config
inference_params = copy.deepcopy(optional_params) inference_params = copy.deepcopy(optional_params)
inference_params.pop( stream = inference_params.pop("stream", None)
"stream", None
) # palm does not support streaming, so we handle this by fake streaming in main.py
config = litellm.GeminiConfig.get_config() config = litellm.GeminiConfig.get_config()
for k, v in config.items(): for k, v in config.items():
if ( if (
@ -139,10 +137,18 @@ def completion(
## COMPLETION CALL ## COMPLETION CALL
try: try:
_model = genai.GenerativeModel(f"models/{model}") _model = genai.GenerativeModel(f"models/{model}")
response = _model.generate_content( if stream != True:
contents=prompt, response = _model.generate_content(
generation_config=genai.types.GenerationConfig(**inference_params), contents=prompt,
) generation_config=genai.types.GenerationConfig(**inference_params),
)
else:
response = _model.generate_content(
contents=prompt,
generation_config=genai.types.GenerationConfig(**inference_params),
stream=True,
)
return response
except Exception as e: except Exception as e:
raise GeminiError( raise GeminiError(
message=str(e), message=str(e),
@ -177,16 +183,20 @@ def completion(
try: try:
completion_response = model_response["choices"][0]["message"].get("content") completion_response = model_response["choices"][0]["message"].get("content")
if completion_response is None: if completion_response is None:
raise Exception raise Exception
except: except:
original_response = f"response: {response}" original_response = f"response: {response}"
if hasattr(response, "candidates"): if hasattr(response, "candidates"):
original_response = f"response: {response.candidates}" original_response = f"response: {response.candidates}"
if "SAFETY" in original_response: if "SAFETY" in original_response:
original_response += "\nThe candidate content was flagged for safety reasons." original_response += (
"\nThe candidate content was flagged for safety reasons."
)
elif "RECITATION" in original_response: elif "RECITATION" in original_response:
original_response += "\nThe candidate content was flagged for recitation reasons." original_response += (
"\nThe candidate content was flagged for recitation reasons."
)
raise GeminiError( raise GeminiError(
status_code=400, status_code=400,
message=f"No response received. Original response - {original_response}", message=f"No response received. Original response - {original_response}",

View file

@ -145,8 +145,8 @@ def get_ollama_response(
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in ): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v optional_params[k] = v
optional_params["stream"] = optional_params.get("stream", False) stream = optional_params.pop("stream", False)
data = {"model": model, "messages": messages, **optional_params} data = {"model": model, "messages": messages, "options": optional_params}
## LOGGING ## LOGGING
logging_obj.pre_call( logging_obj.pre_call(
input=None, input=None,
@ -159,7 +159,7 @@ def get_ollama_response(
}, },
) )
if acompletion is True: if acompletion is True:
if optional_params.get("stream", False) == True: if stream == True:
response = ollama_async_streaming( response = ollama_async_streaming(
url=url, url=url,
data=data, data=data,
@ -176,7 +176,7 @@ def get_ollama_response(
logging_obj=logging_obj, logging_obj=logging_obj,
) )
return response return response
elif optional_params.get("stream", False) == True: elif stream == True:
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj) return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
response = requests.post( response = requests.post(
@ -220,8 +220,10 @@ def get_ollama_response(
model_response["choices"][0]["message"] = response_json["message"] model_response["choices"][0]["message"] = response_json["message"]
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model model_response["model"] = "ollama/" + model
prompt_tokens = response_json["prompt_eval_count"] # type: ignore prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages)) # type: ignore
completion_tokens = response_json["eval_count"] completion_tokens = response_json.get(
"eval_count", litellm.token_counter(text=response_json["message"])
)
model_response["usage"] = litellm.Usage( model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens, prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens, completion_tokens=completion_tokens,
@ -320,8 +322,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
model_response["choices"][0]["message"] = response_json["message"] model_response["choices"][0]["message"] = response_json["message"]
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + data["model"] model_response["model"] = "ollama/" + data["model"]
prompt_tokens = response_json["prompt_eval_count"] # type: ignore prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore
completion_tokens = response_json["eval_count"] completion_tokens = response_json.get(
"eval_count", litellm.token_counter(text=response_json["message"])
)
model_response["usage"] = litellm.Usage( model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens, prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens, completion_tokens=completion_tokens,

View file

@ -1,5 +1,5 @@
from typing import Optional, Union, Any from typing import Optional, Union, Any
import types, time, json import types, time, json, traceback
import httpx import httpx
from .base import BaseLLM from .base import BaseLLM
from litellm.utils import ( from litellm.utils import (
@ -221,6 +221,7 @@ class OpenAIChatCompletion(BaseLLM):
headers: Optional[dict] = None, headers: Optional[dict] = None,
custom_prompt_dict: dict = {}, custom_prompt_dict: dict = {},
client=None, client=None,
organization: Optional[str] = None,
): ):
super().completion() super().completion()
exception_mapping_worked = False exception_mapping_worked = False
@ -254,6 +255,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout, timeout=timeout,
client=client, client=client,
max_retries=max_retries, max_retries=max_retries,
organization=organization,
) )
else: else:
return self.acompletion( return self.acompletion(
@ -266,6 +268,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout, timeout=timeout,
client=client, client=client,
max_retries=max_retries, max_retries=max_retries,
organization=organization,
) )
elif optional_params.get("stream", False): elif optional_params.get("stream", False):
return self.streaming( return self.streaming(
@ -278,6 +281,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout, timeout=timeout,
client=client, client=client,
max_retries=max_retries, max_retries=max_retries,
organization=organization,
) )
else: else:
if not isinstance(max_retries, int): if not isinstance(max_retries, int):
@ -291,6 +295,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.client_session, http_client=litellm.client_session,
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
organization=organization,
) )
else: else:
openai_client = client openai_client = client
@ -349,7 +354,7 @@ class OpenAIChatCompletion(BaseLLM):
if hasattr(e, "status_code"): if hasattr(e, "status_code"):
raise OpenAIError(status_code=e.status_code, message=str(e)) raise OpenAIError(status_code=e.status_code, message=str(e))
else: else:
raise OpenAIError(status_code=500, message=str(e)) raise OpenAIError(status_code=500, message=traceback.format_exc())
async def acompletion( async def acompletion(
self, self,
@ -358,6 +363,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout: float, timeout: float,
api_key: Optional[str] = None, api_key: Optional[str] = None,
api_base: Optional[str] = None, api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None, client=None,
max_retries=None, max_retries=None,
logging_obj=None, logging_obj=None,
@ -372,6 +378,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.aclient_session, http_client=litellm.aclient_session,
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
organization=organization,
) )
else: else:
openai_aclient = client openai_aclient = client
@ -412,6 +419,7 @@ class OpenAIChatCompletion(BaseLLM):
model: str, model: str,
api_key: Optional[str] = None, api_key: Optional[str] = None,
api_base: Optional[str] = None, api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None, client=None,
max_retries=None, max_retries=None,
headers=None, headers=None,
@ -423,6 +431,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.client_session, http_client=litellm.client_session,
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
organization=organization,
) )
else: else:
openai_client = client openai_client = client
@ -454,6 +463,7 @@ class OpenAIChatCompletion(BaseLLM):
model: str, model: str,
api_key: Optional[str] = None, api_key: Optional[str] = None,
api_base: Optional[str] = None, api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None, client=None,
max_retries=None, max_retries=None,
headers=None, headers=None,
@ -467,6 +477,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.aclient_session, http_client=litellm.aclient_session,
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
organization=organization,
) )
else: else:
openai_aclient = client openai_aclient = client
@ -706,19 +717,34 @@ class OpenAIChatCompletion(BaseLLM):
## COMPLETION CALL ## COMPLETION CALL
response = openai_client.images.generate(**data, timeout=timeout) # type: ignore response = openai_client.images.generate(**data, timeout=timeout) # type: ignore
response = response.model_dump() # type: ignore
## LOGGING ## LOGGING
logging_obj.post_call( logging_obj.post_call(
input=input, input=prompt,
api_key=api_key, api_key=api_key,
additional_args={"complete_input_dict": data}, additional_args={"complete_input_dict": data},
original_response=response, original_response=response,
) )
# return response # return response
return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="image_generation") # type: ignore return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation") # type: ignore
except OpenAIError as e: except OpenAIError as e:
exception_mapping_worked = True exception_mapping_worked = True
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=str(e),
)
raise e raise e
except Exception as e: except Exception as e:
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=str(e),
)
if hasattr(e, "status_code"): if hasattr(e, "status_code"):
raise OpenAIError(status_code=e.status_code, message=str(e)) raise OpenAIError(status_code=e.status_code, message=str(e))
else: else:
@ -733,8 +759,11 @@ class OpenAIChatCompletion(BaseLLM):
messages: Optional[list] = None, messages: Optional[list] = None,
input: Optional[list] = None, input: Optional[list] = None,
prompt: Optional[str] = None, prompt: Optional[str] = None,
organization: Optional[str] = None,
): ):
client = AsyncOpenAI(api_key=api_key, timeout=timeout) client = AsyncOpenAI(
api_key=api_key, timeout=timeout, organization=organization
)
if model is None and mode != "image_generation": if model is None and mode != "image_generation":
raise Exception("model is not set") raise Exception("model is not set")

View file

@ -99,12 +99,16 @@ def ollama_pt(
def mistral_instruct_pt(messages): def mistral_instruct_pt(messages):
# Following the Mistral example's https://huggingface.co/docs/transformers/main/chat_templating
prompt = custom_prompt( prompt = custom_prompt(
initial_prompt_value="<s>", initial_prompt_value="<s>",
role_dict={ role_dict={
"system": {"pre_message": "[INST]", "post_message": "[/INST]"}, "system": {
"user": {"pre_message": "[INST]", "post_message": "[/INST]"}, "pre_message": "[INST] \n",
"assistant": {"pre_message": "[INST]", "post_message": "[/INST]"}, "post_message": " [/INST]\n",
},
"user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
"assistant": {"pre_message": " ", "post_message": " "},
}, },
final_prompt_value="</s>", final_prompt_value="</s>",
messages=messages, messages=messages,
@ -372,6 +376,7 @@ def anthropic_pt(
You can "put words in Claude's mouth" by ending with an assistant message. You can "put words in Claude's mouth" by ending with an assistant message.
See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth
""" """
class AnthropicConstants(Enum): class AnthropicConstants(Enum):
HUMAN_PROMPT = "\n\nHuman: " HUMAN_PROMPT = "\n\nHuman: "
AI_PROMPT = "\n\nAssistant: " AI_PROMPT = "\n\nAssistant: "
@ -394,32 +399,35 @@ def anthropic_pt(
prompt += f"{AnthropicConstants.AI_PROMPT.value}" prompt += f"{AnthropicConstants.AI_PROMPT.value}"
return prompt return prompt
def _load_image_from_url(image_url): def _load_image_from_url(image_url):
try: try:
from PIL import Image from PIL import Image
except: except:
raise Exception("gemini image conversion failed please run `pip install Pillow`") raise Exception(
"gemini image conversion failed please run `pip install Pillow`"
)
from io import BytesIO from io import BytesIO
try: try:
# Send a GET request to the image URL # Send a GET request to the image URL
response = requests.get(image_url) response = requests.get(image_url)
response.raise_for_status() # Raise an exception for HTTP errors response.raise_for_status() # Raise an exception for HTTP errors
# Check the response's content type to ensure it is an image # Check the response's content type to ensure it is an image
content_type = response.headers.get('content-type') content_type = response.headers.get("content-type")
if not content_type or 'image' not in content_type: if not content_type or "image" not in content_type:
raise ValueError(f"URL does not point to a valid image (content-type: {content_type})") raise ValueError(
f"URL does not point to a valid image (content-type: {content_type})"
)
# Load the image from the response content # Load the image from the response content
return Image.open(BytesIO(response.content)) return Image.open(BytesIO(response.content))
except requests.RequestException as e: except requests.RequestException as e:
print(f"Request failed: {e}") raise Exception(f"Request failed: {e}")
except UnidentifiedImageError: except Exception as e:
print("Cannot identify image file (it may not be a supported image format or might be corrupted).") raise e
except ValueError as e:
print(e)
def _gemini_vision_convert_messages(messages: list): def _gemini_vision_convert_messages(messages: list):
@ -437,10 +445,11 @@ def _gemini_vision_convert_messages(messages: list):
try: try:
from PIL import Image from PIL import Image
except: except:
raise Exception("gemini image conversion failed please run `pip install Pillow`") raise Exception(
"gemini image conversion failed please run `pip install Pillow`"
)
try: try:
# given messages for gpt-4 vision, convert them for gemini # given messages for gpt-4 vision, convert them for gemini
# https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb # https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb
prompt = "" prompt = ""
@ -589,7 +598,7 @@ def prompt_factory(
if custom_llm_provider == "ollama": if custom_llm_provider == "ollama":
return ollama_pt(model=model, messages=messages) return ollama_pt(model=model, messages=messages)
elif custom_llm_provider == "anthropic": elif custom_llm_provider == "anthropic":
if any(_ in model for _ in ["claude-2.1","claude-v2:1"]): if any(_ in model for _ in ["claude-2.1", "claude-v2:1"]):
return claude_2_1_pt(messages=messages) return claude_2_1_pt(messages=messages)
else: else:
return anthropic_pt(messages=messages) return anthropic_pt(messages=messages)
@@ -25,6 +25,46 @@ class SagemakerError(Exception):
) # Call the base class constructor with the parameters it needs ) # Call the base class constructor with the parameters it needs
import io
import json
class TokenIterator:
def __init__(self, stream):
self.byte_iterator = iter(stream)
self.buffer = io.BytesIO()
self.read_pos = 0
self.end_of_data = False
def __iter__(self):
return self
def __next__(self):
try:
while True:
self.buffer.seek(self.read_pos)
line = self.buffer.readline()
if line and line[-1] == ord("\n"):
response_obj = {"text": "", "is_finished": False}
self.read_pos += len(line) + 1
full_line = line[:-1].decode("utf-8")
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
if line_data.get("generated_text", None) is not None:
self.end_of_data = True
response_obj["is_finished"] = True
response_obj["text"] = line_data["token"]["text"]
return response_obj
chunk = next(self.byte_iterator)
self.buffer.seek(0, io.SEEK_END)
self.buffer.write(chunk["PayloadPart"]["Bytes"])
except StopIteration as e:
if self.end_of_data == True:
raise e # Re-raise StopIteration
else:
self.end_of_data = True
return "data: [DONE]"
class SagemakerConfig: class SagemakerConfig:
""" """
Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb
@@ -121,7 +161,6 @@ def completion(
# pop streaming if it's in the optional params as 'stream' raises an error with sagemaker # pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
inference_params = deepcopy(optional_params) inference_params = deepcopy(optional_params)
inference_params.pop("stream", None)
## Load Config ## Load Config
config = litellm.SagemakerConfig.get_config() config = litellm.SagemakerConfig.get_config()
@@ -152,6 +191,28 @@ def completion(
hf_model_name or model hf_model_name or model
) # pass in hf model name for pulling it's prompt template - (e.g. `hf_model_name="meta-llama/Llama-2-7b-chat-hf` applies the llama2 chat template to the prompt) ) # pass in hf model name for pulling it's prompt template - (e.g. `hf_model_name="meta-llama/Llama-2-7b-chat-hf` applies the llama2 chat template to the prompt)
prompt = prompt_factory(model=hf_model_name, messages=messages) prompt = prompt_factory(model=hf_model_name, messages=messages)
stream = inference_params.pop("stream", None)
if stream == True:
data = json.dumps(
{"inputs": prompt, "parameters": inference_params, "stream": True}
).encode("utf-8")
## LOGGING
request_str = f"""
response = client.invoke_endpoint_with_response_stream(
EndpointName={model},
ContentType="application/json",
Body={data},
CustomAttributes="accept_eula=true",
)
""" # type: ignore
response = client.invoke_endpoint_with_response_stream(
EndpointName=model,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
return response["Body"]
data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode( data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode(
"utf-8" "utf-8"
@@ -237,8 +237,11 @@ def completion(
GenerationConfig, GenerationConfig,
) )
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types
import google.auth
vertexai.init(project=vertex_project, location=vertex_location) ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
creds, _ = google.auth.default(quota_project_id=vertex_project)
vertexai.init(project=vertex_project, location=vertex_location, credentials=creds)
## Load Config ## Load Config
config = litellm.VertexAIConfig.get_config() config = litellm.VertexAIConfig.get_config()
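The credential handling above roughly amounts to the following standalone pattern (project and location values are placeholders, not anything the commit sets):

import google.auth
import vertexai

creds, _ = google.auth.default(quota_project_id="my-gcp-project")  # placeholder project
vertexai.init(project="my-gcp-project", location="us-central1", credentials=creds)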
@@ -10,12 +10,11 @@
import os, openai, sys, json, inspect, uuid, datetime, threading import os, openai, sys, json, inspect, uuid, datetime, threading
from typing import Any, Literal, Union from typing import Any, Literal, Union
from functools import partial from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy from copy import deepcopy
import httpx import httpx
import litellm import litellm
from ._logging import verbose_logger
from litellm import ( # type: ignore from litellm import ( # type: ignore
client, client,
exception_type, exception_type,
@@ -83,6 +82,7 @@ from litellm.utils import (
TextCompletionResponse, TextCompletionResponse,
TextChoices, TextChoices,
EmbeddingResponse, EmbeddingResponse,
ImageResponse,
read_config_args, read_config_args,
Choices, Choices,
Message, Message,
@@ -273,14 +273,10 @@ async def acompletion(
else: else:
# Call the synchronous function using run_in_executor # Call the synchronous function using run_in_executor
response = await loop.run_in_executor(None, func_with_context) # type: ignore response = await loop.run_in_executor(None, func_with_context) # type: ignore
# if kwargs.get("stream", False): # return an async generator if isinstance(response, CustomStreamWrapper):
# return _async_streaming( response.set_logging_event_loop(
# response=response, loop=loop
# model=model, ) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
# custom_llm_provider=custom_llm_provider,
# args=args,
# )
# else:
return response return response
except Exception as e: except Exception as e:
custom_llm_provider = custom_llm_provider or "openai" custom_llm_provider = custom_llm_provider or "openai"
@@ -343,6 +339,18 @@ def mock_completion(
model_response["choices"][0]["message"]["content"] = mock_response model_response["choices"][0]["message"]["content"] = mock_response
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
model_response["model"] = model model_response["model"] = model
model_response.usage = Usage(
prompt_tokens=10, completion_tokens=20, total_tokens=30
)
try:
_, custom_llm_provider, _, _ = litellm.utils.get_llm_provider(model=model)
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
except:
# don't let setting a hidden param block a mock_response
pass
return model_response return model_response
except: except:
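A quick sketch of what the mock path now returns to callers; the token counts follow the hard-coded Usage above, and the provider lookup uses get_llm_provider:

import litellm

resp = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "ping"}],
    mock_response="pong",
)
print(resp.choices[0].message.content)  # "pong"
print(resp.usage.total_tokens)  # 30, from the hard-coded Usage
print(resp._hidden_params.get("custom_llm_provider"))  # "openai"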
@@ -442,9 +450,12 @@ def completion(
num_retries = kwargs.get("num_retries", None) ## deprecated num_retries = kwargs.get("num_retries", None) ## deprecated
max_retries = kwargs.get("max_retries", None) max_retries = kwargs.get("max_retries", None)
context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None) context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None)
organization = kwargs.get("organization", None)
### CUSTOM MODEL COST ### ### CUSTOM MODEL COST ###
input_cost_per_token = kwargs.get("input_cost_per_token", None) input_cost_per_token = kwargs.get("input_cost_per_token", None)
output_cost_per_token = kwargs.get("output_cost_per_token", None) output_cost_per_token = kwargs.get("output_cost_per_token", None)
input_cost_per_second = kwargs.get("input_cost_per_second", None)
output_cost_per_second = kwargs.get("output_cost_per_second", None)
### CUSTOM PROMPT TEMPLATE ### ### CUSTOM PROMPT TEMPLATE ###
initial_prompt_value = kwargs.get("initial_prompt_value", None) initial_prompt_value = kwargs.get("initial_prompt_value", None)
roles = kwargs.get("roles", None) roles = kwargs.get("roles", None)
@@ -522,6 +533,8 @@ def completion(
"tpm", "tpm",
"input_cost_per_token", "input_cost_per_token",
"output_cost_per_token", "output_cost_per_token",
"input_cost_per_second",
"output_cost_per_second",
"hf_model_name", "hf_model_name",
"model_info", "model_info",
"proxy_server_request", "proxy_server_request",
@@ -534,10 +547,6 @@ def completion(
non_default_params = { non_default_params = {
k: v for k, v in kwargs.items() if k not in default_params k: v for k, v in kwargs.items() if k not in default_params
} # model-specific params - pass them straight to the model/provider } # model-specific params - pass them straight to the model/provider
if mock_response:
return mock_completion(
model, messages, stream=stream, mock_response=mock_response
)
if timeout is None: if timeout is None:
timeout = ( timeout = (
kwargs.get("request_timeout", None) or 600 kwargs.get("request_timeout", None) or 600
@@ -577,15 +586,43 @@ def completion(
) )
if model_response is not None and hasattr(model_response, "_hidden_params"): if model_response is not None and hasattr(model_response, "_hidden_params"):
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
model_response._hidden_params["region_name"] = kwargs.get(
"aws_region_name", None
) # support region-based pricing for bedrock
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ### ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
if input_cost_per_token is not None and output_cost_per_token is not None: if input_cost_per_token is not None and output_cost_per_token is not None:
print_verbose(f"Registering model={model} in model cost map")
litellm.register_model( litellm.register_model(
{ {
f"{custom_llm_provider}/{model}": {
"input_cost_per_token": input_cost_per_token,
"output_cost_per_token": output_cost_per_token,
"litellm_provider": custom_llm_provider,
},
model: { model: {
"input_cost_per_token": input_cost_per_token, "input_cost_per_token": input_cost_per_token,
"output_cost_per_token": output_cost_per_token, "output_cost_per_token": output_cost_per_token,
"litellm_provider": custom_llm_provider, "litellm_provider": custom_llm_provider,
} },
}
)
elif (
input_cost_per_second is not None
): # time based pricing just needs cost in place
output_cost_per_second = output_cost_per_second or 0.0
litellm.register_model(
{
f"{custom_llm_provider}/{model}": {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
},
model: {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
},
} }
) )
### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ### ### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
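Time-based pricing registration can then be exercised from the caller side along these lines (the endpoint name and per-second rates are made up; the kwargs are the new input_cost_per_second / output_cost_per_second introduced above):

import litellm

response = litellm.completion(
    model="sagemaker/my-llama2-endpoint",  # placeholder endpoint billed by runtime
    messages=[{"role": "user", "content": "Hello"}],
    input_cost_per_second=0.000420,
    output_cost_per_second=0.000420,
)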
@@ -674,6 +711,10 @@ def completion(
optional_params=optional_params, optional_params=optional_params,
litellm_params=litellm_params, litellm_params=litellm_params,
) )
if mock_response:
return mock_completion(
model, messages, stream=stream, mock_response=mock_response
)
if custom_llm_provider == "azure": if custom_llm_provider == "azure":
# azure configs # azure configs
api_type = get_secret("AZURE_API_TYPE") or "azure" api_type = get_secret("AZURE_API_TYPE") or "azure"
@@ -692,9 +733,9 @@ def completion(
or get_secret("AZURE_API_KEY") or get_secret("AZURE_API_KEY")
) )
azure_ad_token = optional_params.pop("azure_ad_token", None) or get_secret( azure_ad_token = optional_params.get("extra_body", {}).pop(
"AZURE_AD_TOKEN" "azure_ad_token", None
) ) or get_secret("AZURE_AD_TOKEN")
headers = headers or litellm.headers headers = headers or litellm.headers
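Assuming the caller (typically the proxy) forwards the token inside extra_body, the new lookup above is hit by a call shaped roughly like this; the deployment name and token value are placeholders:

import litellm

response = litellm.completion(
    model="azure/my-gpt-35-deployment",  # placeholder deployment
    messages=[{"role": "user", "content": "Hello"}],
    extra_body={"azure_ad_token": "eyJ0eXAiOi..."},  # placeholder AAD token
)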
@@ -758,7 +799,8 @@ def completion(
or "https://api.openai.com/v1" or "https://api.openai.com/v1"
) )
openai.organization = ( openai.organization = (
litellm.organization organization
or litellm.organization
or get_secret("OPENAI_ORGANIZATION") or get_secret("OPENAI_ORGANIZATION")
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105 or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
) )
@@ -798,6 +840,7 @@ def completion(
timeout=timeout, timeout=timeout,
custom_prompt_dict=custom_prompt_dict, custom_prompt_dict=custom_prompt_dict,
client=client, # pass AsyncOpenAI, OpenAI client client=client, # pass AsyncOpenAI, OpenAI client
organization=organization,
) )
except Exception as e: except Exception as e:
## LOGGING - log the original exception returned ## LOGGING - log the original exception returned
@@ -967,6 +1010,7 @@ def completion(
encoding=encoding, # for calculating input/output tokens encoding=encoding, # for calculating input/output tokens
api_key=api_key, api_key=api_key,
logging_obj=logging, logging_obj=logging,
headers=headers,
) )
if "stream" in optional_params and optional_params["stream"] == True: if "stream" in optional_params and optional_params["stream"] == True:
# don't try to access stream object, # don't try to access stream object,
@@ -1376,11 +1420,29 @@ def completion(
acompletion=acompletion, acompletion=acompletion,
custom_prompt_dict=custom_prompt_dict, custom_prompt_dict=custom_prompt_dict,
) )
if (
"stream" in optional_params
and optional_params["stream"] == True
and acompletion == False
):
response = CustomStreamWrapper(
iter(model_response),
model,
custom_llm_provider="gemini",
logging_obj=logging,
)
return response
response = model_response response = model_response
elif custom_llm_provider == "vertex_ai": elif custom_llm_provider == "vertex_ai":
vertex_ai_project = litellm.vertex_project or get_secret("VERTEXAI_PROJECT") vertex_ai_project = (
vertex_ai_location = litellm.vertex_location or get_secret( optional_params.pop("vertex_ai_project", None)
"VERTEXAI_LOCATION" or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.pop("vertex_ai_location", None)
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
) )
model_response = vertex_ai.completion( model_response = vertex_ai.completion(
@@ -1471,19 +1533,22 @@ def completion(
if ( if (
"stream" in optional_params and optional_params["stream"] == True "stream" in optional_params and optional_params["stream"] == True
): ## [BETA] ): ## [BETA]
# sagemaker does not support streaming as of now so we're faking streaming:
# https://discuss.huggingface.co/t/streaming-output-text-when-deploying-on-sagemaker/39611
# "SageMaker is currently not supporting streaming responses."
# fake streaming for sagemaker
print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER") print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
resp_string = model_response["choices"][0]["message"]["content"] from .llms.sagemaker import TokenIterator
tokenIterator = TokenIterator(model_response)
response = CustomStreamWrapper( response = CustomStreamWrapper(
resp_string, completion_stream=tokenIterator,
model, model=model,
custom_llm_provider="sagemaker", custom_llm_provider="sagemaker",
logging_obj=logging, logging_obj=logging,
) )
## LOGGING
logging.post_call(
input=messages,
api_key=None,
original_response=response,
)
return response return response
## RESPONSE OBJECT ## RESPONSE OBJECT
@@ -2176,6 +2241,7 @@ def embedding(
model, model,
input=[], input=[],
# Optional params # Optional params
dimensions: Optional[int] = None,
timeout=600, # default to 10 minutes timeout=600, # default to 10 minutes
# set api_base, api_version, api_key # set api_base, api_version, api_key
api_base: Optional[str] = None, api_base: Optional[str] = None,
@@ -2196,6 +2262,7 @@ def embedding(
Parameters: Parameters:
- model: The embedding model to use. - model: The embedding model to use.
- input: The input for which embeddings are to be generated. - input: The input for which embeddings are to be generated.
- dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
- timeout: The timeout value for the API call, default 10 mins - timeout: The timeout value for the API call, default 10 mins
- litellm_call_id: The call ID for litellm logging. - litellm_call_id: The call ID for litellm logging.
- litellm_logging_obj: The litellm logging object. - litellm_logging_obj: The litellm logging object.
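For example, the new dimensions parameter can be passed straight through for the text-embedding-3 family (the model choice here is just an example):

import litellm

resp = litellm.embedding(
    model="text-embedding-3-small",
    input=["hello world"],
    dimensions=256,
)
print(len(resp.data[0]["embedding"]))  # 256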
@@ -2222,8 +2289,14 @@ def embedding(
encoding_format = kwargs.get("encoding_format", None) encoding_format = kwargs.get("encoding_format", None)
proxy_server_request = kwargs.get("proxy_server_request", None) proxy_server_request = kwargs.get("proxy_server_request", None)
aembedding = kwargs.get("aembedding", None) aembedding = kwargs.get("aembedding", None)
### CUSTOM MODEL COST ###
input_cost_per_token = kwargs.get("input_cost_per_token", None)
output_cost_per_token = kwargs.get("output_cost_per_token", None)
input_cost_per_second = kwargs.get("input_cost_per_second", None)
output_cost_per_second = kwargs.get("output_cost_per_second", None)
openai_params = [ openai_params = [
"user", "user",
"dimensions",
"request_timeout", "request_timeout",
"api_base", "api_base",
"api_version", "api_version",
@@ -2270,6 +2343,8 @@ def embedding(
"tpm", "tpm",
"input_cost_per_token", "input_cost_per_token",
"output_cost_per_token", "output_cost_per_token",
"input_cost_per_second",
"output_cost_per_second",
"hf_model_name", "hf_model_name",
"proxy_server_request", "proxy_server_request",
"model_info", "model_info",
@@ -2290,11 +2365,35 @@ def embedding(
api_key=api_key, api_key=api_key,
) )
optional_params = get_optional_params_embeddings( optional_params = get_optional_params_embeddings(
model=model,
user=user, user=user,
dimensions=dimensions,
encoding_format=encoding_format, encoding_format=encoding_format,
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
**non_default_params, **non_default_params,
) )
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
if input_cost_per_token is not None and output_cost_per_token is not None:
litellm.register_model(
{
model: {
"input_cost_per_token": input_cost_per_token,
"output_cost_per_token": output_cost_per_token,
"litellm_provider": custom_llm_provider,
}
}
)
if input_cost_per_second is not None: # time based pricing just needs cost in place
output_cost_per_second = output_cost_per_second or 0.0
litellm.register_model(
{
model: {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
}
}
)
try: try:
response = None response = None
logging = litellm_logging_obj logging = litellm_logging_obj
@@ -2916,6 +3015,7 @@ def image_generation(
else: else:
model = "dall-e-2" model = "dall-e-2"
custom_llm_provider = "openai" # default to dall-e-2 on openai custom_llm_provider = "openai" # default to dall-e-2 on openai
model_response._hidden_params["model"] = model
openai_params = [ openai_params = [
"user", "user",
"request_timeout", "request_timeout",
@@ -2989,7 +3089,7 @@ def image_generation(
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
**non_default_params, **non_default_params,
) )
logging = litellm_logging_obj logging: Logging = litellm_logging_obj
logging.update_environment_variables( logging.update_environment_variables(
model=model, model=model,
user=user, user=user,
@@ -3089,6 +3189,9 @@ async def ahealth_check(
if model is None: if model is None:
raise Exception("model not set") raise Exception("model not set")
if model in litellm.model_cost and mode is None:
mode = litellm.model_cost[model]["mode"]
model, custom_llm_provider, _, _ = get_llm_provider(model=model) model, custom_llm_provider, _, _ = get_llm_provider(model=model)
mode = mode or "chat" # default to chat completion calls mode = mode or "chat" # default to chat completion calls
@@ -3135,6 +3238,7 @@ async def ahealth_check(
or custom_llm_provider == "text-completion-openai" or custom_llm_provider == "text-completion-openai"
): ):
api_key = model_params.get("api_key") or get_secret("OPENAI_API_KEY") api_key = model_params.get("api_key") or get_secret("OPENAI_API_KEY")
organization = model_params.get("organization")
timeout = ( timeout = (
model_params.get("timeout") model_params.get("timeout")
@@ -3152,6 +3256,7 @@ async def ahealth_check(
mode=mode, mode=mode,
prompt=prompt, prompt=prompt,
input=input, input=input,
organization=organization,
) )
else: else:
if mode == "embedding": if mode == "embedding":
@@ -3176,6 +3281,7 @@ async def ahealth_check(
## Set verbose to true -> ```litellm.set_verbose = True``` ## Set verbose to true -> ```litellm.set_verbose = True```
def print_verbose(print_statement): def print_verbose(print_statement):
try: try:
verbose_logger.debug(print_statement)
if litellm.set_verbose: if litellm.set_verbose:
print(print_statement) # noqa print(print_statement) # noqa
except: except:
@@ -3263,8 +3369,20 @@ def stream_chunk_builder_text_completion(chunks: list, messages: Optional[List]
return response return response
def stream_chunk_builder(chunks: list, messages: Optional[list] = None): def stream_chunk_builder(
chunks: list, messages: Optional[list] = None, start_time=None, end_time=None
):
model_response = litellm.ModelResponse() model_response = litellm.ModelResponse()
### SORT CHUNKS BASED ON CREATED ORDER ##
print_verbose("Goes into checking if chunk has hiddden created at param")
if chunks[0]._hidden_params.get("created_at", None):
print_verbose("Chunks have a created at hidden param")
# Sort chunks based on created_at in ascending order
chunks = sorted(
chunks, key=lambda x: x._hidden_params.get("created_at", float("inf"))
)
print_verbose("Chunks sorted")
# set hidden params from chunk to model_response # set hidden params from chunk to model_response
if model_response is not None and hasattr(model_response, "_hidden_params"): if model_response is not None and hasattr(model_response, "_hidden_params"):
model_response._hidden_params = chunks[0].get("_hidden_params", {}) model_response._hidden_params = chunks[0].get("_hidden_params", {})
@@ -3438,5 +3556,8 @@ def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"] response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
) )
return convert_to_model_response_object( return convert_to_model_response_object(
response_object=response, model_response_object=model_response response_object=response,
model_response_object=model_response,
start_time=start_time,
end_time=end_time,
) )
@@ -0,0 +1 @@
self.__BUILD_MANIFEST={__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/_error":["static/chunks/pages/_error-9a890acb1e81c3fc.js"],sortedPages:["/_app","/_error"]},self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
@@ -0,0 +1 @@
self.__SSG_MANIFEST=new Set([]);self.__SSG_MANIFEST_CB&&self.__SSG_MANIFEST_CB()
@@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[165],{3155:function(e,t,n){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_not-found",function(){return n(4032)}])},4032:function(e,t,n){"use strict";Object.defineProperty(t,"__esModule",{value:!0}),Object.defineProperty(t,"default",{enumerable:!0,get:function(){return i}}),n(6921);let o=n(3827);n(4090);let r={error:{fontFamily:'system-ui,"Segoe UI",Roboto,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji"',height:"100vh",textAlign:"center",display:"flex",flexDirection:"column",alignItems:"center",justifyContent:"center"},desc:{display:"inline-block"},h1:{display:"inline-block",margin:"0 20px 0 0",padding:"0 23px 0 0",fontSize:24,fontWeight:500,verticalAlign:"top",lineHeight:"49px"},h2:{fontSize:14,fontWeight:400,lineHeight:"49px",margin:0}};function i(){return(0,o.jsxs)(o.Fragment,{children:[(0,o.jsx)("title",{children:"404: This page could not be found."}),(0,o.jsx)("div",{style:r.error,children:(0,o.jsxs)("div",{children:[(0,o.jsx)("style",{dangerouslySetInnerHTML:{__html:"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}),(0,o.jsx)("h1",{className:"next-error-h1",style:r.h1,children:"404"}),(0,o.jsx)("div",{style:r.desc,children:(0,o.jsx)("h2",{style:r.h2,children:"This page could not be found."})})]})})]})}("function"==typeof t.default||"object"==typeof t.default&&null!==t.default)&&void 0===t.default.__esModule&&(Object.defineProperty(t.default,"__esModule",{value:!0}),Object.assign(t.default,t),e.exports=t.default)}},function(e){e.O(0,[971,69,744],function(){return e(e.s=3155)}),_N_E=e.O()}]);
@@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{7421:function(n,e,t){Promise.resolve().then(t.t.bind(t,9646,23)),Promise.resolve().then(t.t.bind(t,3385,23))},3385:function(){},9646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=7421)}),_N_E=n.O()}]);
@@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{2028:function(e,n,t){Promise.resolve().then(t.t.bind(t,7690,23)),Promise.resolve().then(t.t.bind(t,8955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,1902,23)),Promise.resolve().then(t.t.bind(t,1778,23)),Promise.resolve().then(t.t.bind(t,7831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(5317),n(2028)}),_N_E=e.O()}]);
@@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[888],{1597:function(n,_,u){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_app",function(){return u(7174)}])}},function(n){var _=function(_){return n(n.s=_)};n.O(0,[774,179],function(){return _(1597),_(4546)}),_N_E=n.O()}]);
@@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[820],{1981:function(n,_,u){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_error",function(){return u(5103)}])}},function(n){n.O(0,[888,774,179],function(){return n(n.s=1981)}),_N_E=n.O()}]);
@@ -0,0 +1 @@
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function s(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={exports:{}},r=!0;try{a[e](n,n.exports,s),r=!1}finally{r&&delete l[e]}return n.exports}s.m=a,e=[],s.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(s.O).every(function(e){return s.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},s.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return s.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},s.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);s.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},s.d(o,u),o},s.d=function(e,t){for(var n in t)s.o(t,n)&&!s.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},s.f={},s.e=function(e){return Promise.all(Object.keys(s.f).reduce(function(t,n){return s.f[n](e,t),t},[]))},s.u=function(e){},s.miniCssF=function(e){return"static/css/7384ba6288e79f81.css"},s.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),s.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",s.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,s.nc&&i.setAttribute("nonce",s.nc),i.setAttribute("data-webpack",o+n),i.src=s.tu(e)),r[e]=[t];var d=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(d.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=d.bind(null,i.onerror),i.onload=d.bind(null,i.onload),c&&document.head.appendChild(i)},s.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},s.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},s.tu=function(e){return s.tt().createScriptURL(e)},s.p="/ui/_next/",i={272:0},s.f.j=function(e,t){var n=s.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=s.p+s.u(e),u=Error();s.l(o,function(t){if(s.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},s.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)s.o(u,n)&&(s.m[n]=u[n]);if(c)var 
a=c(s)}for(e&&e(t);f<o.length;f++)r=o[f],s.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return s.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
@@ -0,0 +1,8 @@
2:"$Sreact.suspense"
3:I[5250,["291","static/chunks/291-b42f47441ebb3671.js","931","static/chunks/app/page-b376373c879283de.js"],""]
4:I[7476,["291","static/chunks/291-b42f47441ebb3671.js","931","static/chunks/app/page-b376373c879283de.js"],""]
5:I[5613,[],""]
6:I[1778,[],""]
0:["FyY3KX_tIybXFPWL2RGlt",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$2",null,{"fallback":["$","div",null,{"children":"Loading..."}],"children":["$","div",null,{"className":"flex min-h-screen flex-col items-center","children":[["$","nav",null,{"className":"left-0 right-0 top-0 flex justify-between items-center h-12","children":[["$","div",null,{"className":"text-left mx-4 my-2 absolute top-0 left-0","children":["$","div",null,{"className":"flex flex-col items-center","children":["$","$L3",null,{"href":"/","children":["$","button",null,{"className":"text-gray-800 text-2xl px-4 py-1 rounded text-center","children":"🚅 LiteLLM"}]}]}]}],["$","div",null,{"className":"text-right mx-4 my-2 absolute top-0 right-0","children":[["$","a",null,{"href":"https://docs.litellm.ai/docs/","target":"_blank","rel":"noopener noreferrer","children":["$","button",null,{"className":"border border-gray-800 rounded-lg text-gray-800 text-xl px-4 py-1 rounded p-1 mr-2 text-center","children":"Docs"}]}],["$","a",null,{"href":"https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version","target":"_blank","rel":"noopener noreferrer","children":["$","button",null,{"className":"border border-gray-800 rounded-lg text-gray-800 text-xl px-4 py-1 rounded p-1 text-center","children":"Schedule Demo"}]}]]}]]}],["$","$L4",null,{}]]}]}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L5",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L6",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/7384ba6288e79f81.css","precedence":"next","crossOrigin":""}]],"$L7"]]]]
7:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"Create Next App"}],["$","meta","3",{"name":"description","content":"Generated by create next app"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 394 80"><path fill="#000" d="M262 0h68.5v12.7h-27.2v66.6h-13.6V12.7H262V0ZM149 0v12.7H94v20.4h44.3v12.6H94v21h55v12.6H80.5V0h68.7zm34.3 0h-17.8l63.8 79.4h17.9l-32-39.7 32-39.6h-17.9l-23 28.6-23-28.6zm18.3 56.7-9-11-27.1 33.7h17.8l18.3-22.7z"/><path fill="#000" d="M81 79.3 17 0H0v79.3h13.6V17l50.2 62.3H81Zm252.6-.4c-1 0-1.8-.4-2.5-1s-1.1-1.6-1.1-2.6.3-1.8 1-2.5 1.6-1 2.6-1 1.8.3 2.5 1a3.4 3.4 0 0 1 .6 4.3 3.7 3.7 0 0 1-3 1.8zm23.2-33.5h6v23.3c0 2.1-.4 4-1.3 5.5a9.1 9.1 0 0 1-3.8 3.5c-1.6.8-3.5 1.3-5.7 1.3-2 0-3.7-.4-5.3-1s-2.8-1.8-3.7-3.2c-.9-1.3-1.4-3-1.4-5h6c.1.8.3 1.6.7 2.2s1 1.2 1.6 1.5c.7.4 1.5.5 2.4.5 1 0 1.8-.2 2.4-.6a4 4 0 0 0 1.6-1.8c.3-.8.5-1.8.5-3V45.5zm30.9 9.1a4.4 4.4 0 0 0-2-3.3 7.5 7.5 0 0 0-4.3-1.1c-1.3 0-2.4.2-3.3.5-.9.4-1.6 1-2 1.6a3.5 3.5 0 0 0-.3 4c.3.5.7.9 1.3 1.2l1.8 1 2 .5 3.2.8c1.3.3 2.5.7 3.7 1.2a13 13 0 0 1 3.2 1.8 8.1 8.1 0 0 1 3 6.5c0 2-.5 3.7-1.5 5.1a10 10 0 0 1-4.4 3.5c-1.8.8-4.1 1.2-6.8 1.2-2.6 0-4.9-.4-6.8-1.2-2-.8-3.4-2-4.5-3.5a10 10 0 0 1-1.7-5.6h6a5 5 0 0 0 3.5 4.6c1 .4 2.2.6 3.4.6 1.3 0 2.5-.2 3.5-.6 1-.4 1.8-1 2.4-1.7a4 4 0 0 0 .8-2.4c0-.9-.2-1.6-.7-2.2a11 11 0 0 0-2.1-1.4l-3.2-1-3.8-1c-2.8-.7-5-1.7-6.6-3.2a7.2 7.2 0 0 1-2.4-5.7 8 8 0 0 1 1.7-5 10 10 0 0 1 4.3-3.5c2-.8 4-1.2 6.4-1.2 2.3 0 4.4.4 6.2 1.2 1.8.8 3.2 2 4.3 3.4 1 1.4 1.5 3 1.5 5h-5.8z"/></svg>
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 283 64"><path fill="black" d="M141 16c-11 0-19 7-19 18s9 18 20 18c7 0 13-3 16-7l-7-5c-2 3-6 4-9 4-5 0-9-3-10-7h28v-3c0-11-8-18-19-18zm-9 15c1-4 4-7 9-7s8 3 9 7h-18zm117-15c-11 0-19 7-19 18s9 18 20 18c6 0 12-3 16-7l-8-5c-2 3-5 4-8 4-5 0-9-3-11-7h28l1-3c0-11-8-18-19-18zm-10 15c2-4 5-7 10-7s8 3 9 7h-19zm-39 3c0 6 4 10 10 10 4 0 7-2 9-5l8 5c-3 5-9 8-17 8-11 0-19-7-19-18s8-18 19-18c8 0 14 3 17 8l-8 5c-2-3-5-5-9-5-6 0-10 4-10 10zm83-29v46h-9V5h9zM37 0l37 64H0L37 0zm92 5-27 48L74 5h10l18 30 17-30h10zm59 12v10l-3-1c-6 0-10 4-10 10v15h-9V17h9v9c0-5 6-9 13-9z"/></svg>
@@ -1,8 +1,17 @@
from pydantic import BaseModel, Extra, Field, root_validator from pydantic import BaseModel, Extra, Field, root_validator, Json
import enum import enum
from typing import Optional, List, Union, Dict, Literal from typing import Optional, List, Union, Dict, Literal, Any
from datetime import datetime from datetime import datetime
import uuid, json import uuid, json, sys, os
def hash_token(token: str):
import hashlib
# Hash the string using SHA-256
hashed_token = hashlib.sha256(token.encode()).hexdigest()
return hashed_token
class LiteLLMBase(BaseModel): class LiteLLMBase(BaseModel):
@@ -13,7 +22,7 @@ class LiteLLMBase(BaseModel):
def json(self, **kwargs): def json(self, **kwargs):
try: try:
return self.model_dump() # noqa return self.model_dump() # noqa
except: except Exception as e:
# if using pydantic v1 # if using pydantic v1
return self.dict() return self.dict()
@@ -122,53 +131,64 @@ class ModelParams(LiteLLMBase):
return values return values
class GenerateKeyRequest(LiteLLMBase): class GenerateRequestBase(LiteLLMBase):
duration: Optional[str] = "1h" """
Overlapping schema between key and user generate/update requests
"""
models: Optional[list] = [] models: Optional[list] = []
spend: Optional[float] = 0
max_budget: Optional[float] = None
user_id: Optional[str] = None
team_id: Optional[str] = None
max_parallel_requests: Optional[int] = None
metadata: Optional[dict] = {}
tpm_limit: Optional[int] = None
rpm_limit: Optional[int] = None
budget_duration: Optional[str] = None
allowed_cache_controls: Optional[list] = []
class GenerateKeyRequest(GenerateRequestBase):
key_alias: Optional[str] = None
duration: Optional[str] = None
aliases: Optional[dict] = {} aliases: Optional[dict] = {}
config: Optional[dict] = {} config: Optional[dict] = {}
spend: Optional[float] = 0
user_id: Optional[str] = None
max_parallel_requests: Optional[int] = None
metadata: Optional[dict] = {}
class UpdateKeyRequest(LiteLLMBase): class GenerateKeyResponse(GenerateKeyRequest):
key: str
duration: Optional[str] = None
models: Optional[list] = None
aliases: Optional[dict] = None
config: Optional[dict] = None
spend: Optional[float] = None
user_id: Optional[str] = None
max_parallel_requests: Optional[int] = None
metadata: Optional[dict] = {}
class UserAPIKeyAuth(LiteLLMBase): # the expected response object for user api key auth
"""
Return the row in the db
"""
api_key: Optional[str] = None
models: list = []
aliases: dict = {}
config: dict = {}
spend: Optional[float] = 0
user_id: Optional[str] = None
max_parallel_requests: Optional[int] = None
duration: str = "1h"
metadata: dict = {}
class GenerateKeyResponse(LiteLLMBase):
key: str key: str
key_name: Optional[str] = None
expires: Optional[datetime] expires: Optional[datetime]
user_id: str user_id: str
@root_validator(pre=True)
def set_model_info(cls, values):
if values.get("token") is not None:
values.update({"key": values.get("token")})
dict_fields = ["metadata", "aliases", "config"]
for field in dict_fields:
value = values.get(field)
if value is not None and isinstance(value, str):
try:
values[field] = json.loads(value)
except json.JSONDecodeError:
raise ValueError(f"Field {field} should be a valid dictionary")
return values
class UpdateKeyRequest(GenerateKeyRequest):
# Note: the defaults of all Params here MUST BE NONE
# else they will get overwritten
key: str
duration: Optional[str] = None
spend: Optional[float] = None
metadata: Optional[dict] = None
class DeleteKeyRequest(LiteLLMBase): class DeleteKeyRequest(LiteLLMBase):
keys: List[str] keys: List
class NewUserRequest(GenerateKeyRequest): class NewUserRequest(GenerateKeyRequest):
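As a sketch, a key-generation payload using the new per-key limit fields on the schema above could look like this (all values are illustrative):

req = GenerateKeyRequest(
    key_alias="demo-key",
    duration="30d",
    models=["gpt-3.5-turbo"],
    max_budget=10.0,
    budget_duration="1d",
    tpm_limit=1000,
    rpm_limit=60,
    allowed_cache_controls=["ttl"],
)
print(req.json())  # returns a dict: model_dump() on pydantic v2, dict() on v1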
@@ -179,6 +199,14 @@ class NewUserResponse(GenerateKeyResponse):
max_budget: Optional[float] = None max_budget: Optional[float] = None
class UpdateUserRequest(GenerateRequestBase):
# Note: the defaults of all Params here MUST BE NONE
# else they will get overwritten
user_id: str
spend: Optional[float] = None
metadata: Optional[dict] = None
class KeyManagementSystem(enum.Enum): class KeyManagementSystem(enum.Enum):
GOOGLE_KMS = "google_kms" GOOGLE_KMS = "google_kms"
AZURE_KEY_VAULT = "azure_key_vault" AZURE_KEY_VAULT = "azure_key_vault"
@@ -194,6 +222,7 @@ class DynamoDBArgs(LiteLLMBase):
user_table_name: str = "LiteLLM_UserTable" user_table_name: str = "LiteLLM_UserTable"
key_table_name: str = "LiteLLM_VerificationToken" key_table_name: str = "LiteLLM_VerificationToken"
config_table_name: str = "LiteLLM_Config" config_table_name: str = "LiteLLM_Config"
spend_table_name: str = "LiteLLM_SpendLogs"
class ConfigGeneralSettings(LiteLLMBase): class ConfigGeneralSettings(LiteLLMBase):
@@ -282,15 +311,39 @@ class ConfigYAML(LiteLLMBase):
class LiteLLM_VerificationToken(LiteLLMBase): class LiteLLM_VerificationToken(LiteLLMBase):
token: str token: Optional[str] = None
key_name: Optional[str] = None
key_alias: Optional[str] = None
spend: float = 0.0 spend: float = 0.0
expires: Union[str, None] max_budget: Optional[float] = None
models: List[str] expires: Optional[str] = None
aliases: Dict[str, str] = {} models: List = []
config: Dict[str, str] = {} aliases: Dict = {}
user_id: Union[str, None] config: Dict = {}
max_parallel_requests: Union[int, None] user_id: Optional[str] = None
metadata: Dict[str, str] = {} max_parallel_requests: Optional[int] = None
metadata: Dict = {}
tpm_limit: Optional[int] = None
rpm_limit: Optional[int] = None
budget_duration: Optional[str] = None
budget_reset_at: Optional[datetime] = None
allowed_cache_controls: Optional[list] = []
class UserAPIKeyAuth(
LiteLLM_VerificationToken
): # the expected response object for user api key auth
"""
Return the row in the db
"""
api_key: Optional[str] = None
@root_validator(pre=True)
def check_api_key(cls, values):
if values.get("api_key") is not None:
values.update({"token": hash_token(values.get("api_key"))})
return values
class LiteLLM_Config(LiteLLMBase): class LiteLLM_Config(LiteLLMBase):
@@ -310,5 +363,22 @@ class LiteLLM_UserTable(LiteLLMBase):
if values.get("spend") is None: if values.get("spend") is None:
values.update({"spend": 0.0}) values.update({"spend": 0.0})
if values.get("models") is None: if values.get("models") is None:
values.update({"models", []}) values.update({"models": []})
return values return values
class LiteLLM_SpendLogs(LiteLLMBase):
request_id: str
api_key: str
model: Optional[str] = ""
call_type: str
spend: Optional[float] = 0.0
total_tokens: Optional[int] = 0
prompt_tokens: Optional[int] = 0
completion_tokens: Optional[int] = 0
startTime: Union[str, datetime, None]
endTime: Union[str, datetime, None]
user: Optional[str] = ""
metadata: Optional[Json] = {}
cache_hit: Optional[str] = "False"
cache_key: Optional[str] = None
@@ -98,7 +98,7 @@ def list_models():
st.error(f"An error occurred while requesting models: {e}") st.error(f"An error occurred while requesting models: {e}")
else: else:
st.warning( st.warning(
"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page." f"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page. Currently set Proxy Endpoint: {st.session_state.get('api_url', None)} and Proxy Key: {st.session_state.get('proxy_key', None)}"
) )
@@ -151,7 +151,7 @@ def create_key():
raise e raise e
else: else:
st.warning( st.warning(
"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page." f"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page. Currently set Proxy Endpoint: {st.session_state.get('api_url', None)} and Proxy Key: {st.session_state.get('proxy_key', None)}"
) )
@@ -5,6 +5,7 @@ from litellm.proxy._types import (
LiteLLM_Config, LiteLLM_Config,
LiteLLM_UserTable, LiteLLM_UserTable,
) )
from litellm.proxy.utils import hash_token
from litellm import get_secret from litellm import get_secret
from typing import Any, List, Literal, Optional, Union from typing import Any, List, Literal, Optional, Union
import json import json
@@ -131,10 +132,27 @@ class DynamoDBWrapper(CustomDB):
raise Exception( raise Exception(
f"Failed to create table - {self.database_arguments.config_table_name}.\nPlease create a new table called {self.database_arguments.config_table_name}\nAND set `hash_key` as 'param_name'" f"Failed to create table - {self.database_arguments.config_table_name}.\nPlease create a new table called {self.database_arguments.config_table_name}\nAND set `hash_key` as 'param_name'"
) )
## Spend
try:
verbose_proxy_logger.debug("DynamoDB Wrapper - Creating Spend Table")
error_occurred = False
table = client.table(self.database_arguments.spend_table_name)
if not await table.exists():
await table.create(
self.throughput_type,
KeySchema(hash_key=KeySpec("request_id", KeyType.string)),
)
except Exception as e:
error_occurred = True
if error_occurred == True:
raise Exception(
f"Failed to create table - {self.database_arguments.key_table_name}.\nPlease create a new table called {self.database_arguments.key_table_name}\nAND set `hash_key` as 'token'"
)
verbose_proxy_logger.debug("DynamoDB Wrapper - Done connecting()") verbose_proxy_logger.debug("DynamoDB Wrapper - Done connecting()")
async def insert_data( async def insert_data(
self, value: Any, table_name: Literal["user", "key", "config"] self, value: Any, table_name: Literal["user", "key", "config", "spend"]
): ):
from aiodynamo.client import Client from aiodynamo.client import Client
from aiodynamo.credentials import Credentials, StaticCredentials from aiodynamo.credentials import Credentials, StaticCredentials
@@ -166,8 +184,13 @@ class DynamoDBWrapper(CustomDB):
table = client.table(self.database_arguments.key_table_name) table = client.table(self.database_arguments.key_table_name)
elif table_name == "config": elif table_name == "config":
table = client.table(self.database_arguments.config_table_name) table = client.table(self.database_arguments.config_table_name)
elif table_name == "spend":
table = client.table(self.database_arguments.spend_table_name)
value = value.copy()
for k, v in value.items(): for k, v in value.items():
if k == "token" and value[k].startswith("sk-"):
value[k] = hash_token(token=v)
if isinstance(v, datetime): if isinstance(v, datetime):
value[k] = v.isoformat() value[k] = v.isoformat()
@@ -224,6 +247,10 @@ class DynamoDBWrapper(CustomDB):
and isinstance(v, str) and isinstance(v, str)
): ):
new_response[k] = json.loads(v) new_response[k] = json.loads(v)
elif (k == "tpm_limit" or k == "rpm_limit") and isinstance(
v, float
):
new_response[k] = int(v)
else: else:
new_response[k] = v new_response[k] = v
new_response = LiteLLM_VerificationToken(**new_response) new_response = LiteLLM_VerificationToken(**new_response)
@@ -281,10 +308,13 @@ class DynamoDBWrapper(CustomDB):
# Initialize an empty UpdateExpression # Initialize an empty UpdateExpression
actions: List = [] actions: List = []
value = value.copy()
for k, v in value.items(): for k, v in value.items():
# Convert datetime object to ISO8601 string # Convert datetime object to ISO8601 string
if isinstance(v, datetime): if isinstance(v, datetime):
v = v.isoformat() v = v.isoformat()
if k == "token" and value[k].startswith("sk-"):
value[k] = hash_token(token=v)
# Accumulate updates # Accumulate updates
actions.append((F(k), Value(value=v))) actions.append((F(k), Value(value=v)))
@@ -1,4 +1,4 @@
from litellm.proxy._types import UserAPIKeyAuth from litellm.proxy._types import UserAPIKeyAuth, GenerateKeyRequest
from fastapi import Request from fastapi import Request
from dotenv import load_dotenv from dotenv import load_dotenv
import os import os
@@ -14,3 +14,40 @@ async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
raise Exception raise Exception
except: except:
raise Exception raise Exception
async def generate_key_fn(data: GenerateKeyRequest):
"""
Asynchronously decides if a key should be generated or not based on the provided data.
Args:
data (GenerateKeyRequest): The data to be used for decision making.
Returns:
bool: True if a key should be generated, False otherwise.
"""
# decide if a key should be generated or not
data_json = data.json() # type: ignore
# Unpacking variables
team_id = data_json.get("team_id")
duration = data_json.get("duration")
models = data_json.get("models")
aliases = data_json.get("aliases")
config = data_json.get("config")
spend = data_json.get("spend")
user_id = data_json.get("user_id")
max_parallel_requests = data_json.get("max_parallel_requests")
metadata = data_json.get("metadata")
tpm_limit = data_json.get("tpm_limit")
rpm_limit = data_json.get("rpm_limit")
if team_id is not None and len(team_id) > 0:
return {
"decision": True,
}
else:
return {
"decision": True,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
@@ -0,0 +1,55 @@
# What this does?
## Checks if key is allowed to use the cache controls passed in to the completion() call
from typing import Optional
import litellm
from litellm.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.integrations.custom_logger import CustomLogger
from fastapi import HTTPException
import json, traceback
class CacheControlCheck(CustomLogger):
# Class variables or attributes
def __init__(self):
pass
def print_verbose(self, print_statement):
if litellm.set_verbose is True:
print(print_statement) # noqa
async def async_pre_call_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
cache: DualCache,
data: dict,
call_type: str,
):
try:
self.print_verbose(f"Inside Cache Control Check Pre-Call Hook")
allowed_cache_controls = user_api_key_dict.allowed_cache_controls
if (allowed_cache_controls is None) or (
len(allowed_cache_controls) == 0
): # assume empty list to be nullable - https://github.com/prisma/prisma/issues/847#issuecomment-546895663
return
if data.get("cache", None) is None:
return
cache_args = data.get("cache", None)
if isinstance(cache_args, dict):
for k, v in cache_args.items():
if k not in allowed_cache_controls:
raise HTTPException(
status_code=403,
detail=f"Not allowed to set {k} as a cache control. Contact admin to change permissions.",
)
else: # invalid cache
return
except HTTPException as e:
raise e
except Exception as e:
traceback.print_exc()
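For reference, the shape of a request body this hook inspects; the cache-control names here are illustrative and the values made up:

data = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Hello"}],
    "cache": {"ttl": 600},  # allowed only if "ttl" is in the key's allowed_cache_controls
}
# a key generated with allowed_cache_controls=["ttl"] passes;
# any other cache-control key in the dict triggers the 403 above.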
@@ -1,9 +1,12 @@
from typing import Optional from typing import Optional
import litellm import litellm, traceback, sys
from litellm.caching import DualCache from litellm.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth from litellm.proxy._types import UserAPIKeyAuth
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
from fastapi import HTTPException from fastapi import HTTPException
from litellm._logging import verbose_proxy_logger
from litellm import ModelResponse
from datetime import datetime
class MaxParallelRequestsHandler(CustomLogger): class MaxParallelRequestsHandler(CustomLogger):
@@ -14,8 +17,12 @@ class MaxParallelRequestsHandler(CustomLogger):
pass pass
def print_verbose(self, print_statement): def print_verbose(self, print_statement):
if litellm.set_verbose is True: try:
print(print_statement) # noqa verbose_proxy_logger.debug(print_statement)
if litellm.set_verbose:
print(print_statement) # noqa
except:
pass
async def async_pre_call_hook( async def async_pre_call_hook(
self, self,
@@ -26,25 +33,56 @@ class MaxParallelRequestsHandler(CustomLogger):
): ):
self.print_verbose(f"Inside Max Parallel Request Pre-Call Hook") self.print_verbose(f"Inside Max Parallel Request Pre-Call Hook")
api_key = user_api_key_dict.api_key api_key = user_api_key_dict.api_key
max_parallel_requests = user_api_key_dict.max_parallel_requests max_parallel_requests = user_api_key_dict.max_parallel_requests or sys.maxsize
tpm_limit = user_api_key_dict.tpm_limit or sys.maxsize
rpm_limit = user_api_key_dict.rpm_limit or sys.maxsize
if api_key is None: if api_key is None:
return return
if max_parallel_requests is None: if (
max_parallel_requests == sys.maxsize
and tpm_limit == sys.maxsize
and rpm_limit == sys.maxsize
):
return return
self.user_api_key_cache = cache # save the api key cache for updating the value self.user_api_key_cache = cache # save the api key cache for updating the value
# ------------
# Setup values
# ------------
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = f"{api_key}::{precise_minute}::request_count"
# CHECK IF REQUEST ALLOWED # CHECK IF REQUEST ALLOWED
request_count_api_key = f"{api_key}_request_count" current = cache.get_cache(
current = cache.get_cache(key=request_count_api_key) key=request_count_api_key
) # {"current_requests": 1, "current_tpm": 1, "current_rpm": 10}
self.print_verbose(f"current: {current}") self.print_verbose(f"current: {current}")
if current is None: if current is None:
cache.set_cache(request_count_api_key, 1) new_val = {
elif int(current) < max_parallel_requests: "current_requests": 1,
"current_tpm": 0,
"current_rpm": 0,
}
cache.set_cache(request_count_api_key, new_val)
elif (
int(current["current_requests"]) < max_parallel_requests
and current["current_tpm"] < tpm_limit
and current["current_rpm"] < rpm_limit
):
# Increase count for this token # Increase count for this token
cache.set_cache(request_count_api_key, int(current) + 1) new_val = {
"current_requests": current["current_requests"] + 1,
"current_tpm": current["current_tpm"],
"current_rpm": current["current_rpm"],
}
cache.set_cache(request_count_api_key, new_val)
else: else:
raise HTTPException( raise HTTPException(
status_code=429, detail="Max parallel request limit reached." status_code=429, detail="Max parallel request limit reached."
@@ -52,7 +90,7 @@ class MaxParallelRequestsHandler(CustomLogger):
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try: try:
self.print_verbose(f"INSIDE ASYNC SUCCESS LOGGING") self.print_verbose(f"INSIDE parallel request limiter ASYNC SUCCESS LOGGING")
user_api_key = kwargs["litellm_params"]["metadata"]["user_api_key"] user_api_key = kwargs["litellm_params"]["metadata"]["user_api_key"]
if user_api_key is None: if user_api_key is None:
return return
@@ -60,29 +98,50 @@ class MaxParallelRequestsHandler(CustomLogger):
if self.user_api_key_cache is None: if self.user_api_key_cache is None:
return return
request_count_api_key = f"{user_api_key}_request_count" # ------------
# check if it has collected an entire stream response # Setup values
self.print_verbose( # ------------
f"'complete_streaming_response' is in kwargs: {'complete_streaming_response' in kwargs}"
) current_date = datetime.now().strftime("%Y-%m-%d")
if "complete_streaming_response" in kwargs or kwargs["stream"] != True: current_hour = datetime.now().strftime("%H")
# Decrease count for this token current_minute = datetime.now().strftime("%M")
current = ( precise_minute = f"{current_date}-{current_hour}-{current_minute}"
self.user_api_key_cache.get_cache(key=request_count_api_key) or 1
) total_tokens = 0
new_val = current - 1
self.print_verbose(f"updated_value in success call: {new_val}") if isinstance(response_obj, ModelResponse):
self.user_api_key_cache.set_cache(request_count_api_key, new_val) total_tokens = response_obj.usage.total_tokens
request_count_api_key = f"{user_api_key}::{precise_minute}::request_count"
current = self.user_api_key_cache.get_cache(key=request_count_api_key) or {
"current_requests": 1,
"current_tpm": total_tokens,
"current_rpm": 1,
}
# ------------
# Update usage
# ------------
new_val = {
"current_requests": current["current_requests"] - 1,
"current_tpm": current["current_tpm"] + total_tokens,
"current_rpm": current["current_rpm"] + 1,
}
self.print_verbose(f"updated_value in success call: {new_val}")
self.user_api_key_cache.set_cache(
request_count_api_key, new_val, ttl=60
) # store in cache for 1 min.
except Exception as e:
self.print_verbose(e) # noqa
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
try:
self.print_verbose(f"Inside Max Parallel Request Failure Hook")
user_api_key = kwargs["litellm_params"]["metadata"]["user_api_key"]
if user_api_key is None:
return
if self.user_api_key_cache is None:
@@ -90,19 +149,46 @@ class MaxParallelRequestsHandler(CustomLogger):
## decrement call count if call failed
if (
hasattr(kwargs["exception"], "status_code")
and kwargs["exception"].status_code == 429
and "Max parallel request limit reached" in str(kwargs["exception"])
):
pass # ignore failed calls due to max limit being reached
else:
# ------------
# Setup values
# ------------
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
request_count_api_key = (
f"{user_api_key}::{precise_minute}::request_count"
)
# ------------
# Update usage
# ------------
current = self.user_api_key_cache.get_cache(
key=request_count_api_key
) or {
"current_requests": 1,
"current_tpm": 0,
"current_rpm": 0,
}
new_val = {
"current_requests": current["current_requests"] - 1,
"current_tpm": current["current_tpm"],
"current_rpm": current["current_rpm"],
}
self.print_verbose(f"updated_value in failure call: {new_val}") self.print_verbose(f"updated_value in failure call: {new_val}")
self.user_api_key_cache.set_cache(request_count_api_key, new_val) self.user_api_key_cache.set_cache(
request_count_api_key, new_val, ttl=60
) # save in cache for up to 1 min.
except Exception as e: except Exception as e:
self.print_verbose(f"An exception occurred - {str(e)}") # noqa print(f"An exception occurred - {str(e)}") # noqa

View file

@@ -157,6 +157,12 @@ def is_port_in_use(port):
type=int,
help="Number of requests to hit async endpoint with",
)
@click.option(
"--run_gunicorn",
default=False,
is_flag=True,
help="Starts proxy via gunicorn, instead of uvicorn (better for managing multiple workers)",
)
@click.option("--local", is_flag=True, default=False, help="for local debugging")
def run_server(
host,
@@ -186,21 +192,32 @@ def run_server(
use_queue,
health,
version,
run_gunicorn,
):
global feature_telemetry
args = locals()
if local:
from proxy_server import app, save_worker_config, usage_telemetry, ProxyConfig
else:
try:
from .proxy_server import (
app,
save_worker_config,
usage_telemetry,
ProxyConfig,
)
except ImportError as e:
if "litellm[proxy]" in str(e):
# user is missing a proxy dependency, ask them to pip install litellm[proxy]
raise e
else:
# this is just a local/relative import error, user git cloned litellm
from proxy_server import (
app,
save_worker_config,
usage_telemetry,
ProxyConfig,
)
feature_telemetry = usage_telemetry
if version == True:
pkg_version = importlib.metadata.version("litellm")
@@ -373,16 +390,16 @@ def run_server(
read from there and save it to os.env['DATABASE_URL']
"""
try:
import yaml, asyncio
except:
raise ImportError(
"yaml needs to be imported. Run - `pip install 'litellm[proxy]'`"
)
proxy_config = ProxyConfig()
_, _, general_settings = asyncio.run(
proxy_config.load_config(router=None, config_file_path=config)
)
database_url = general_settings.get("database_url", None)
if database_url and database_url.startswith("os.environ/"):
original_dir = os.getcwd()
@@ -418,6 +435,7 @@ def run_server(
break # Exit the loop if the subprocess succeeds
except subprocess.CalledProcessError as e:
print(f"Error: {e}")
time.sleep(random.randrange(start=1, stop=5))
finally:
os.chdir(original_dir)
else:
@@ -428,9 +446,9 @@ def run_server(
port = random.randint(1024, 49152)
from litellm.proxy.proxy_server import app
if run_gunicorn == False:
uvicorn.run(app, host=host, port=port) # run uvicorn
elif run_gunicorn == True:
import gunicorn.app.base
# Gunicorn Application Class

View file

@@ -11,6 +11,12 @@ model_list:
output_cost_per_token: 0.00003
max_tokens: 4096
base_model: gpt-3.5-turbo
- model_name: gpt-4
litellm_params:
model: azure/chatgpt-v-2
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
- model_name: gpt-vision
litellm_params:
model: azure/gpt-4-vision
@@ -25,6 +31,9 @@ model_list:
- model_name: BEDROCK_GROUP
litellm_params:
model: bedrock/cohere.command-text-v14
- model_name: tg-ai
litellm_params:
model: together_ai/mistralai/Mistral-7B-Instruct-v0.1
- model_name: sagemaker
litellm_params:
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
@@ -57,12 +66,21 @@ model_list:
mode: embedding
litellm_settings:
fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
success_callback: ['langfuse']
max_budget: 10 # global budget for proxy
budget_duration: 30d # global budget duration, will reset after 30d
default_key_generate_params:
max_budget: 1.5000
models: ["azure-gpt-3.5"]
duration: None
# cache: True
# setting callback class
# callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
general_settings:
master_key: sk-1234
alerting: ["slack"]
alerting_threshold: 10 # sends alerts if requests hang for 10+ seconds
# database_type: "dynamo_db"
# database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
# "billing_mode": "PAY_PER_REQUEST",

File diff suppressed because it is too large

View file

@@ -7,28 +7,64 @@ generator client {
provider = "prisma-client-py"
}
// Track spend, rate limit, budget Users
model LiteLLM_UserTable {
user_id String @unique
team_id String?
max_budget Float?
spend Float @default(0.0)
user_email String?
models String[]
max_parallel_requests Int?
tpm_limit BigInt?
rpm_limit BigInt?
budget_duration String?
budget_reset_at DateTime?
allowed_cache_controls String[] @default([])
}
// Generate Tokens for Proxy
model LiteLLM_VerificationToken {
token String @unique
key_name String?
key_alias String?
spend Float @default(0.0)
expires DateTime?
models String[]
aliases Json @default("{}")
config Json @default("{}")
user_id String?
team_id String?
max_parallel_requests Int?
metadata Json @default("{}")
tpm_limit BigInt?
rpm_limit BigInt?
max_budget Float?
budget_duration String?
budget_reset_at DateTime?
allowed_cache_controls String[] @default([])
}
// store proxy config.yaml
model LiteLLM_Config {
param_name String @id
param_value Json?
}
// View spend, model, api_key per request
model LiteLLM_SpendLogs {
request_id String @unique
call_type String
api_key String @default ("")
spend Float @default(0.0)
total_tokens Int @default(0)
prompt_tokens Int @default(0)
completion_tokens Int @default(0)
startTime DateTime // Assuming start_time is a DateTime field
endTime DateTime // Assuming end_time is a DateTime field
model String @default("")
user String @default("")
metadata Json @default("{}")
cache_hit String @default("")
cache_key String @default("")
}
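A rough sketch of writing one LiteLLM_SpendLogs row with prisma-client-py, mirroring the upsert that PrismaClient.insert_data(table_name="spend") performs later in this diff. The `from prisma import Prisma` import and the example field values are assumptions for illustration, not part of this PR.

import asyncio
from datetime import datetime
from prisma import Prisma  # generated prisma-client-py client (assumed available)

async def log_spend_row():
    db = Prisma()
    await db.connect()
    now = datetime.utcnow()
    await db.litellm_spendlogs.upsert(
        where={"request_id": "chatcmpl-example-123"},  # request_id is @unique
        data={
            "create": {
                "request_id": "chatcmpl-example-123",
                "call_type": "litellm.completion",
                "model": "gpt-3.5-turbo",
                "spend": 0.002,
                "total_tokens": 50,
                "prompt_tokens": 30,
                "completion_tokens": 20,
                "startTime": now,
                "endTime": now,
            },
            "update": {},  # do nothing if this request_id already exists
        },
    )
    await db.disconnect()

asyncio.run(log_spend_row())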

View file

@@ -11,12 +11,10 @@ async def litellm_completion():
# Your existing code for litellm_completion goes here
try:
response = await litellm_client.chat.completions.create(
model="azure-gpt-3.5",
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
)
print(response)
return response
except Exception as e:
@@ -27,9 +25,9 @@ async def litellm_completion():
async def main():
for i in range(150):
start = time.time()
n = 150 # Number of concurrent tasks
tasks = [litellm_completion() for _ in range(n)]
chat_completions = await asyncio.gather(*tasks)

View file

@@ -4,22 +4,28 @@ const openai = require('openai');
process.env.DEBUG=false;
async function runOpenAI() {
const client = new openai.OpenAI({
apiKey: 'sk-JkKeNi6WpWDngBsghJ6B9g',
baseURL: 'http://0.0.0.0:8000'
});
try {
const response = await client.chat.completions.create({
model: 'sagemaker',
stream: true,
max_tokens: 1000,
messages: [
{
role: 'user',
content: 'write a 20 pg essay about YC ',
},
],
});
console.log(response);
for await (const chunk of response) {
console.log(chunk);
console.log(chunk.choices[0].delta.content);
}
} catch (error) {
console.log("got this exception from server");
console.error(error);

View file

@@ -1,21 +1,28 @@
from typing import Optional, List, Any, Literal, Union
import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx
import litellm, backoff
from litellm.proxy._types import (
UserAPIKeyAuth,
DynamoDBArgs,
LiteLLM_VerificationToken,
LiteLLM_SpendLogs,
)
from litellm.caching import DualCache
from litellm.proxy.hooks.parallel_request_limiter import MaxParallelRequestsHandler
from litellm.proxy.hooks.max_budget_limiter import MaxBudgetLimiter
from litellm.proxy.hooks.cache_control_check import CacheControlCheck
from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy.db.base_client import CustomDB
from litellm._logging import verbose_proxy_logger
from fastapi import HTTPException, status
import smtplib, re
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from datetime import datetime, timedelta
def print_verbose(print_statement):
verbose_proxy_logger.debug(print_statement)
if litellm.set_verbose:
print(f"LiteLLM Proxy: {print_statement}") # noqa
@@ -36,6 +43,7 @@ class ProxyLogging:
self.call_details["user_api_key_cache"] = user_api_key_cache
self.max_parallel_request_limiter = MaxParallelRequestsHandler()
self.max_budget_limiter = MaxBudgetLimiter()
self.cache_control_check = CacheControlCheck()
self.alerting: Optional[List] = None
self.alerting_threshold: float = 300 # default to 5 min. threshold
pass
@@ -51,6 +59,7 @@ class ProxyLogging:
print_verbose(f"INITIALIZING LITELLM CALLBACKS!")
litellm.callbacks.append(self.max_parallel_request_limiter)
litellm.callbacks.append(self.max_budget_limiter)
litellm.callbacks.append(self.cache_control_check)
for callback in litellm.callbacks:
if callback not in litellm.input_callback:
litellm.input_callback.append(callback)
@@ -91,8 +100,9 @@ class ProxyLogging:
2. /embeddings
3. /image/generation
"""
print_verbose(f"Inside Proxy Logging Pre-call hook!")
### ALERTING ###
asyncio.create_task(self.response_taking_too_long(request_data=data))
try:
for callback in litellm.callbacks:
@@ -132,27 +142,113 @@ class ProxyLogging:
start_time: Optional[float] = None,
end_time: Optional[float] = None,
type: Literal["hanging_request", "slow_response"] = "hanging_request",
request_data: Optional[dict] = None,
):
if request_data is not None:
model = request_data.get("model", "")
messages = request_data.get("messages", "")
# try casting messages to str and get the first 100 characters, else mark as None
try:
messages = str(messages)
messages = messages[:10000]
except:
messages = None
request_info = f"\nRequest Model: {model}\nMessages: {messages}"
else:
request_info = ""
if type == "hanging_request": if type == "hanging_request":
# Simulate a long-running operation that could take more than 5 minutes # Simulate a long-running operation that could take more than 5 minutes
await asyncio.sleep( await asyncio.sleep(
self.alerting_threshold self.alerting_threshold
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests ) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
if (
await self.alerting_handler( request_data is not None
message=f"Requests are hanging - {self.alerting_threshold}s+ request time", and request_data.get("litellm_status", "") != "success"
level="Medium", ):
) # only alert hanging responses if they have not been marked as success
alerting_message = (
f"Requests are hanging - {self.alerting_threshold}s+ request time"
)
await self.alerting_handler(
message=alerting_message + request_info,
level="Medium",
)
elif ( elif (
type == "slow_response" and start_time is not None and end_time is not None type == "slow_response" and start_time is not None and end_time is not None
): ):
slow_message = f"Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s"
if end_time - start_time > self.alerting_threshold: if end_time - start_time > self.alerting_threshold:
await self.alerting_handler( await self.alerting_handler(
message=f"Responses are slow - {round(end_time-start_time,2)}s response time", message=slow_message + request_info,
level="Low", level="Low",
) )
async def budget_alerts(
self,
type: Literal["token_budget", "user_budget", "user_and_proxy_budget"],
user_max_budget: float,
user_current_spend: float,
user_info=None,
):
if self.alerting is None:
# do nothing if alerting is not switched on
return
if type == "user_and_proxy_budget":
user_info = dict(user_info)
user_id = user_info["user_id"]
max_budget = user_info["max_budget"]
spend = user_info["spend"]
user_email = user_info["user_email"]
user_info = f"""\nUser ID: {user_id}\nMax Budget: ${max_budget}\nSpend: ${spend}\nUser Email: {user_email}"""
elif type == "token_budget":
token_info = dict(user_info)
token = token_info["token"]
spend = token_info["spend"]
max_budget = token_info["max_budget"]
user_id = token_info["user_id"]
user_info = f"""\nToken: {token}\nSpend: ${spend}\nMax Budget: ${max_budget}\nUser ID: {user_id}"""
else:
user_info = str(user_info)
# percent of max_budget left to spend
percent_left = (user_max_budget - user_current_spend) / user_max_budget
verbose_proxy_logger.debug(
f"Budget Alerts: Percent left: {percent_left} for {user_info}"
)
# check if crossed budget
if user_current_spend >= user_max_budget:
verbose_proxy_logger.debug(f"Budget Crossed for {user_info}")
message = "Budget Crossed for" + user_info
await self.alerting_handler(
message=message,
level="High",
)
return
# check if 5% of max budget is left
if percent_left <= 0.05:
message = "5% budget left for" + user_info
await self.alerting_handler(
message=message,
level="Medium",
)
return
# check if 15% of max budget is left
if percent_left <= 0.15:
message = "15% budget left for" + user_info
await self.alerting_handler(
message=message,
level="Low",
)
return
return
async def alerting_handler(
self, message: str, level: Literal["Low", "Medium", "High"]
):
@@ -163,12 +259,20 @@ class ProxyLogging:
- Requests are hanging
- Calls are failing
- DB Read/Writes are failing
- Proxy Close to max budget
- Key Close to max budget
Parameters:
level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'.
message: str - what is the alert about
"""
from datetime import datetime
# Get the current timestamp
current_time = datetime.now().strftime("%H:%M:%S")
formatted_message = (
f"Level: {level}\nTimestamp: {current_time}\n\nMessage: {message}"
)
if self.alerting is None:
return
@@ -179,7 +283,9 @@ class ProxyLogging:
raise Exception("Missing SLACK_WEBHOOK_URL from environment")
payload = {"text": formatted_message}
headers = {"Content-type": "application/json"}
async with aiohttp.ClientSession(
connector=aiohttp.TCPConnector(ssl=False)
) as session:
async with session.post(
slack_webhook_url, json=payload, headers=headers
) as response:
@@ -316,7 +422,7 @@ class PrismaClient:
self,
key: str,
value: Any,
table_name: Literal["users", "keys", "config", "spend"],
):
"""
Generic implementation of get data
@@ -334,6 +440,10 @@ class PrismaClient:
response = await self.db.litellm_config.find_first( # type: ignore
where={key: value} # type: ignore
)
elif table_name == "spend":
response = await self.db.litellm_spendlogs.find_first( # type: ignore
where={key: value} # type: ignore
)
return response
except Exception as e:
asyncio.create_task(
@@ -352,8 +462,12 @@ class PrismaClient:
self,
token: Optional[str] = None,
user_id: Optional[str] = None,
user_id_list: Optional[list] = None,
key_val: Optional[dict] = None,
table_name: Optional[Literal["user", "key", "config", "spend"]] = None,
query_type: Literal["find_unique", "find_all"] = "find_unique",
expires: Optional[datetime] = None,
reset_at: Optional[datetime] = None,
):
try:
print_verbose("PrismaClient: get_data")
@@ -365,14 +479,18 @@ class PrismaClient:
hashed_token = token
if token.startswith("sk-"):
hashed_token = self.hash_token(token=token)
verbose_proxy_logger.debug(
f"PrismaClient: find_unique for token: {hashed_token}"
)
if query_type == "find_unique":
response = await self.db.litellm_verificationtoken.find_unique(
where={"token": hashed_token}
)
if response is not None:
# for prisma we need to cast the expires time to str
if response.expires is not None and isinstance(
response.expires, datetime
):
response.expires = response.expires.isoformat()
elif query_type == "find_all" and user_id is not None:
response = await self.db.litellm_verificationtoken.find_many(
@@ -382,6 +500,28 @@ class PrismaClient:
for r in response:
if isinstance(r.expires, datetime):
r.expires = r.expires.isoformat()
elif (
query_type == "find_all"
and expires is not None
and reset_at is not None
):
response = await self.db.litellm_verificationtoken.find_many(
where={ # type:ignore
"OR": [
{"expires": None},
{"expires": {"gt": expires}},
],
"budget_reset_at": {"lt": reset_at},
}
)
if response is not None and len(response) > 0:
for r in response:
if isinstance(r.expires, datetime):
r.expires = r.expires.isoformat()
elif query_type == "find_all":
response = await self.db.litellm_verificationtoken.find_many(
order={"spend": "desc"},
)
print_verbose(f"PrismaClient: response={response}") print_verbose(f"PrismaClient: response={response}")
if response is not None: if response is not None:
return response return response
@ -391,13 +531,61 @@ class PrismaClient:
status_code=status.HTTP_401_UNAUTHORIZED, status_code=status.HTTP_401_UNAUTHORIZED,
detail="Authentication Error: invalid user key - token does not exist", detail="Authentication Error: invalid user key - token does not exist",
) )
elif user_id is not None: elif user_id is not None or (
response = await self.db.litellm_usertable.find_unique( # type: ignore table_name is not None and table_name == "user"
where={ ):
"user_id": user_id, if query_type == "find_unique":
} response = await self.db.litellm_usertable.find_unique( # type: ignore
) where={
"user_id": user_id, # type: ignore
}
)
elif query_type == "find_all" and reset_at is not None:
response = await self.db.litellm_usertable.find_many(
where={ # type:ignore
"budget_reset_at": {"lt": reset_at},
}
)
elif query_type == "find_all" and user_id_list is not None:
user_id_values = str(tuple(user_id_list))
sql_query = f"""
SELECT *
FROM "LiteLLM_UserTable"
WHERE "user_id" IN {user_id_values}
"""
# Execute the raw query
# The asterisk before `user_id_list` unpacks the list into separate arguments
response = await self.db.query_raw(sql_query)
elif query_type == "find_all":
response = await self.db.litellm_usertable.find_many( # type: ignore
order={"spend": "desc"},
)
return response
elif table_name == "spend":
verbose_proxy_logger.debug(
f"PrismaClient: get_data: table_name == 'spend'"
)
if key_val is not None:
if query_type == "find_unique":
response = await self.db.litellm_spendlogs.find_unique( # type: ignore
where={ # type: ignore
key_val["key"]: key_val["value"], # type: ignore
}
)
elif query_type == "find_all":
response = await self.db.litellm_spendlogs.find_many( # type: ignore
where={
key_val["key"]: key_val["value"], # type: ignore
}
)
return response
else:
response = await self.db.litellm_spendlogs.find_many( # type: ignore
order={"startTime": "desc"},
)
return response
except Exception as e:
print_verbose(f"LiteLLM Prisma Client Exception: {e}")
import traceback
@@ -417,7 +605,7 @@ class PrismaClient:
on_backoff=on_backoff, # specifying the function to call on backoff
)
async def insert_data(
self, data: dict, table_name: Literal["user", "key", "config", "spend"]
):
"""
Add a key to the database. If it already exists, do nothing.
@@ -440,6 +628,7 @@ class PrismaClient:
"update": {}, # don't do anything if it already exists
},
)
verbose_proxy_logger.info(f"Data Inserted into Keys Table")
return new_verification_token
elif table_name == "user":
db_data = self.jsonify_object(data=data)
@@ -450,6 +639,7 @@ class PrismaClient:
"update": {}, # don't do anything if it already exists
},
)
verbose_proxy_logger.info(f"Data Inserted into User Table")
return new_user_row
elif table_name == "config":
"""
@@ -473,8 +663,20 @@ class PrismaClient:
)
tasks.append(updated_table_row)
await asyncio.gather(*tasks)
verbose_proxy_logger.info(f"Data Inserted into Config Table")
elif table_name == "spend":
db_data = self.jsonify_object(data=data)
new_spend_row = await self.db.litellm_spendlogs.upsert(
where={"request_id": data["request_id"]},
data={
"create": {**db_data}, # type: ignore
"update": {}, # don't do anything if it already exists
},
)
verbose_proxy_logger.info(f"Data Inserted into Spend Table")
return new_spend_row
except Exception as e:
print_verbose(f"LiteLLM Prisma Client Exception: {e}")
asyncio.create_task(
@@ -494,7 +696,11 @@ class PrismaClient:
self,
token: Optional[str] = None,
data: dict = {},
data_list: Optional[List] = None,
user_id: Optional[str] = None,
query_type: Literal["update", "update_many"] = "update",
table_name: Optional[Literal["user", "key", "config", "spend"]] = None,
update_key_values: Optional[dict] = None,
):
"""
Update existing data
@@ -511,17 +717,95 @@ class PrismaClient:
where={"token": token}, # type: ignore
data={**db_data}, # type: ignore
)
verbose_proxy_logger.debug(
"\033[91m"
+ f"DB Token Table update succeeded {response}"
+ "\033[0m"
)
return {"token": token, "data": db_data} return {"token": token, "data": db_data}
elif user_id is not None: elif (
user_id is not None
or (table_name is not None and table_name == "user")
and query_type == "update"
):
""" """
If data['spend'] + data['user'], update the user table with spend info as well If data['spend'] + data['user'], update the user table with spend info as well
""" """
update_user_row = await self.db.litellm_usertable.update( if user_id is None:
user_id = db_data["user_id"]
if update_key_values is None:
update_key_values = db_data
update_user_row = await self.db.litellm_usertable.upsert(
where={"user_id": user_id}, # type: ignore where={"user_id": user_id}, # type: ignore
data={**db_data}, # type: ignore data={
"create": {**db_data}, # type: ignore
"update": {
**update_key_values # type: ignore
}, # just update user-specified values, if it already exists
},
)
verbose_proxy_logger.info(
"\033[91m"
+ f"DB User Table - update succeeded {update_user_row}"
+ "\033[0m"
)
return {"user_id": user_id, "data": db_data}
elif (
table_name is not None
and table_name == "key"
and query_type == "update_many"
and data_list is not None
and isinstance(data_list, list)
):
"""
Batch write update queries
"""
batcher = self.db.batch_()
for idx, t in enumerate(data_list):
# check if plain text or hash
if t.token.startswith("sk-"): # type: ignore
t.token = self.hash_token(token=t.token) # type: ignore
try:
data_json = self.jsonify_object(data=t.model_dump())
except:
data_json = self.jsonify_object(data=t.dict())
batcher.litellm_verificationtoken.update(
where={"token": t.token}, # type: ignore
data={**data_json}, # type: ignore
)
await batcher.commit()
print_verbose(
"\033[91m" + f"DB Token Table update succeeded" + "\033[0m"
)
elif (
table_name is not None
and table_name == "user"
and query_type == "update_many"
and data_list is not None
and isinstance(data_list, list)
):
"""
Batch write update queries
"""
batcher = self.db.batch_()
for idx, user in enumerate(data_list):
try:
data_json = self.jsonify_object(data=user.model_dump())
except:
data_json = self.jsonify_object(data=user.dict())
batcher.litellm_usertable.upsert(
where={"user_id": user.user_id}, # type: ignore
data={
"create": {**data_json}, # type: ignore
"update": {
**data_json # type: ignore
}, # just update user-specified values, if it already exists
},
)
await batcher.commit()
verbose_proxy_logger.info(
"\033[91m" + f"DB User Table Batch update succeeded" + "\033[0m"
)
except Exception as e:
asyncio.create_task(
self.proxy_logging_obj.failure_handler(original_exception=e)
@@ -542,7 +826,13 @@ class PrismaClient:
Allow user to delete a key(s)
"""
try:
hashed_tokens = []
for token in tokens:
if isinstance(token, str) and token.startswith("sk-"):
hashed_token = self.hash_token(token=token)
else:
hashed_token = token
hashed_tokens.append(hashed_token)
await self.db.litellm_verificationtoken.delete_many(
where={"token": {"in": hashed_tokens}}
)
@@ -750,7 +1040,8 @@ async def send_email(sender_name, sender_email, receiver_email, subject, html):
print_verbose(f"SMTP Connection Init")
# Establish a secure connection with the SMTP server
with smtplib.SMTP(smtp_host, smtp_port) as server:
if os.getenv("SMTP_TLS", "True") != "False":
server.starttls()
# Login to your email account
server.login(smtp_username, smtp_password)
@@ -759,4 +1050,228 @@ async def send_email(sender_name, sender_email, receiver_email, subject, html):
server.send_message(email_message)
except Exception as e:
print_verbose("An error occurred while sending the email:" + str(e))
def hash_token(token: str):
import hashlib
# Hash the string using SHA-256
hashed_token = hashlib.sha256(token.encode()).hexdigest()
return hashed_token
def get_logging_payload(kwargs, response_obj, start_time, end_time):
from litellm.proxy._types import LiteLLM_SpendLogs
from pydantic import Json
import uuid
verbose_proxy_logger.debug(
f"SpendTable: get_logging_payload - kwargs: {kwargs}\n\n"
)
if kwargs == None:
kwargs = {}
# standardize this function to be used across, s3, dynamoDB, langfuse logging
litellm_params = kwargs.get("litellm_params", {})
metadata = (
litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None
call_type = kwargs.get("call_type", "litellm.completion")
cache_hit = kwargs.get("cache_hit", False)
usage = response_obj["usage"]
if type(usage) == litellm.Usage:
usage = dict(usage)
id = response_obj.get("id", str(uuid.uuid4()))
api_key = metadata.get("user_api_key", "")
if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
# hash the api_key
api_key = hash_token(api_key)
if "headers" in metadata and "authorization" in metadata["headers"]:
metadata["headers"].pop(
"authorization"
) # do not store the original `sk-..` api key in the db
if litellm.cache is not None:
cache_key = litellm.cache.get_cache_key(**kwargs)
else:
cache_key = "Cache OFF"
if cache_hit == True:
import time
id = f"{id}_cache_hit{time.time()}" # SpendLogs does not allow duplicate request_id
payload = {
"request_id": id,
"call_type": call_type,
"api_key": api_key,
"cache_hit": cache_hit,
"startTime": start_time,
"endTime": end_time,
"model": kwargs.get("model", ""),
"user": kwargs.get("user", ""),
"metadata": metadata,
"cache_key": cache_key,
"total_tokens": usage.get("total_tokens", 0),
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
}
json_fields = [
field
for field, field_type in LiteLLM_SpendLogs.__annotations__.items()
if field_type == Json or field_type == Optional[Json]
]
str_fields = [
field
for field, field_type in LiteLLM_SpendLogs.__annotations__.items()
if field_type == str or field_type == Optional[str]
]
datetime_fields = [
field
for field, field_type in LiteLLM_SpendLogs.__annotations__.items()
if field_type == datetime
]
for param in json_fields:
if param in payload and type(payload[param]) != Json:
if type(payload[param]) == litellm.ModelResponse:
payload[param] = payload[param].model_dump_json()
if type(payload[param]) == litellm.EmbeddingResponse:
payload[param] = payload[param].model_dump_json()
else:
payload[param] = json.dumps(payload[param])
for param in str_fields:
if param in payload and type(payload[param]) != str:
payload[param] = str(payload[param])
return payload
def _duration_in_seconds(duration: str):
match = re.match(r"(\d+)([smhd]?)", duration)
if not match:
raise ValueError("Invalid duration format")
value, unit = match.groups()
value = int(value)
if unit == "s":
return value
elif unit == "m":
return value * 60
elif unit == "h":
return value * 3600
elif unit == "d":
return value * 86400
else:
raise ValueError("Unsupported duration unit")
async def reset_budget(prisma_client: PrismaClient):
"""
Gets all the non-expired keys for a db, which need spend to be reset
Resets their spend
Updates db
"""
if prisma_client is not None:
### RESET KEY BUDGET ###
now = datetime.utcnow()
keys_to_reset = await prisma_client.get_data(
table_name="key", query_type="find_all", expires=now, reset_at=now
)
if keys_to_reset is not None and len(keys_to_reset) > 0:
for key in keys_to_reset:
key.spend = 0.0
duration_s = _duration_in_seconds(duration=key.budget_duration)
key.budget_reset_at = now + timedelta(seconds=duration_s)
await prisma_client.update_data(
query_type="update_many", data_list=keys_to_reset, table_name="key"
)
### RESET USER BUDGET ###
now = datetime.utcnow()
users_to_reset = await prisma_client.get_data(
table_name="user", query_type="find_all", reset_at=now
)
verbose_proxy_logger.debug(f"users_to_reset from get_data: {users_to_reset}")
if users_to_reset is not None and len(users_to_reset) > 0:
for user in users_to_reset:
user.spend = 0.0
duration_s = _duration_in_seconds(duration=user.budget_duration)
user.budget_reset_at = now + timedelta(seconds=duration_s)
await prisma_client.update_data(
query_type="update_many", data_list=users_to_reset, table_name="user"
)
# LiteLLM Admin UI - Non SSO Login
html_form = """
<!DOCTYPE html>
<html>
<head>
<title>LiteLLM Login</title>
<style>
body {
font-family: Arial, sans-serif;
background-color: #f4f4f4;
margin: 0;
padding: 0;
display: flex;
justify-content: center;
align-items: center;
height: 100vh;
}
form {
background-color: #fff;
padding: 20px;
border-radius: 8px;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}
label {
display: block;
margin-bottom: 8px;
}
input {
width: 100%;
padding: 8px;
margin-bottom: 16px;
box-sizing: border-box;
border: 1px solid #ccc;
border-radius: 4px;
}
input[type="submit"] {
background-color: #4caf50;
color: #fff;
cursor: pointer;
}
input[type="submit"]:hover {
background-color: #45a049;
}
</style>
</head>
<body>
<form action="/login" method="post">
<h2>LiteLLM Login</h2>
<label for="username">Username:</label>
<input type="text" id="username" name="username" required>
<label for="password">Password:</label>
<input type="password" id="password" name="password" required>
<input type="submit" value="Submit">
</form>
</body>
</html>
"""

View file

@@ -94,11 +94,15 @@ class Router:
timeout: Optional[float] = None,
default_litellm_params={}, # default params for Router.chat.completion.create
set_verbose: bool = False,
debug_level: Literal["DEBUG", "INFO"] = "INFO",
fallbacks: List = [],
context_window_fallbacks: List = [],
model_group_alias: Optional[dict] = {},
retry_after: int = 0, # min time to wait before retrying a failed request
allowed_fails: Optional[
int
] = None, # Number of times a deployment can fail before being added to cooldown
cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
routing_strategy: Literal[
"simple-shuffle",
"least-busy",
@@ -107,7 +111,42 @@ class Router:
] = "simple-shuffle",
routing_strategy_args: dict = {}, # just for latency-based routing
) -> None:
"""
Initialize the Router class with the given parameters for caching, reliability, and routing strategy.
Args:
model_list (Optional[list]): List of models to be used. Defaults to None.
redis_url (Optional[str]): URL of the Redis server. Defaults to None.
redis_host (Optional[str]): Hostname of the Redis server. Defaults to None.
redis_port (Optional[int]): Port of the Redis server. Defaults to None.
redis_password (Optional[str]): Password of the Redis server. Defaults to None.
cache_responses (Optional[bool]): Flag to enable caching of responses. Defaults to False.
cache_kwargs (dict): Additional kwargs to pass to RedisCache. Defaults to {}.
caching_groups (Optional[List[tuple]]): List of model groups for caching across model groups. Defaults to None.
client_ttl (int): Time-to-live for cached clients in seconds. Defaults to 3600.
num_retries (int): Number of retries for failed requests. Defaults to 0.
timeout (Optional[float]): Timeout for requests. Defaults to None.
default_litellm_params (dict): Default parameters for Router.chat.completion.create. Defaults to {}.
set_verbose (bool): Flag to set verbose mode. Defaults to False.
debug_level (Literal["DEBUG", "INFO"]): Debug level for logging. Defaults to "INFO".
fallbacks (List): List of fallback options. Defaults to [].
context_window_fallbacks (List): List of context window fallback options. Defaults to [].
model_group_alias (Optional[dict]): Alias for model groups. Defaults to {}.
retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1.
routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.
Returns:
Router: An instance of the litellm.Router class.
"""
self.set_verbose = set_verbose
if self.set_verbose:
if debug_level == "INFO":
verbose_router_logger.setLevel(logging.INFO)
elif debug_level == "DEBUG":
verbose_router_logger.setLevel(logging.DEBUG)
self.deployment_names: List = (
[]
) # names of models under litellm_params. ex. azure/chatgpt-v-2
@@ -157,6 +196,7 @@ class Router:
self.deployment_latency_map[m["litellm_params"]["model"]] = 0
self.allowed_fails = allowed_fails or litellm.allowed_fails
self.cooldown_time = cooldown_time or 1
self.failed_calls = (
InMemoryCache()
) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
@@ -249,16 +289,13 @@ class Router:
timeout = kwargs.get("request_timeout", self.timeout)
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
kwargs.setdefault("metadata", {}).update({"model_group": model})
response = self.function_with_fallbacks(**kwargs)
return response
except Exception as e:
raise e
def _completion(self, model: str, messages: List[Dict[str, str]], **kwargs):
model_name = None
try:
# pick the one that is available (lowest TPM/RPM)
deployment = self.get_available_deployment(
@@ -271,6 +308,7 @@ class Router:
)
data = deployment["litellm_params"].copy()
kwargs["model_info"] = deployment.get("model_info", {})
model_name = data["model"]
for k, v in self.default_litellm_params.items():
if (
k not in kwargs
@@ -292,7 +330,7 @@ class Router:
else:
model_client = potential_model_client
response = litellm.completion(
**{
**data,
"messages": messages,
@@ -301,7 +339,14 @@ class Router:
**kwargs,
}
)
verbose_router_logger.info(
f"litellm.completion(model={model_name})\033[32m 200 OK\033[0m"
)
return response
except Exception as e:
verbose_router_logger.info(
f"litellm.completion(model={model_name})\033[31m Exception {str(e)}\033[0m"
)
raise e
async def acompletion(self, model: str, messages: List[Dict[str, str]], **kwargs):
@@ -830,6 +875,9 @@ class Router:
"""
try:
kwargs["model"] = mg
kwargs.setdefault("metadata", {}).update(
{"model_group": mg}
) # update model_group used, if fallbacks are done
response = await self.async_function_with_retries(
*args, **kwargs
)
@@ -858,8 +906,10 @@ class Router:
f"Falling back to model_group = {mg}"
)
kwargs["model"] = mg
kwargs.setdefault("metadata", {}).update(
{"model_group": mg}
) # update model_group used, if fallbacks are done
response = await self.async_function_with_fallbacks(
*args, **kwargs
)
return response
@@ -1024,6 +1074,9 @@ class Router:
## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
kwargs["model"] = mg
kwargs.setdefault("metadata", {}).update(
{"model_group": mg}
) # update model_group used, if fallbacks are done
response = self.function_with_fallbacks(*args, **kwargs)
return response
except Exception as e:
@@ -1047,6 +1100,9 @@ class Router:
## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
kwargs["model"] = mg
kwargs.setdefault("metadata", {}).update(
{"model_group": mg}
) # update model_group used, if fallbacks are done
response = self.function_with_fallbacks(*args, **kwargs)
return response
except Exception as e:
@@ -1232,6 +1288,7 @@ class Router:
verbose_router_logger.debug(
f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
)
cooldown_time = self.cooldown_time or 1
if updated_fails > self.allowed_fails:
# get the current cooldown list for that minute
cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
@@ -1245,13 +1302,19 @@ class Router:
else:
cached_value = cached_value + [deployment]
# save updated value
self.cache.set_cache(
value=cached_value, key=cooldown_key, ttl=cooldown_time
)
except:
cached_value = [deployment]
# save updated value
self.cache.set_cache(
value=cached_value, key=cooldown_key, ttl=cooldown_time
)
else:
self.failed_calls.set_cache(
key=deployment, value=updated_fails, ttl=cooldown_time
)
def _get_cooldown_deployments(self):
"""
@@ -1344,6 +1407,7 @@ class Router:
max_retries = litellm.get_secret(max_retries_env_name)
litellm_params["max_retries"] = max_retries
# proxy support
import os
import httpx
@@ -1369,6 +1433,12 @@ class Router:
),
}
organization = litellm_params.get("organization", None)
if isinstance(organization, str) and organization.startswith("os.environ/"):
organization_env_name = organization.replace("os.environ/", "")
organization = litellm.get_secret(organization_env_name)
litellm_params["organization"] = organization
if "azure" in model_name: if "azure" in model_name:
if api_base is None: if api_base is None:
raise ValueError( raise ValueError(
@ -1576,6 +1646,7 @@ class Router:
base_url=api_base, base_url=api_base,
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
organization=organization,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(),
limits=httpx.Limits( limits=httpx.Limits(
@ -1597,6 +1668,7 @@ class Router:
base_url=api_base, base_url=api_base,
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
organization=organization,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(),
limits=httpx.Limits( limits=httpx.Limits(
@ -1619,6 +1691,7 @@ class Router:
base_url=api_base, base_url=api_base,
timeout=stream_timeout, timeout=stream_timeout,
max_retries=max_retries, max_retries=max_retries,
organization=organization,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(),
limits=httpx.Limits( limits=httpx.Limits(
@ -1641,6 +1714,7 @@ class Router:
base_url=api_base, base_url=api_base,
timeout=stream_timeout, timeout=stream_timeout,
max_retries=max_retries, max_retries=max_retries,
organization=organization,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(),
limits=httpx.Limits( limits=httpx.Limits(
@ -1865,6 +1939,9 @@ class Router:
selected_index = random.choices(range(len(rpms)), weights=weights)[0] selected_index = random.choices(range(len(rpms)), weights=weights)[0]
verbose_router_logger.debug(f"\n selected index, {selected_index}") verbose_router_logger.debug(f"\n selected index, {selected_index}")
deployment = healthy_deployments[selected_index] deployment = healthy_deployments[selected_index]
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {deployment or deployment[0]} for model: {model}"
)
return deployment or deployment[0]
############## Check if we can do a RPM/TPM based weighted pick #################
tpm = healthy_deployments[0].get("litellm_params").get("tpm", None)
@@ -1879,6 +1956,9 @@ class Router:
selected_index = random.choices(range(len(tpms)), weights=weights)[0]
verbose_router_logger.debug(f"\n selected index, {selected_index}")
deployment = healthy_deployments[selected_index]
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {deployment or deployment[0]} for model: {model}"
)
return deployment or deployment[0]
############## No RPM/TPM passed, we do a random pick #################
@@ -1903,8 +1983,13 @@ class Router:
)
if deployment is None:
verbose_router_logger.info(
f"get_available_deployment for model: {model}, No deployment available"
)
raise ValueError("No models available.")
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {deployment} for model: {model}"
)
return deployment
def flush_cache(self):
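A minimal sketch of constructing a Router with the parameters documented in the new __init__ docstring above. The model names, api_base and environment-variable names here are placeholders, not values taken from this PR.

import litellm

router = litellm.Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",  # model group callers request
            "litellm_params": {
                "model": "azure/chatgpt-v-2",
                "api_base": "https://example-resource.openai.azure.com/",
                "api_version": "2023-05-15",
                "api_key": "os.environ/AZURE_API_KEY",
            },
        },
    ],
    routing_strategy="simple-shuffle",
    num_retries=2,
    allowed_fails=3,      # failures per deployment before it is cooled down
    cooldown_time=30,     # seconds a failing deployment stays in cooldown
    set_verbose=True,
    debug_level="INFO",
)

response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
)
print(response)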

View file

@@ -10,6 +10,7 @@ import traceback
from litellm import token_counter
from litellm.caching import DualCache
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_router_logger
class LowestTPMLoggingHandler(CustomLogger):
@@ -130,6 +131,9 @@ class LowestTPMLoggingHandler(CustomLogger):
Returns a deployment with the lowest TPM/RPM usage.
"""
# get list of potential deployments
verbose_router_logger.debug(
f"get_available_deployments - Usage Based. model_group: {model_group}, healthy_deployments: {healthy_deployments}"
)
current_minute = datetime.now().strftime("%H-%M") current_minute = datetime.now().strftime("%H-%M")
tpm_key = f"{model_group}:tpm:{current_minute}" tpm_key = f"{model_group}:tpm:{current_minute}"
rpm_key = f"{model_group}:rpm:{current_minute}" rpm_key = f"{model_group}:rpm:{current_minute}"
@ -137,14 +141,31 @@ class LowestTPMLoggingHandler(CustomLogger):
tpm_dict = self.router_cache.get_cache(key=tpm_key) tpm_dict = self.router_cache.get_cache(key=tpm_key)
rpm_dict = self.router_cache.get_cache(key=rpm_key) rpm_dict = self.router_cache.get_cache(key=rpm_key)
verbose_router_logger.debug(
f"tpm_key={tpm_key}, tpm_dict: {tpm_dict}, rpm_dict: {rpm_dict}"
)
try:
input_tokens = token_counter(messages=messages, text=input)
except:
input_tokens = 0
# -----------------------
# Find lowest used model
# ----------------------
lowest_tpm = float("inf")
deployment = None
if tpm_dict is None: # base case - none of the deployments have been used
# Return the 1st deployment where deployment["tpm"] >= input_tokens
for deployment in healthy_deployments:
_deployment_tpm = (
deployment.get("tpm", None)
or deployment.get("litellm_params", {}).get("tpm", None)
or deployment.get("model_info", {}).get("tpm", None)
or float("inf")
)
if _deployment_tpm >= input_tokens:
return deployment
return None
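An illustration of the new base case with two hypothetical deployments (not taken from this diff): when no usage has been recorded yet, the first deployment whose TPM limit can absorb the estimated input tokens is returned.

# Hypothetical example: the first deployment is too small for the request.
healthy_deployments = [
    {"model_info": {"id": "small"}, "litellm_params": {"tpm": 100}},
    {"model_info": {"id": "large"}, "litellm_params": {"tpm": 100000}},
]
input_tokens = 5000  # pretend token_counter() returned this

for deployment in healthy_deployments:
    _deployment_tpm = (
        deployment.get("tpm", None)
        or deployment.get("litellm_params", {}).get("tpm", None)
        or deployment.get("model_info", {}).get("tpm", None)
        or float("inf")
    )
    if _deployment_tpm >= input_tokens:
        print(deployment["model_info"]["id"])  # -> "large"
        break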
all_deployments = tpm_dict all_deployments = tpm_dict
for d in healthy_deployments: for d in healthy_deployments:
@@ -152,11 +173,6 @@ class LowestTPMLoggingHandler(CustomLogger):
if d["model_info"]["id"] not in all_deployments: if d["model_info"]["id"] not in all_deployments:
all_deployments[d["model_info"]["id"]] = 0 all_deployments[d["model_info"]["id"]] = 0
try:
input_tokens = token_counter(messages=messages, text=input)
except:
input_tokens = 0
for item, item_tpm in all_deployments.items(): for item, item_tpm in all_deployments.items():
## get the item from model list ## get the item from model list
_deployment = None _deployment = None
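The selection loop is cut off by this hunk; a rough sketch of how a lowest-TPM pick over all_deployments could proceed, assuming it keeps the least-used deployment whose TPM limit still fits the request (an approximation, not litellm's exact code):

def pick_lowest_tpm(all_deployments, healthy_deployments, input_tokens):
    lowest_tpm = float("inf")
    chosen = None
    for item, item_tpm in all_deployments.items():
        # Map the deployment id back to its full entry in the healthy list.
        _deployment = next(
            (d for d in healthy_deployments if d["model_info"]["id"] == item), None
        )
        if _deployment is None:
            continue  # id is no longer in the healthy list, skip it
        _deployment_tpm = (
            _deployment.get("litellm_params", {}).get("tpm", None)
            or _deployment.get("model_info", {}).get("tpm", None)
            or float("inf")
        )
        # Skip deployments that would exceed their TPM limit with this request.
        if item_tpm + input_tokens > _deployment_tpm:
            continue
        if item_tpm < lowest_tpm:
            lowest_tpm = item_tpm
            chosen = _deployment
    return chosen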
View file
@@ -1,57 +0,0 @@
Starting new HTTPS connection (1): api.anthropic.com:443
Starting new HTTPS connection (1): litellm-logging.onrender.com:443
https://litellm-logging.onrender.com:443 "POST /logging HTTP/1.1" 200 38
https://api.anthropic.com:443 "POST /v1/complete HTTP/1.1" 200 None
Starting new HTTPS connection (1): litellm-logging.onrender.com:443
Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'user', 'content': 'this is a streaming test for llama2 + langfuse'}], 'model': 'gpt-3.5-turbo', 'max_tokens': 20, 'stream': True, 'temperature': 0.2}}
connect_tcp.started host='api.openai.com' port=443 local_address=None timeout=600.0 socket_options=None
connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x1090f92d0>
start_tls.started ssl_context=<ssl.SSLContext object at 0x108ddf020> server_hostname='api.openai.com' timeout=600.0
start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x1090f9290>
send_request_headers.started request=<Request [b'POST']>
send_request_headers.complete
send_request_body.started request=<Request [b'POST']>
send_request_body.complete
receive_response_headers.started request=<Request [b'POST']>
https://litellm-logging.onrender.com:443 "POST /logging HTTP/1.1" 200 38
receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 23 Dec 2023 06:33:00 GMT'), (b'Content-Type', b'text/event-stream'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-allow-origin', b'*'), (b'Cache-Control', b'no-cache, must-revalidate'), (b'openai-model', b'gpt-3.5-turbo-0613'), (b'openai-organization', b'reliablekeystest'), (b'openai-processing-ms', b'62'), (b'openai-version', b'2020-10-01'), (b'strict-transport-security', b'max-age=15724800; includeSubDomains'), (b'x-ratelimit-limit-requests', b'9000'), (b'x-ratelimit-limit-tokens', b'1000000'), (b'x-ratelimit-limit-tokens_usage_based', b'1000000'), (b'x-ratelimit-remaining-requests', b'8998'), (b'x-ratelimit-remaining-tokens', b'999967'), (b'x-ratelimit-remaining-tokens_usage_based', b'999967'), (b'x-ratelimit-reset-requests', b'6ms'), (b'x-ratelimit-reset-tokens', b'1ms'), (b'x-ratelimit-reset-tokens_usage_based', b'1ms'), (b'x-request-id', b'dd1029a85edecb986fb662945c9f7b4f'), (b'CF-Cache-Status', b'DYNAMIC'), (b'Set-Cookie', b'__cf_bm=dnuSnc6BPNJd4lgWKpv3iE2P5zy4r5aCVekXVi7HG7U-1703313180-1-AbeMpAfvmJ6BShULb7tMaErR5ergUrt6ohiXj1e8zoo9AotZ0Jz0alUSUcp8FXyQX2VQ9P6gBUeoSR9aE98OasU=; path=/; expires=Sat, 23-Dec-23 07:03:00 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Set-Cookie', b'_cfuvid=dET0GKSNfbtSWNJuXndP8GY8M0ANzDK4Dl7mvIfhmM0-1703313180257-0-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Server', b'cloudflare'), (b'CF-RAY', b'839e920e4f47f4b0-BOM'), (b'alt-svc', b'h3=":443"; ma=86400')])
HTTP Request: POST https://api.openai.com/v1/chat/completions "200 OK"
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
response_closed.started
response_closed.complete
Starting new HTTPS connection (1): litellm-logging.onrender.com:443
Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'user', 'content': "What's the weather like in San Francisco, Tokyo, and Paris?"}], 'model': 'gpt-3.5-turbo-1106', 'tool_choice': 'auto', 'tools': [{'type': 'function', 'function': {'name': 'get_current_weather', 'description': 'Get the current weather in a given location', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'}, 'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location']}}}]}}
connect_tcp.started host='api.openai.com' port=443 local_address=None timeout=600.0 socket_options=None
connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x10972d410>
start_tls.started ssl_context=<ssl.SSLContext object at 0x1090c5be0> server_hostname='api.openai.com' timeout=600.0
start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x1097547d0>
send_request_headers.started request=<Request [b'POST']>
send_request_headers.complete
send_request_body.started request=<Request [b'POST']>
send_request_body.complete
receive_response_headers.started request=<Request [b'POST']>
https://litellm-logging.onrender.com:443 "POST /logging HTTP/1.1" 200 38
receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 23 Dec 2023 06:33:03 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-allow-origin', b'*'), (b'Cache-Control', b'no-cache, must-revalidate'), (b'openai-model', b'gpt-3.5-turbo-1106'), (b'openai-organization', b'reliablekeystest'), (b'openai-processing-ms', b'2145'), (b'openai-version', b'2020-10-01'), (b'strict-transport-security', b'max-age=15724800; includeSubDomains'), (b'x-ratelimit-limit-requests', b'9000'), (b'x-ratelimit-limit-tokens', b'1000000'), (b'x-ratelimit-limit-tokens_usage_based', b'1000000'), (b'x-ratelimit-remaining-requests', b'8998'), (b'x-ratelimit-remaining-tokens', b'999968'), (b'x-ratelimit-remaining-tokens_usage_based', b'999968'), (b'x-ratelimit-reset-requests', b'6ms'), (b'x-ratelimit-reset-tokens', b'1ms'), (b'x-ratelimit-reset-tokens_usage_based', b'1ms'), (b'x-request-id', b'd0fd54d3a7696ee677f3690e9e0d6d04'), (b'CF-Cache-Status', b'DYNAMIC'), (b'Set-Cookie', b'__cf_bm=P_4fUmw4vvrbGKTlavf9VWuuzzro87gvhLE0DEGKA84-1703313183-1-ARgz+AQXAzH1uTTK8iyPE3QnT8TovAP61UvYsFD+d5DWM0lFi5U2+eSgPH+Pqt+Y1fNH1FWBUn9DmVceJKvyLcU=; path=/; expires=Sat, 23-Dec-23 07:03:03 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Set-Cookie', b'_cfuvid=g.nvBthte.6BJ7KHg5tihyGwupeGfMNMGnw72QUUBQc-1703313183034-0-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Server', b'cloudflare'), (b'CF-RAY', b'839e92128b7ff2e2-BOM'), (b'Content-Encoding', b'gzip'), (b'alt-svc', b'h3=":443"; ma=86400')])
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
response_closed.started
response_closed.complete
HTTP Request: POST https://api.openai.com/v1/chat/completions "200 OK"
nction': {'name': 'get_current_weather', 'description': 'Get the current weather in a given location', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'}, 'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location']}}}]}}
connect_tcp.started host='api.openai.com' port=443 local_address=None timeout=600.0 socket_options=None
connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x10972d410>
start_tls.started ssl_context=<ssl.SSLContext object at 0x1090c5be0> server_hostname='api.openai.com' timeout=600.0
start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x1097547d0>
send_request_headers.started request=<Request [b'POST']>
send_request_headers.complete
send_request_body.started request=<Request [b'POST']>
send_request_body.complete
receive_response_headers.started request=<Request [b'POST']>
https://litellm-logging.onrender.com:443 "POST /logging HTTP/1.1" 200 38
receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 23 Dec 2023 06:33:03 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-allow-origin', b'*'), (b'Cache-Control', b'no-cache, must-revalidate'), (b'openai-model', b'gpt-3.5-turbo-1106'), (b'openai-organization', b'reliablekeystest'), (b'openai-processing-ms', b'2145'), (b'openai-version', b'2020-10-01'), (b'strict-transport-security', b'max-age=15724800; includeSubDomains'), (b'x-ratelimit-limit-requests', b'9000'), (b'x-ratelimit-limit-tokens', b'1000000'), (b'x-ratelimit-limit-tokens_usage_based', b'1000000'), (b'x-ratelimit-remaining-requests', b'8998'), (b'x-ratelimit-remaining-tokens', b'999968'), (b'x-ratelimit-remaining-tokens_usage_based', b'999968'), (b'x-ratelimit-reset-requests', b'6ms'), (b'x-ratelimit-reset-tokens', b'1ms'), (b'x-ratelimit-reset-tokens_usage_based', b'1ms'), (b'x-request-id', b'd0fd54d3a7696ee677f3690e9e0d6d04'), (b'CF-Cache-Status', b'DYNAMIC'), (b'Set-Cookie', b'__cf_bm=P_4fUmw4vvrbGKTlavf9VWuuzzro87gvhLE0DEGKA84-1703313183-1-ARgz+AQXAzH1uTTK8iyPE3QnT8TovAP61UvYsFD+d5DWM0lFi5U2+eSgPH+Pqt+Y1fNH1FWBUn9DmVceJKvyLcU=; path=/; expires=Sat, 23-Dec-23 07:03:03 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Set-Cookie', b'_cfuvid=g.nvBthte.6BJ7KHg5tihyGwupeGfMNMGnw72QUUBQc-1703313183034-0-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Server', b'cloudflare'), (b'CF-RAY', b'839e92128b7ff2e2-BOM'), (b'Content-Encoding', b'gzip'), (b'alt-svc', b'h3=":443"; ma=86400')])
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
response_closed.started
response_closed.complete
HTTP Request: POST https://api.openai.com/v1/chat/completions "200 OK"
Some files were not shown because too many files have changed in this diff.