Merge branch 'main' into litellm_http_proxy_support
|
@ -42,6 +42,7 @@ jobs:
|
||||||
pip install "anyio==3.7.1"
|
pip install "anyio==3.7.1"
|
||||||
pip install "aiodynamo==23.10.1"
|
pip install "aiodynamo==23.10.1"
|
||||||
pip install "asyncio==3.4.3"
|
pip install "asyncio==3.4.3"
|
||||||
|
pip install "apscheduler==3.10.4"
|
||||||
pip install "PyGithub==1.59.1"
|
pip install "PyGithub==1.59.1"
|
||||||
- save_cache:
|
- save_cache:
|
||||||
paths:
|
paths:
|
||||||
|
@ -97,6 +98,43 @@ jobs:
|
||||||
command: |
|
command: |
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
|
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
|
||||||
|
- run:
|
||||||
|
name: Install Python 3.9
|
||||||
|
command: |
|
||||||
|
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
|
||||||
|
bash miniconda.sh -b -p $HOME/miniconda
|
||||||
|
export PATH="$HOME/miniconda/bin:$PATH"
|
||||||
|
conda init bash
|
||||||
|
source ~/.bashrc
|
||||||
|
conda create -n myenv python=3.9 -y
|
||||||
|
conda activate myenv
|
||||||
|
python --version
|
||||||
|
- run:
|
||||||
|
name: Install Dependencies
|
||||||
|
command: |
|
||||||
|
pip install "pytest==7.3.1"
|
||||||
|
pip install "pytest-asyncio==0.21.1"
|
||||||
|
pip install aiohttp
|
||||||
|
pip install openai
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
python -m pip install -r .circleci/requirements.txt
|
||||||
|
pip install "pytest==7.3.1"
|
||||||
|
pip install "pytest-asyncio==0.21.1"
|
||||||
|
pip install mypy
|
||||||
|
pip install "google-generativeai>=0.3.2"
|
||||||
|
pip install "google-cloud-aiplatform>=1.38.0"
|
||||||
|
pip install "boto3>=1.28.57"
|
||||||
|
pip install langchain
|
||||||
|
pip install "langfuse>=2.0.0"
|
||||||
|
pip install numpydoc
|
||||||
|
pip install prisma
|
||||||
|
pip install "httpx==0.24.1"
|
||||||
|
pip install "gunicorn==21.2.0"
|
||||||
|
pip install "anyio==3.7.1"
|
||||||
|
pip install "aiodynamo==23.10.1"
|
||||||
|
pip install "asyncio==3.4.3"
|
||||||
|
pip install "PyGithub==1.59.1"
|
||||||
|
# Run pytest and generate JUnit XML report
|
||||||
- run:
|
- run:
|
||||||
name: Build Docker image
|
name: Build Docker image
|
||||||
command: docker build -t my-app:latest -f Dockerfile.database .
|
command: docker build -t my-app:latest -f Dockerfile.database .
|
||||||
|
@ -106,15 +144,20 @@ jobs:
|
||||||
docker run -d \
|
docker run -d \
|
||||||
-p 4000:4000 \
|
-p 4000:4000 \
|
||||||
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
|
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
|
||||||
-e AZURE_API_KEY=$AZURE_FRANCE_API_KEY \
|
-e AZURE_API_KEY=$AZURE_API_KEY \
|
||||||
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
|
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
|
||||||
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
|
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
|
||||||
|
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
|
||||||
|
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
|
||||||
|
-e AWS_REGION_NAME=$AWS_REGION_NAME \
|
||||||
--name my-app \
|
--name my-app \
|
||||||
-v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
|
-v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
|
||||||
my-app:latest \
|
my-app:latest \
|
||||||
--config /app/config.yaml \
|
--config /app/config.yaml \
|
||||||
--port 4000 \
|
--port 4000 \
|
||||||
--num_workers 8
|
--num_workers 8 \
|
||||||
|
--detailed_debug \
|
||||||
|
--run_gunicorn \
|
||||||
- run:
|
- run:
|
||||||
name: Install curl and dockerize
|
name: Install curl and dockerize
|
||||||
command: |
|
command: |
|
||||||
|
@ -125,63 +168,22 @@ jobs:
|
||||||
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
|
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
|
||||||
- run:
|
- run:
|
||||||
name: Start outputting logs
|
name: Start outputting logs
|
||||||
command: |
|
command: docker logs -f my-app
|
||||||
while true; do
|
|
||||||
docker logs my-app
|
|
||||||
sleep 10
|
|
||||||
done
|
|
||||||
background: true
|
background: true
|
||||||
- run:
|
- run:
|
||||||
name: Wait for app to be ready
|
name: Wait for app to be ready
|
||||||
command: dockerize -wait http://localhost:4000 -timeout 1m
|
command: dockerize -wait http://localhost:4000 -timeout 1m
|
||||||
- run:
|
- run:
|
||||||
name: Test the application
|
name: Run tests
|
||||||
command: |
|
command: |
|
||||||
mkdir -p /tmp/responses
|
pwd
|
||||||
for i in {1..10}; do
|
ls
|
||||||
status_file="/tmp/responses/status_${i}.txt"
|
python -m pytest -vv tests/ -x --junitxml=test-results/junit.xml --durations=5
|
||||||
response_file="/tmp/responses/response_${i}.json"
|
no_output_timeout: 120m
|
||||||
|
|
||||||
(curl --location --request POST 'http://0.0.0.0:4000/key/generate' \
|
# Store test results
|
||||||
--header 'Authorization: Bearer sk-1234' \
|
- store_test_results:
|
||||||
--header 'Content-Type: application/json' \
|
path: test-results
|
||||||
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}' \
|
|
||||||
--silent --output "${response_file}" --write-out '%{http_code}' > "${status_file}") &
|
|
||||||
|
|
||||||
# Capture PIDs of background processes
|
|
||||||
pids[${i}]=$!
|
|
||||||
done
|
|
||||||
|
|
||||||
# Wait for all background processes to finish
|
|
||||||
for pid in ${pids[*]}; do
|
|
||||||
wait $pid
|
|
||||||
done
|
|
||||||
|
|
||||||
# Check all responses and status codes
|
|
||||||
fail=false
|
|
||||||
for i in {1..10}; do
|
|
||||||
status=$(cat "/tmp/responses/status_${i}.txt")
|
|
||||||
|
|
||||||
# Here, we need to set the correct response file path for each iteration
|
|
||||||
response_file="/tmp/responses/response_${i}.json" # This was missing in the provided script
|
|
||||||
|
|
||||||
response=$(cat "${response_file}")
|
|
||||||
echo "Response ${i} (Status code: ${status}):"
|
|
||||||
echo "${response}" # Use echo here to print the contents
|
|
||||||
echo # Additional newline for readability
|
|
||||||
|
|
||||||
if [ "$status" -ne 200 ]; then
|
|
||||||
echo "A request did not return a 200 status code: $status"
|
|
||||||
fail=true
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
# If any request did not return status code 200, fail the job
|
|
||||||
if [ "$fail" = true ]; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "All requests returned a 200 status code."
|
|
||||||
|
|
||||||
publish_to_pypi:
|
publish_to_pypi:
|
||||||
docker:
|
docker:
|
||||||
|
|
33
.github/workflows/ghcr_deploy.yml
vendored
|
@ -41,6 +41,7 @@ jobs:
|
||||||
push: true
|
push: true
|
||||||
file: Dockerfile.database
|
file: Dockerfile.database
|
||||||
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
|
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
|
||||||
|
|
||||||
build-and-push-image:
|
build-and-push-image:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
|
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
|
||||||
|
@ -74,7 +75,9 @@ jobs:
|
||||||
push: true
|
push: true
|
||||||
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-latest # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
|
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-latest # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
build-and-push-image-alpine:
|
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
|
||||||
|
|
||||||
|
build-and-push-image-ui:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
|
@ -90,20 +93,21 @@ jobs:
|
||||||
username: ${{ github.actor }}
|
username: ${{ github.actor }}
|
||||||
password: ${{ secrets.GITHUB_TOKEN }}
|
password: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
- name: Extract metadata (tags, labels) for Alpine Dockerfile
|
- name: Extract metadata (tags, labels) for UI Dockerfile
|
||||||
id: meta-alpine
|
id: meta-ui
|
||||||
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
|
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
|
||||||
with:
|
with:
|
||||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-alpine
|
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-ui
|
||||||
|
|
||||||
- name: Build and push Alpine Docker image
|
- name: Build and push UI Docker image
|
||||||
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
|
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
|
||||||
with:
|
with:
|
||||||
context: .
|
context: ui/
|
||||||
file: Dockerfile.alpine
|
file: ui/Dockerfile
|
||||||
push: true
|
push: true
|
||||||
tags: ${{ steps.meta-alpine.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-alpine.outputs.tags }}-latest
|
tags: ${{ steps.meta-ui.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-ui.outputs.tags }}-latest
|
||||||
labels: ${{ steps.meta-alpine.outputs.labels }}
|
labels: ${{ steps.meta-ui.outputs.labels }}
|
||||||
|
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
|
||||||
build-and-push-image-database:
|
build-and-push-image-database:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
permissions:
|
permissions:
|
||||||
|
@ -168,3 +172,14 @@ jobs:
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
core.setFailed(error.message);
|
core.setFailed(error.message);
|
||||||
}
|
}
|
||||||
|
- name: Github Releases To Discord
|
||||||
|
uses: SethCohen/github-releases-to-discord@v1.13.1
|
||||||
|
with:
|
||||||
|
webhook_url: ${{ secrets.WEBHOOK_URL }}
|
||||||
|
color: "2105893"
|
||||||
|
username: "Release Changelog"
|
||||||
|
avatar_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
|
||||||
|
content: "||@everyone||"
|
||||||
|
footer_title: "Changelog"
|
||||||
|
footer_icon_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
|
||||||
|
footer_timestamp: true
|
5
.gitignore
vendored
|
@ -35,3 +35,8 @@ hosted_config.yaml
|
||||||
litellm/proxy/tests/node_modules
|
litellm/proxy/tests/node_modules
|
||||||
litellm/proxy/tests/package.json
|
litellm/proxy/tests/package.json
|
||||||
litellm/proxy/tests/package-lock.json
|
litellm/proxy/tests/package-lock.json
|
||||||
|
ui/litellm-dashboard/.next
|
||||||
|
ui/litellm-dashboard/node_modules
|
||||||
|
ui/litellm-dashboard/next-env.d.ts
|
||||||
|
ui/litellm-dashboard/package.json
|
||||||
|
ui/litellm-dashboard/package-lock.json
|
|
@ -52,4 +52,4 @@ RUN chmod +x entrypoint.sh
|
||||||
EXPOSE 4000/tcp
|
EXPOSE 4000/tcp
|
||||||
|
|
||||||
ENTRYPOINT ["litellm"]
|
ENTRYPOINT ["litellm"]
|
||||||
CMD ["--port", "4000"]
|
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug", "--run_gunicorn"]
|
|
@ -56,4 +56,4 @@ EXPOSE 4000/tcp
|
||||||
# # Set your entrypoint and command
|
# # Set your entrypoint and command
|
||||||
|
|
||||||
ENTRYPOINT ["litellm"]
|
ENTRYPOINT ["litellm"]
|
||||||
CMD ["--port", "4000"]
|
CMD ["--port", "4000", "--run_gunicorn"]
|
||||||
|
|
34
cookbook/misc/openai_timeouts.py
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
import os
|
||||||
|
from openai import OpenAI
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import httpx
|
||||||
|
import concurrent.futures
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
# Module-level OpenAI client shared by the rest of this script; the API key is
# read from the OPENAI_API_KEY environment variable (load_dotenv() was called above).
client = OpenAI(
|
||||||
|
# This is the default and can be omitted
|
||||||
|
api_key=os.environ.get("OPENAI_API_KEY"),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Sends one fixed user message to the "gpt-3.5-turbo" model through the
# module-level `client` and returns whatever `client.chat.completions.create`
# returns. Takes no arguments.
def create_chat_completion():
|
||||||
|
return client.chat.completions.create(
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Say this is a test. Respond in 20 lines",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the completion call on a worker thread and wait for its result with a timeout.
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||||
|
# Submit the call; the wait below uses timeout=0.00001 seconds (not 10 seconds),
# presumably so that the TimeoutError branch is the one that runs.
|
||||||
|
future = executor.submit(create_chat_completion)
|
||||||
|
try:
|
||||||
|
chat_completion = future.result(timeout=0.00001)
|
||||||
|
print(chat_completion)
|
||||||
|
except concurrent.futures.TimeoutError:
|
||||||
|
# NOTE: only the wait is abandoned here; the submitted call itself is not cancelled.
print("Operation timed out.")
|
61
cookbook/misc/sagmaker_streaming.py
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
# Notes - on how to do sagemaker streaming using boto3
|
||||||
|
import json
|
||||||
|
import boto3
|
||||||
|
|
||||||
|
import sys, os
|
||||||
|
import traceback
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
import os, io
|
||||||
|
|
||||||
|
sys.path.insert(
|
||||||
|
0, os.path.abspath("../..")
|
||||||
|
) # Adds the parent directory to the system path
|
||||||
|
import pytest
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
class TokenIterator:
    """Iterate over token texts parsed from a streamed endpoint response.

    ``stream`` is any iterable of event dicts shaped like
    ``{"PayloadPart": {"Bytes": b"..."}}``. The byte payloads are appended to an
    internal buffer and consumed one newline-terminated line at a time. Each
    non-blank line is expected to be a JSON object, optionally prefixed with
    ``data:``, that contains ``["token"]["text"]``; that text is what each
    iteration step yields.

    Raises:
        StopIteration: when ``stream`` is exhausted and no complete line is left.
        json.JSONDecodeError: when a non-blank line is not valid JSON.
        KeyError: when an event lacks ``PayloadPart``/``Bytes`` or a line lacks
            ``token``/``text``.
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        # Accumulates raw bytes; a line may be split across several events.
        self.buffer = io.BytesIO()
        # Offset of the first byte that has not been consumed yet.
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line and line[-1] == ord("\n"):
                # Advance by exactly the bytes consumed. Blank separator lines
                # are skipped explicitly below instead of assuming one always
                # follows (the original blindly skipped one extra byte).
                self.read_pos += len(line)
                text = line[:-1].decode("utf-8").strip()
                if not text:
                    continue
                # Remove the literal "data:" prefix. str.lstrip("data:") strips a
                # character set rather than a prefix, and rstrip("/n") was a typo
                # for a newline that could eat legitimate trailing characters.
                if text.startswith("data:"):
                    text = text[len("data:"):]
                line_data = json.loads(text)
                return line_data["token"]["text"]
            # No complete line buffered yet: pull the next event (StopIteration
            # from the underlying iterator ends this iterator too) and append
            # its bytes at the end of the buffer.
            chunk = next(self.byte_iterator)
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])
|
||||||
|
|
||||||
|
|
||||||
|
# Request body that is JSON-encoded and sent to the endpoint below; it carries
# the prompt, a max_new_tokens generation parameter and a "stream": True flag.
payload = {
|
||||||
|
"inputs": "How do I build a website?",
|
||||||
|
"parameters": {"max_new_tokens": 256},
|
||||||
|
"stream": True,
|
||||||
|
}
|
||||||
|
|
||||||
|
import boto3
|
||||||
|
|
||||||
|
# SageMaker runtime client pinned to the us-west-2 region.
client = boto3.client("sagemaker-runtime", region_name="us-west-2")
|
||||||
|
# Call the streaming invoke API on a hard-coded endpoint name; `response["Body"]`
# is what the commented-out TokenIterator loop below would consume.
response = client.invoke_endpoint_with_response_stream(
|
||||||
|
EndpointName="berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||||
|
Body=json.dumps(payload),
|
||||||
|
ContentType="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
|
# for token in TokenIterator(response["Body"]):
|
||||||
|
# print(token)
|
|
@ -1,12 +0,0 @@
|
||||||
version: "3.9"
|
|
||||||
services:
|
|
||||||
litellm:
|
|
||||||
image: ghcr.io/berriai/litellm:main
|
|
||||||
ports:
|
|
||||||
- "8000:8000" # Map the container port to the host, change the host port if necessary
|
|
||||||
volumes:
|
|
||||||
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
|
|
||||||
# You can change the port or number of workers as per your requirements or pass any new supported CLI augument. Make sure the port passed here matches with the container port defined above in `ports` value
|
|
||||||
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
|
|
||||||
|
|
||||||
# ...rest of your docker-compose config if any
|
|
15
docker-compose.yml
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
version: "3.9"
|
||||||
|
services:
|
||||||
|
litellm:
|
||||||
|
image: ghcr.io/berriai/litellm:main-latest
|
||||||
|
volumes:
|
||||||
|
- ./proxy_server_config.yaml:/app/proxy_server_config.yaml # mount your litellm config.yaml
|
||||||
|
ports:
|
||||||
|
- "4000:4000"
|
||||||
|
environment:
|
||||||
|
- AZURE_API_KEY=sk-123
|
||||||
|
litellm-ui:
|
||||||
|
image: ghcr.io/berriai/litellm-ui:main-latest
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -13,8 +13,8 @@ response = embedding(model='text-embedding-ada-002', input=["good morning from l
|
||||||
|
|
||||||
- `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`
|
- `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`
|
||||||
|
|
||||||
- `input`: *array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
|
- `input`: *string or array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
|
||||||
```
|
```python
|
||||||
input=["good morning from litellm"]
|
input=["good morning from litellm"]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -22,7 +22,11 @@ input=["good morning from litellm"]
|
||||||
|
|
||||||
- `user`: *string (optional)* A unique identifier representing your end-user,
|
- `user`: *string (optional)* A unique identifier representing your end-user,
|
||||||
|
|
||||||
- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
|
- `dimensions`: *integer (Optional)* The number of dimensions the resulting output embeddings should have. Only supported in OpenAI/Azure text-embedding-3 and later models.
|
||||||
|
|
||||||
|
- `encoding_format`: *string (Optional)* The format to return the embeddings in. Can be either `"float"` or `"base64"`. Defaults to `encoding_format="float"`
|
||||||
|
|
||||||
|
- `timeout`: *integer (Optional)* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
|
||||||
|
|
||||||
- `api_base`: *string (optional)* - The api endpoint you want to call the model with
|
- `api_base`: *string (optional)* - The api endpoint you want to call the model with
|
||||||
|
|
||||||
|
@ -66,11 +70,18 @@ input=["good morning from litellm"]
|
||||||
from litellm import embedding
|
from litellm import embedding
|
||||||
import os
|
import os
|
||||||
os.environ['OPENAI_API_KEY'] = ""
|
os.environ['OPENAI_API_KEY'] = ""
|
||||||
response = embedding('text-embedding-ada-002', input=["good morning from litellm"])
|
response = embedding(
|
||||||
|
model="text-embedding-3-small",
|
||||||
|
input=["good morning from litellm", "this is another item"],
|
||||||
|
metadata={"anything": "good day"},
|
||||||
|
dimensions=5 # Only supported in text-embedding-3 and later models.
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
| Model Name | Function Call | Required OS Variables |
|
| Model Name | Function Call | Required OS Variables |
|
||||||
|----------------------|---------------------------------------------|--------------------------------------|
|
|----------------------|---------------------------------------------|--------------------------------------|
|
||||||
|
| text-embedding-3-small | `embedding('text-embedding-3-small', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||||
|
| text-embedding-3-large | `embedding('text-embedding-3-large', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||||
| text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` |
|
| text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||||
|
|
||||||
## Azure OpenAI Embedding Models
|
## Azure OpenAI Embedding Models
|
||||||
|
|
|
@ -28,6 +28,8 @@ import litellm
|
||||||
import os
|
import os
|
||||||
|
|
||||||
os.environ["LANGSMITH_API_KEY"] = ""
|
os.environ["LANGSMITH_API_KEY"] = ""
|
||||||
|
os.environ["LANGSMITH_PROJECT"] = "" # defaults to litellm-completion
|
||||||
|
os.environ["LANGSMITH_DEFAULT_RUN_NAME"] = "" # defaults to LLMRun
|
||||||
# LLM API Keys
|
# LLM API Keys
|
||||||
os.environ['OPENAI_API_KEY']=""
|
os.environ['OPENAI_API_KEY']=""
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
# Gemini-Pro
|
# Gemini-Pro
|
||||||
## Sample Usage
|
## Sample Usage
|
||||||
```python
|
```python
|
||||||
import litellm
|
from litellm import completion
|
||||||
import os
|
import os
|
||||||
|
|
||||||
os.environ['GEMINI_API_KEY'] = ""
|
os.environ['GEMINI_API_KEY'] = ""
|
||||||
|
@ -24,7 +24,7 @@ LiteLLM Supports the following image types passed in `url`
|
||||||
## Sample Usage
|
## Sample Usage
|
||||||
```python
|
```python
|
||||||
import os
|
import os
|
||||||
import litellm
|
import litellm
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
# Load the environment variables from .env file
|
# Load the environment variables from .env file
|
||||||
|
|
|
@ -34,6 +34,7 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
|
||||||
|
|
||||||
| Model Name | Function Call |
|
| Model Name | Function Call |
|
||||||
|-----------------------|-----------------------------------------------------------------|
|
|-----------------------|-----------------------------------------------------------------|
|
||||||
|
| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
|
||||||
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
|
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
|
||||||
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
|
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
|
||||||
| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` |
|
| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` |
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
|
|
||||||
## Sample Usage
|
## Sample Usage
|
||||||
```python
|
```python
|
||||||
import litellm
|
from litellm import completion
|
||||||
import os
|
import os
|
||||||
|
|
||||||
os.environ['PALM_API_KEY'] = ""
|
os.environ['PALM_API_KEY'] = ""
|
||||||
|
@ -17,7 +17,7 @@ response = completion(
|
||||||
|
|
||||||
## Sample Usage - Streaming
|
## Sample Usage - Streaming
|
||||||
```python
|
```python
|
||||||
import litellm
|
from litellm import completion
|
||||||
import os
|
import os
|
||||||
|
|
||||||
os.environ['PALM_API_KEY'] = ""
|
os.environ['PALM_API_KEY'] = ""
|
||||||
|
|
|
@ -17,7 +17,28 @@ import litellm
|
||||||
litellm.vertex_project = "hardy-device-38811" # Your Project ID
|
litellm.vertex_project = "hardy-device-38811" # Your Project ID
|
||||||
litellm.vertex_location = "us-central1" # proj location
|
litellm.vertex_location = "us-central1" # proj location
|
||||||
|
|
||||||
response = completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
|
response = litellm.completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
|
||||||
|
```
|
||||||
|
|
||||||
|
## OpenAI Proxy Usage
|
||||||
|
|
||||||
|
1. Modify the config.yaml
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
litellm_settings:
|
||||||
|
vertex_project: "hardy-device-38811" # Your Project ID
|
||||||
|
vertex_location: "us-central1" # proj location
|
||||||
|
|
||||||
|
model_list:
|
||||||
|
- model_name: team1-gemini-pro
|
||||||
|
litellm_params:
|
||||||
|
model: gemini-pro
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Start the proxy
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ litellm --config /path/to/config.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
## Set Vertex Project & Vertex Location
|
## Set Vertex Project & Vertex Location
|
||||||
|
|
|
@ -11,7 +11,7 @@ pip install litellm vllm
|
||||||
```python
|
```python
|
||||||
import litellm
|
import litellm
|
||||||
|
|
||||||
response = completion(
|
response = litellm.completion(
|
||||||
model="vllm/facebook/opt-125m", # add a vllm prefix so litellm knows the custom_llm_provider==vllm
|
model="vllm/facebook/opt-125m", # add a vllm prefix so litellm knows the custom_llm_provider==vllm
|
||||||
messages=messages,
|
messages=messages,
|
||||||
temperature=0.2,
|
temperature=0.2,
|
||||||
|
@ -29,7 +29,7 @@ In order to use litellm to call a hosted vllm server add the following to your c
|
||||||
```python
|
```python
|
||||||
import litellm
|
import litellm
|
||||||
|
|
||||||
response = completion(
|
response = litellm.completion(
|
||||||
model="openai/facebook/opt-125m", # pass the vllm model name
|
model="openai/facebook/opt-125m", # pass the vllm model name
|
||||||
messages=messages,
|
messages=messages,
|
||||||
api_base="https://hosted-vllm-api.co",
|
api_base="https://hosted-vllm-api.co",
|
||||||
|
|
|
@ -1,6 +1,13 @@
|
||||||
# Slack Alerting
|
# Slack Alerting
|
||||||
|
|
||||||
Get alerts for failed db read/writes, hanging api calls, failed api calls.
|
Get alerts for:
|
||||||
|
- hanging LLM api calls
|
||||||
|
- failed LLM api calls
|
||||||
|
- slow LLM api calls
|
||||||
|
- budget tracking per key/user:
|
||||||
|
- When a User/Key crosses their Budget
|
||||||
|
- When a User/Key is 15% away from crossing their Budget
|
||||||
|
- failed db read/writes
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Modify Incoming Data
|
# Modify / Reject Incoming Requests
|
||||||
|
|
||||||
Modify data just before making litellm completion calls on the proxy
|
Modify data just before making litellm completion calls on the proxy
|
||||||
|
|
||||||
|
|
|
@ -22,18 +22,22 @@ Set a model alias for your deployments.
|
||||||
|
|
||||||
In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment.
|
In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment.
|
||||||
|
|
||||||
In the config below requests with:
|
In the config below:
|
||||||
|
- `model_name`: the name to pass TO litellm from the external client
|
||||||
|
- `litellm_params.model`: the model string passed to the litellm.completion() function
|
||||||
|
|
||||||
|
E.g.:
|
||||||
- `model=vllm-models` will route to `openai/facebook/opt-125m`.
|
- `model=vllm-models` will route to `openai/facebook/opt-125m`.
|
||||||
- `model=gpt-3.5-turbo` will load balance between `azure/gpt-turbo-small-eu` and `azure/gpt-turbo-small-ca`
|
- `model=gpt-3.5-turbo` will load balance between `azure/gpt-turbo-small-eu` and `azure/gpt-turbo-small-ca`
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
model_list:
|
model_list:
|
||||||
- model_name: gpt-3.5-turbo # user-facing model alias
|
- model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ###
|
||||||
litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
|
litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
|
||||||
model: azure/gpt-turbo-small-eu
|
model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ###
|
||||||
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
|
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
|
||||||
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
|
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
|
||||||
rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
|
rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
|
||||||
- model_name: bedrock-claude-v1
|
- model_name: bedrock-claude-v1
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: bedrock/anthropic.claude-instant-v1
|
model: bedrock/anthropic.claude-instant-v1
|
||||||
|
@ -43,6 +47,11 @@ model_list:
|
||||||
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
|
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
|
||||||
api_key: "os.environ/AZURE_API_KEY_CA"
|
api_key: "os.environ/AZURE_API_KEY_CA"
|
||||||
rpm: 6
|
rpm: 6
|
||||||
|
- model_name: anthropic-claude
|
||||||
|
litellm_params:
|
||||||
|
model: bedrock/anthropic.claude-instant-v1
|
||||||
|
### [OPTIONAL] SET AWS REGION ###
|
||||||
|
aws_region_name: us-east-1
|
||||||
- model_name: vllm-models
|
- model_name: vllm-models
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
|
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
|
||||||
|
@ -58,6 +67,11 @@ litellm_settings: # module level litellm settings - https://github.com/BerriAI/l
|
||||||
general_settings:
|
general_settings:
|
||||||
master_key: sk-1234 # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
|
master_key: sk-1234 # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
|
||||||
```
|
```
|
||||||
|
:::info
|
||||||
|
|
||||||
|
For more provider-specific info, [go here](../providers/)
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
#### Step 2: Start Proxy with config
|
#### Step 2: Start Proxy with config
|
||||||
|
|
||||||
|
@ -188,7 +202,7 @@ print(response)
|
||||||
</Tabs>
|
</Tabs>
|
||||||
|
|
||||||
|
|
||||||
## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Headers etc.)
|
## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
|
||||||
You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
|
You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
|
||||||
|
|
||||||
[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
|
[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
|
||||||
|
@ -210,6 +224,12 @@ model_list:
|
||||||
api_key: sk-123
|
api_key: sk-123
|
||||||
api_base: https://openai-gpt-4-test-v-2.openai.azure.com/
|
api_base: https://openai-gpt-4-test-v-2.openai.azure.com/
|
||||||
temperature: 0.2
|
temperature: 0.2
|
||||||
|
- model_name: openai-gpt-3.5
|
||||||
|
litellm_params:
|
||||||
|
model: openai/gpt-3.5-turbo
|
||||||
|
api_key: sk-123
|
||||||
|
organization: org-ikDc4ex8NB
|
||||||
|
temperature: 0.2
|
||||||
- model_name: mistral-7b
|
- model_name: mistral-7b
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: ollama/mistral
|
model: ollama/mistral
|
||||||
|
@ -483,3 +503,55 @@ general_settings:
|
||||||
max_parallel_requests: 100 # max parallel requests for a user = 100
|
max_parallel_requests: 100 # max parallel requests for a user = 100
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## All settings
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"environment_variables": {},
|
||||||
|
"model_list": [
|
||||||
|
{
|
||||||
|
"model_name": "string",
|
||||||
|
"litellm_params": {},
|
||||||
|
"model_info": {
|
||||||
|
"id": "string",
|
||||||
|
"mode": "embedding",
|
||||||
|
"input_cost_per_token": 0,
|
||||||
|
"output_cost_per_token": 0,
|
||||||
|
"max_tokens": 2048,
|
||||||
|
"base_model": "gpt-4-1106-preview",
|
||||||
|
"additionalProp1": {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"litellm_settings": {}, # ALL (https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py)
|
||||||
|
"general_settings": {
|
||||||
|
"completion_model": "string",
|
||||||
|
"key_management_system": "google_kms", # either google_kms or azure_kms
|
||||||
|
"master_key": "string",
|
||||||
|
"database_url": "string",
|
||||||
|
"database_type": "dynamo_db",
|
||||||
|
"database_args": {
|
||||||
|
"billing_mode": "PROVISIONED_THROUGHPUT",
|
||||||
|
"read_capacity_units": 0,
|
||||||
|
"write_capacity_units": 0,
|
||||||
|
"ssl_verify": true,
|
||||||
|
"region_name": "string",
|
||||||
|
"user_table_name": "LiteLLM_UserTable",
|
||||||
|
"key_table_name": "LiteLLM_VerificationToken",
|
||||||
|
"config_table_name": "LiteLLM_Config",
|
||||||
|
"spend_table_name": "LiteLLM_SpendLogs"
|
||||||
|
},
|
||||||
|
"otel": true,
|
||||||
|
"custom_auth": "string",
|
||||||
|
"max_parallel_requests": 0,
|
||||||
|
"infer_model_from_keys": true,
|
||||||
|
"background_health_checks": true,
|
||||||
|
"health_check_interval": 300,
|
||||||
|
"alerting": [
|
||||||
|
"string"
|
||||||
|
],
|
||||||
|
"alerting_threshold": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
115
docs/my-website/docs/proxy/custom_pricing.md
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
import Image from '@theme/IdealImage';
|
||||||
|
|
||||||
|
# Custom Pricing - Sagemaker, etc.
|
||||||
|
|
||||||
|
Use this to register custom pricing for models.
|
||||||
|
|
||||||
|
There are 2 ways to track cost:
|
||||||
|
- cost per token
|
||||||
|
- cost per second
|
||||||
|
|
||||||
|
By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async). [**Learn More**](../observability/custom_callback.md)
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
Register custom pricing for sagemaker completion model.
|
||||||
|
|
||||||
|
For cost per second pricing, you **just** need to register `input_cost_per_second`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# !pip install boto3
|
||||||
|
from litellm import completion, completion_cost
|
||||||
|
|
||||||
|
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||||
|
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||||
|
os.environ["AWS_REGION_NAME"] = ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_completion_sagemaker():
|
||||||
|
try:
|
||||||
|
print("testing sagemaker")
|
||||||
|
response = completion(
|
||||||
|
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||||
|
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||||
|
input_cost_per_second=0.000420,
|
||||||
|
)
|
||||||
|
# Add any assertions here to check the response
|
||||||
|
print(response)
|
||||||
|
cost = completion_cost(completion_response=response)
|
||||||
|
print(cost)
|
||||||
|
except Exception as e:
|
||||||
|
raise Exception(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Usage with OpenAI Proxy Server
|
||||||
|
|
||||||
|
**Step 1: Add pricing to config.yaml**
|
||||||
|
```yaml
|
||||||
|
model_list:
|
||||||
|
- model_name: sagemaker-completion-model
|
||||||
|
litellm_params:
|
||||||
|
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
|
||||||
|
input_cost_per_second: 0.000420
|
||||||
|
- model_name: sagemaker-embedding-model
|
||||||
|
litellm_params:
|
||||||
|
model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
|
||||||
|
input_cost_per_second: 0.000420
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Start proxy**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
litellm /path/to/config.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: View Spend Logs**
|
||||||
|
|
||||||
|
<Image img={require('../../img/spend_logs_table.png')} />
|
||||||
|
|
||||||
|
## Cost Per Token (e.g. Azure)
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
# !pip install boto3
|
||||||
|
from litellm import completion, completion_cost
|
||||||
|
|
||||||
|
## set ENV variables
|
||||||
|
os.environ["AZURE_API_KEY"] = ""
|
||||||
|
os.environ["AZURE_API_BASE"] = ""
|
||||||
|
os.environ["AZURE_API_VERSION"] = ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_completion_azure_model():
|
||||||
|
try:
|
||||||
|
print("testing azure custom pricing")
|
||||||
|
# azure call
|
||||||
|
response = completion(
|
||||||
|
model = "azure/<your_deployment_name>",
|
||||||
|
messages = [{ "content": "Hello, how are you?","role": "user"}],
|
||||||
|
input_cost_per_token=0.005,
|
||||||
|
output_cost_per_token=1,
|
||||||
|
)
|
||||||
|
# Add any assertions here to check the response
|
||||||
|
print(response)
|
||||||
|
cost = completion_cost(completion_response=response)
|
||||||
|
print(cost)
|
||||||
|
except Exception as e:
|
||||||
|
raise Exception(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
test_completion_azure_model()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Usage with OpenAI Proxy Server
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
model_list:
|
||||||
|
- model_name: azure-model
|
||||||
|
litellm_params:
|
||||||
|
model: azure/<your_deployment_name>
|
||||||
|
api_key: os.environ/AZURE_API_KEY
|
||||||
|
api_base: os.environ/AZURE_API_BASE
|
||||||
|
api_version: os.environ/AZURE_API_VERSION
|
||||||
|
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
|
||||||
|
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
|
||||||
|
```
|
34
docs/my-website/docs/proxy/debugging.md
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
# Debugging
|
||||||
|
|
||||||
|
2 levels of debugging supported.
|
||||||
|
|
||||||
|
- debug (prints info logs)
|
||||||
|
- detailed debug (prints debug logs)
|
||||||
|
|
||||||
|
## `debug`
|
||||||
|
|
||||||
|
**via cli**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ litellm --debug
|
||||||
|
```
|
||||||
|
|
||||||
|
**via env**
|
||||||
|
|
||||||
|
```python
|
||||||
|
os.environ["LITELLM_LOG"] = "INFO"
|
||||||
|
```
|
||||||
|
|
||||||
|
## `detailed debug`
|
||||||
|
|
||||||
|
**via cli**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ litellm --detailed_debug
|
||||||
|
```
|
||||||
|
|
||||||
|
**via env**
|
||||||
|
|
||||||
|
```python
|
||||||
|
os.environ["LITELLM_LOG"] = "DEBUG"
|
||||||
|
```
|
|
@ -5,8 +5,10 @@ Use this to health check all LLMs defined in your config.yaml
|
||||||
|
|
||||||
The proxy exposes:
|
The proxy exposes:
|
||||||
* a /health endpoint which returns the health of the LLM APIs
|
* a /health endpoint which returns the health of the LLM APIs
|
||||||
* a /test endpoint which makes a ping to the litellm server
|
* a /health/readiness endpoint for returning if the proxy is ready to accept requests
|
||||||
|
* a /health/liveliness endpoint for returning if the proxy is alive
|
||||||
|
|
||||||
|
## `/health`
|
||||||
#### Request
|
#### Request
|
||||||
Make a GET Request to `/health` on the proxy
|
Make a GET Request to `/health` on the proxy
|
||||||
```shell
|
```shell
|
||||||
|
@ -39,7 +41,7 @@ litellm --health
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## Background Health Checks
|
### Background Health Checks
|
||||||
|
|
||||||
You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`.
|
You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`.
|
||||||
|
|
||||||
|
@ -61,7 +63,7 @@ $ litellm /path/to/config.yaml
|
||||||
curl --location 'http://0.0.0.0:8000/health'
|
curl --location 'http://0.0.0.0:8000/health'
|
||||||
```
|
```
|
||||||
|
|
||||||
## Embedding Models
|
### Embedding Models
|
||||||
|
|
||||||
We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check
|
We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check
|
||||||
|
|
||||||
|
@ -77,7 +79,7 @@ model_list:
|
||||||
mode: embedding # 👈 ADD THIS
|
mode: embedding # 👈 ADD THIS
|
||||||
```
|
```
|
||||||
|
|
||||||
## Text Completion Models
|
### Text Completion Models
|
||||||
|
|
||||||
We need some way to know if the model is a text completion model when running checks. If you specify `mode: completion` in your config, the proxy makes a text completion health check
|
We need some way to know if the model is a text completion model when running checks. If you specify `mode: completion` in your config, the proxy makes a text completion health check
|
||||||
|
|
||||||
|
@ -92,3 +94,54 @@ model_list:
|
||||||
model_info:
|
model_info:
|
||||||
mode: completion # 👈 ADD THIS
|
mode: completion # 👈 ADD THIS
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## `/health/readiness`
|
||||||
|
|
||||||
|
Unprotected endpoint for checking if proxy is ready to accept requests
|
||||||
|
|
||||||
|
Example Request:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl --location 'http://0.0.0.0:8000/health/readiness'
|
||||||
|
```
|
||||||
|
|
||||||
|
Example Response:
|
||||||
|
|
||||||
|
*If proxy connected to a database*
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "healthy",
|
||||||
|
"db": "connected",
|
||||||
|
"litellm_version":"1.19.2"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
*If proxy not connected to a database*
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "healthy",
|
||||||
|
"db": "Not connected",
|
||||||
|
"litellm_version":"1.19.2"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## `/health/liveliness`
|
||||||
|
|
||||||
|
Unprotected endpoint for checking if proxy is alive
|
||||||
|
|
||||||
|
|
||||||
|
Example Request:
|
||||||
|
|
||||||
|
```
|
||||||
|
curl -X 'GET' \
|
||||||
|
'http://0.0.0.0:8000/health/liveliness' \
|
||||||
|
-H 'accept: application/json'
|
||||||
|
```
|
||||||
|
|
||||||
|
Example Response:
|
||||||
|
|
||||||
|
```json
|
||||||
|
"I'm alive!"
|
||||||
|
```
|
|
@ -1,5 +1,4 @@
|
||||||
|
# Multiple Instances of 1 model
|
||||||
# Load Balancing - Multiple Instances of 1 model
|
|
||||||
Load balance multiple instances of the same model
|
Load balance multiple instances of the same model
|
||||||
|
|
||||||
The proxy will handle routing requests (using LiteLLM's Router). **Set `rpm` in the config if you want to maximize throughput**
|
The proxy will handle routing requests (using LiteLLM's Router). **Set `rpm` in the config if you want to maximize throughput**
|
||||||
|
|
|
@ -40,115 +40,6 @@ litellm --test
|
||||||
|
|
||||||
This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints.
|
This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints.
|
||||||
|
|
||||||
### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
|
|
||||||
|
|
||||||
<Tabs>
|
|
||||||
<TabItem value="Curl" label="Curl Request">
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
|
||||||
--header 'Content-Type: application/json' \
|
|
||||||
--data ' {
|
|
||||||
"model": "gpt-3.5-turbo",
|
|
||||||
"messages": [
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "what llm are you"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
'
|
|
||||||
```
|
|
||||||
</TabItem>
|
|
||||||
<TabItem value="openai" label="OpenAI v1.0.0+">
|
|
||||||
|
|
||||||
```python
|
|
||||||
import openai
|
|
||||||
client = openai.OpenAI(
|
|
||||||
api_key="anything",
|
|
||||||
base_url="http://0.0.0.0:8000"
|
|
||||||
)
|
|
||||||
|
|
||||||
# request sent to model set on litellm proxy, `litellm --model`
|
|
||||||
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "this is a test request, write a short poem"
|
|
||||||
}
|
|
||||||
])
|
|
||||||
|
|
||||||
print(response)
|
|
||||||
|
|
||||||
```
|
|
||||||
</TabItem>
|
|
||||||
<TabItem value="langchain" label="Langchain">
|
|
||||||
|
|
||||||
```python
|
|
||||||
from langchain.chat_models import ChatOpenAI
|
|
||||||
from langchain.prompts.chat import (
|
|
||||||
ChatPromptTemplate,
|
|
||||||
HumanMessagePromptTemplate,
|
|
||||||
SystemMessagePromptTemplate,
|
|
||||||
)
|
|
||||||
from langchain.schema import HumanMessage, SystemMessage
|
|
||||||
|
|
||||||
chat = ChatOpenAI(
|
|
||||||
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
|
|
||||||
model = "gpt-3.5-turbo",
|
|
||||||
temperature=0.1
|
|
||||||
)
|
|
||||||
|
|
||||||
messages = [
|
|
||||||
SystemMessage(
|
|
||||||
content="You are a helpful assistant that im using to make a test request to."
|
|
||||||
),
|
|
||||||
HumanMessage(
|
|
||||||
content="test from litellm. tell me why it's amazing in 1 sentence"
|
|
||||||
),
|
|
||||||
]
|
|
||||||
response = chat(messages)
|
|
||||||
|
|
||||||
print(response)
|
|
||||||
```
|
|
||||||
|
|
||||||
</TabItem>
|
|
||||||
<TabItem value="langchain-embedding" label="Langchain Embeddings">
|
|
||||||
|
|
||||||
```python
|
|
||||||
from langchain.embeddings import OpenAIEmbeddings
|
|
||||||
|
|
||||||
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
|
||||||
|
|
||||||
|
|
||||||
text = "This is a test document."
|
|
||||||
|
|
||||||
query_result = embeddings.embed_query(text)
|
|
||||||
|
|
||||||
print(f"SAGEMAKER EMBEDDINGS")
|
|
||||||
print(query_result[:5])
|
|
||||||
|
|
||||||
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
|
||||||
|
|
||||||
text = "This is a test document."
|
|
||||||
|
|
||||||
query_result = embeddings.embed_query(text)
|
|
||||||
|
|
||||||
print(f"BEDROCK EMBEDDINGS")
|
|
||||||
print(query_result[:5])
|
|
||||||
|
|
||||||
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
|
||||||
|
|
||||||
text = "This is a test document."
|
|
||||||
|
|
||||||
query_result = embeddings.embed_query(text)
|
|
||||||
|
|
||||||
print(f"TITAN EMBEDDINGS")
|
|
||||||
print(query_result[:5])
|
|
||||||
```
|
|
||||||
</TabItem>
|
|
||||||
</Tabs>
|
|
||||||
|
|
||||||
|
|
||||||
### Supported LLMs
|
### Supported LLMs
|
||||||
All LiteLLM supported LLMs are supported on the Proxy. See all [supported llms](https://docs.litellm.ai/docs/providers)
|
All LiteLLM supported LLMs are supported on the Proxy. See all [supported llms](https://docs.litellm.ai/docs/providers)
|
||||||
<Tabs>
|
<Tabs>
|
||||||
|
@ -331,6 +222,113 @@ $ litellm --model command-nightly
|
||||||
</Tabs>
|
</Tabs>
|
||||||
|
|
||||||
|
|
||||||
|
### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<TabItem value="Curl" label="Curl Request">
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data ' {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "what llm are you"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
'
|
||||||
|
```
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="openai" label="OpenAI v1.0.0+">
|
||||||
|
|
||||||
|
```python
|
||||||
|
import openai
|
||||||
|
client = openai.OpenAI(
|
||||||
|
api_key="anything",
|
||||||
|
base_url="http://0.0.0.0:8000"
|
||||||
|
)
|
||||||
|
|
||||||
|
# request sent to model set on litellm proxy, `litellm --model`
|
||||||
|
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "this is a test request, write a short poem"
|
||||||
|
}
|
||||||
|
])
|
||||||
|
|
||||||
|
print(response)
|
||||||
|
|
||||||
|
```
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="langchain" label="Langchain">
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain.chat_models import ChatOpenAI
|
||||||
|
from langchain.prompts.chat import (
|
||||||
|
ChatPromptTemplate,
|
||||||
|
HumanMessagePromptTemplate,
|
||||||
|
SystemMessagePromptTemplate,
|
||||||
|
)
|
||||||
|
from langchain.schema import HumanMessage, SystemMessage
|
||||||
|
|
||||||
|
chat = ChatOpenAI(
|
||||||
|
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
|
||||||
|
model = "gpt-3.5-turbo",
|
||||||
|
temperature=0.1
|
||||||
|
)
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
SystemMessage(
|
||||||
|
content="You are a helpful assistant that im using to make a test request to."
|
||||||
|
),
|
||||||
|
HumanMessage(
|
||||||
|
content="test from litellm. tell me why it's amazing in 1 sentence"
|
||||||
|
),
|
||||||
|
]
|
||||||
|
response = chat(messages)
|
||||||
|
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="langchain-embedding" label="Langchain Embeddings">
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain.embeddings import OpenAIEmbeddings
|
||||||
|
|
||||||
|
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||||
|
|
||||||
|
|
||||||
|
text = "This is a test document."
|
||||||
|
|
||||||
|
query_result = embeddings.embed_query(text)
|
||||||
|
|
||||||
|
print(f"SAGEMAKER EMBEDDINGS")
|
||||||
|
print(query_result[:5])
|
||||||
|
|
||||||
|
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||||
|
|
||||||
|
text = "This is a test document."
|
||||||
|
|
||||||
|
query_result = embeddings.embed_query(text)
|
||||||
|
|
||||||
|
print(f"BEDROCK EMBEDDINGS")
|
||||||
|
print(query_result[:5])
|
||||||
|
|
||||||
|
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||||
|
|
||||||
|
text = "This is a test document."
|
||||||
|
|
||||||
|
query_result = embeddings.embed_query(text)
|
||||||
|
|
||||||
|
print(f"TITAN EMBEDDINGS")
|
||||||
|
print(query_result[:5])
|
||||||
|
```
|
||||||
|
</TabItem>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
|
||||||
## Quick Start - LiteLLM Proxy + Config.yaml
|
## Quick Start - LiteLLM Proxy + Config.yaml
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
import Image from '@theme/IdealImage';
|
import Image from '@theme/IdealImage';
|
||||||
|
import Tabs from '@theme/Tabs';
|
||||||
|
import TabItem from '@theme/TabItem';
|
||||||
|
|
||||||
# [BETA] Self-serve UI
|
# [BETA] Admin UI
|
||||||
|
|
||||||
Allow your users to create their own keys through a UI
|
|
||||||
|
|
||||||
:::info
|
:::info
|
||||||
|
|
||||||
|
@ -10,40 +10,94 @@ This is in beta, so things may change. If you have feedback, [let us know](https
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
|
Allow your users to create and view their own keys through a UI
|
||||||
|
|
||||||
|
<Image img={require('../../img/admin_ui_2.png')} />
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
Requirements:
|
## 1. Setup SSO/Auth for UI
|
||||||
|
|
||||||
- Need to a SMTP server connection to send emails (e.g. [Resend](https://resend.com/docs/send-with-smtp))
|
<Tabs>
|
||||||
|
|
||||||
[**See code**](https://github.com/BerriAI/litellm/blob/61cd800b9ffbb02c286481d2056b65c7fb5447bf/litellm/proxy/proxy_server.py#L1782)
|
<TabItem value="google" label="Google SSO">
|
||||||
|
|
||||||
### Step 1. Save SMTP server credentials
|
- Create a new Oauth 2.0 Client on https://console.cloud.google.com/
|
||||||
|
|
||||||
```env
|
**Required .env variables on your Proxy**
|
||||||
export SMTP_HOST="my-smtp-host"
|
```shell
|
||||||
export SMTP_USERNAME="my-smtp-password"
|
PROXY_BASE_URL="<your deployed proxy endpoint>" example PROXY_BASE_URL=https://litellm-production-7002.up.railway.app/
|
||||||
export SMTP_PASSWORD="my-smtp-password"
|
|
||||||
export SMTP_SENDER_EMAIL="krrish@berri.ai"
|
# for Google SSO Login
|
||||||
|
GOOGLE_CLIENT_ID=
|
||||||
|
GOOGLE_CLIENT_SECRET=
|
||||||
```
|
```
|
||||||
|
|
||||||
### Step 2. Enable user auth
|
- Set Redirect URL on your Oauth 2.0 Client on https://console.cloud.google.com/
|
||||||
|
- Set a redirect url = `<your proxy base url>/sso/callback`
|
||||||
|
```shell
|
||||||
|
https://litellm-production-7002.up.railway.app/sso/callback
|
||||||
|
```
|
||||||
|
|
||||||
In your config.yaml,
|
</TabItem>
|
||||||
|
|
||||||
```yaml
|
<TabItem value="msft" label="Microsoft SSO">
|
||||||
general_settings:
|
|
||||||
# other changes
|
- Create a new App Registration on https://portal.azure.com/
|
||||||
allow_user_auth: true
|
- Create a client Secret for your App Registration
|
||||||
|
|
||||||
|
**Required .env variables on your Proxy**
|
||||||
|
```shell
|
||||||
|
PROXY_BASE_URL="<your deployed proxy endpoint>" example PROXY_BASE_URL=https://litellm-production-7002.up.railway.app/
|
||||||
|
|
||||||
|
MICROSOFT_CLIENT_ID="84583a4d-"
|
||||||
|
MICROSOFT_CLIENT_SECRET="nbk8Q~"
|
||||||
|
MICROSOFT_TENANT="5a39737"
|
||||||
|
```
|
||||||
|
- Set Redirect URI on your App Registration on https://portal.azure.com/
|
||||||
|
- Set a redirect url = `<your proxy base url>/sso/callback`
|
||||||
|
```shell
|
||||||
|
http://localhost:4000/sso/callback
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="username" label="Quick Start - Username, Password">
|
||||||
|
|
||||||
|
Set the following in your .env on the Proxy
|
||||||
|
|
||||||
|
```shell
|
||||||
|
PROXY_BASE_URL="<your deployed proxy endpoint>" example PROXY_BASE_URL=https://litellm-production-7002.up.railway.app/
|
||||||
|
|
||||||
|
UI_USERNAME=ishaan-litellm
|
||||||
|
UI_PASSWORD=langchain
|
||||||
```
|
```
|
||||||
|
|
||||||
This will enable:
|
On accessing the LiteLLM UI, you will be prompted to enter your username, password
|
||||||
* Users to create keys via `/key/generate` (by default, only admin can create keys)
|
|
||||||
* The `/user/auth` endpoint to send user's emails with their login credentials (key + user id)
|
|
||||||
|
|
||||||
### Step 3. Connect to UI
|
</TabItem>
|
||||||
|
|
||||||
You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
|
</Tabs>
|
||||||
|
|
||||||
|
## 2. Start Proxy Server
|
||||||
|
|
||||||
|
```shell
|
||||||
|
litellm --config proxy_config.yaml --port 4000
|
||||||
|
|
||||||
|
# start proxy on port 4000
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3. Get the Admin UI Link on Swagger
|
||||||
|
|
||||||
|
Your Proxy Swagger is available on the root of the Proxy: `http://localhost:4000/`
|
||||||
|
|
||||||
|
<Image img={require('../../img/ui_link.png')} />
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<!-- You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
|
||||||
|
|
||||||
If you self-host, you need to save the UI url in your proxy environment as `LITELLM_HOSTED_UI`.
|
If you self-host, you need to save the UI url in your proxy environment as `LITELLM_HOSTED_UI`.
|
||||||
|
|
||||||
|
@ -62,4 +116,13 @@ Connect your proxy to your UI, by entering:
|
||||||
|
|
||||||
### Create Keys
|
### Create Keys
|
||||||
|
|
||||||
<Image img={require('../../img/user_create_key_screen.png')} />
|
<Image img={require('../../img/user_create_key_screen.png')} />
|
||||||
|
|
||||||
|
### Spend Per Key
|
||||||
|
|
||||||
|
<Image img={require('../../img/spend_per_api_key.png')} />
|
||||||
|
|
||||||
|
### Spend Per User
|
||||||
|
|
||||||
|
<Image img={require('../../img/spend_per_user.png')} /> -->
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,7 @@
|
||||||
# 💰 Budgets, Rate Limits per user
|
import Tabs from '@theme/Tabs';
|
||||||
|
import TabItem from '@theme/TabItem';
|
||||||
|
|
||||||
|
# 💰 Budgets, Rate Limits
|
||||||
|
|
||||||
Requirements:
|
Requirements:
|
||||||
|
|
||||||
|
@ -6,17 +9,74 @@ Requirements:
|
||||||
|
|
||||||
|
|
||||||
## Set Budgets
|
## Set Budgets
|
||||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
|
||||||
|
|
||||||
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
|
You can set budgets at 3 levels:
|
||||||
|
- For the proxy
|
||||||
|
- For a user
|
||||||
|
- For a key
|
||||||
|
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<TabItem value="proxy" label="For Proxy">
|
||||||
|
|
||||||
|
Apply a budget across all calls on the proxy
|
||||||
|
|
||||||
|
**Step 1. Modify config.yaml**
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
general_settings:
|
||||||
|
master_key: sk-1234
|
||||||
|
|
||||||
|
litellm_settings:
|
||||||
|
# other litellm settings
|
||||||
|
max_budget: 0 # (float) sets max budget as $0 USD
|
||||||
|
budget_duration: 30d # (str) frequency of reset - You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2. Start proxy**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
litellm /path/to/config.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3. Send test call**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||||
|
--header 'Authorization: Bearer sk-1234' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data '{
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "what llm are you"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="per-user" label="For User">
|
||||||
|
|
||||||
|
Apply a budget across multiple keys.
|
||||||
|
|
||||||
|
LiteLLM exposes a `/user/new` endpoint to create budgets for this.
|
||||||
|
|
||||||
|
You can:
|
||||||
|
- Add budgets to users [**Jump**](#add-budgets-to-users)
|
||||||
|
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-users)
|
||||||
|
|
||||||
|
By default the `max_budget` is set to `null` and is not checked for keys
|
||||||
|
|
||||||
|
### **Add budgets to users**
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://localhost:8000/user/new' \
|
curl --location 'http://localhost:8000/user/new' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||||
```
|
```
|
||||||
The request is a normal `/key/generate` request body + a `max_budget` field.
|
|
||||||
|
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
|
||||||
|
|
||||||
**Sample Response**
|
**Sample Response**
|
||||||
|
|
||||||
|
@ -29,18 +89,163 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### **Add budget duration to users**
|
||||||
|
|
||||||
|
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||||
|
|
||||||
|
```
|
||||||
|
curl 'http://0.0.0.0:8000/user/new' \
|
||||||
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data-raw '{
|
||||||
|
"team_id": "core-infra", # [OPTIONAL]
|
||||||
|
"max_budget": 10,
|
||||||
|
"budget_duration": "10s"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create new keys for existing user
|
||||||
|
|
||||||
|
Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai) and:
|
||||||
|
- **Budget Check**: krrish3@berri.ai's budget (i.e. $10) will be checked for this key
|
||||||
|
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||||
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="per-key" label="For Key">
|
||||||
|
|
||||||
|
Apply a budget on a key.
|
||||||
|
|
||||||
|
You can:
|
||||||
|
- Add budgets to keys [**Jump**](#add-budgets-to-keys)
|
||||||
|
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-keys)
|
||||||
|
|
||||||
|
**Expected Behaviour**
|
||||||
|
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
|
||||||
|
- After the key crosses its `max_budget`, requests fail
|
||||||
|
- If duration set, spend is reset at the end of the duration
|
||||||
|
|
||||||
|
By default the `max_budget` is set to `null` and is not checked for keys
|
||||||
|
|
||||||
|
### **Add budgets to keys**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl 'http://0.0.0.0:8000/key/generate' \
|
||||||
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data-raw '{
|
||||||
|
"team_id": "core-infra", # [OPTIONAL]
|
||||||
|
"max_budget": 10
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Example Request to `/chat/completions` when key has crossed budget
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--header 'Authorization: Bearer <generated-key>' \
|
||||||
|
--data ' {
|
||||||
|
"model": "azure-gpt-3.5",
|
||||||
|
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "respond in 50 lines"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
Expected Response from `/chat/completions` when key has crossed budget
|
||||||
|
```shell
|
||||||
|
{
|
||||||
|
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### **Add budget duration to keys**
|
||||||
|
|
||||||
|
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||||
|
|
||||||
|
```
|
||||||
|
curl 'http://0.0.0.0:8000/key/generate' \
|
||||||
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data-raw '{
|
||||||
|
"team_id": "core-infra", # [OPTIONAL]
|
||||||
|
"max_budget": 10,
|
||||||
|
"budget_duration": "10s"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
## Set Rate Limits
|
## Set Rate Limits
|
||||||
|
|
||||||
Set max parallel requests a user can make, when you create user keys - `/key/generate`.
|
You can set:
|
||||||
|
- max parallel requests
|
||||||
|
- tpm limits
|
||||||
|
- rpm limits
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<TabItem value="per-user" label="Per User">
|
||||||
|
|
||||||
|
Use `/user/new`, to persist rate limits across multiple keys.
|
||||||
|
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl --location 'http://0.0.0.0:8000/user/new' \
|
||||||
|
--header 'Authorization: Bearer sk-1234' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
|
||||||
|
```
|
||||||
|
|
||||||
|
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
|
||||||
|
|
||||||
|
**Expected Response**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"key": "sk-sA7VDkyhlQ7m8Gt77Mbt3Q",
|
||||||
|
"expires": "2024-01-19T01:21:12.816168",
|
||||||
|
"user_id": "krrish@berri.ai"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="per-key" label="Per Key">
|
||||||
|
|
||||||
|
Use `/key/generate`, if you want them for just that key.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||||
--header 'Authorization: Bearer sk-1234' \
|
--header 'Authorization: Bearer sk-1234' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data '{"duration": "20m", "max_parallel_requests": 1}' # 👈 max parallel requests = 1
|
--data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Expected Response**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"key": "sk-ulGNRXWtv7M0lFnnsQk0wQ",
|
||||||
|
"expires": "2024-01-18T20:48:44.297973",
|
||||||
|
"user_id": "78c2c8fc-c233-43b9-b0c3-eb931da27b84" // 👈 auto-generated
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
## Grant Access to new model
|
## Grant Access to new model
|
||||||
|
|
||||||
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.).
|
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.).
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Key Management
|
# Virtual Keys
|
||||||
Track Spend, Set budgets and create virtual keys for the proxy
|
Track Spend, Set budgets and create virtual keys for the proxy
|
||||||
|
|
||||||
Grant others temporary access to your proxy, with keys that expire after a set duration.
|
Grant others temporary access to your proxy, with keys that expire after a set duration.
|
||||||
|
@ -12,7 +12,7 @@ Grant other's temporary access to your proxy, with keys that expire after a set
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
## Quick Start
|
## Setup
|
||||||
|
|
||||||
Requirements:
|
Requirements:
|
||||||
|
|
||||||
|
@ -58,36 +58,53 @@ litellm --config /path/to/config.yaml
|
||||||
curl 'http://0.0.0.0:8000/key/generate' \
|
curl 'http://0.0.0.0:8000/key/generate' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}'
|
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
|
||||||
```
|
```
|
||||||
|
|
||||||
- `models`: *list or null (optional)* - Specify the models a token has access too. If null, then token has access to all models on server.
|
|
||||||
|
|
||||||
- `duration`: *str or null (optional)* Specify the length of time the token is valid for. If null, default is set to 1 hour. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
## /key/generate
|
||||||
|
|
||||||
- `metadata`: *dict or null (optional)* Pass metadata for the created token. If null defaults to {}
|
### Request
|
||||||
|
```shell
|
||||||
|
curl 'http://0.0.0.0:8000/key/generate' \
|
||||||
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data-raw '{
|
||||||
|
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||||
|
"duration": "20m",
|
||||||
|
"metadata": {"user": "ishaan@berri.ai"},
|
||||||
|
"team_id": "core-infra",
|
||||||
|
"max_budget": 10
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
Expected response:
|
|
||||||
|
Request Params:
|
||||||
|
|
||||||
|
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||||
|
- `key_alias`: *Optional[str]* - User defined key alias
|
||||||
|
- `team_id`: *Optional[str]* - The team id of the user
|
||||||
|
- `models`: *Optional[list]* - Model names a user is allowed to call. (if empty, key is allowed to call all models)
|
||||||
|
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
|
||||||
|
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
|
||||||
|
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
|
||||||
|
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
|
||||||
|
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
|
||||||
|
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
|
||||||
|
|
||||||
|
|
||||||
|
### Response
|
||||||
|
|
||||||
```python
|
```python
|
||||||
{
|
{
|
||||||
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
|
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
|
||||||
"expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
|
"expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
|
||||||
|
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
|
||||||
|
...
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## Keys that don't expire
|
### Upgrade/Downgrade Models
|
||||||
|
|
||||||
Just set duration to None.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
|
||||||
--header 'Content-Type: application/json' \
|
|
||||||
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
|
|
||||||
```
|
|
||||||
|
|
||||||
## Upgrade/Downgrade Models
|
|
||||||
|
|
||||||
If a user is expected to use a given model (i.e. gpt3-5), and you want to:
|
If a user is expected to use a given model (i.e. gpt3-5), and you want to:
|
||||||
|
|
||||||
|
@ -137,7 +154,7 @@ curl -X POST "https://0.0.0.0:8000/key/generate" \
|
||||||
- **How is routing between different keys/api bases done?** litellm handles this by shuffling between different models in the model list with the same model_name. [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
|
- **How is routing between different keys/api bases done?** litellm handles this by shuffling between different models in the model list with the same model_name. [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
|
||||||
|
|
||||||
|
|
||||||
## Grant Access to new model
|
### Grant Access to new model
|
||||||
|
|
||||||
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.)
|
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.)
|
||||||
|
|
||||||
|
@ -165,6 +182,188 @@ curl --location 'http://localhost:8000/key/generate' \
|
||||||
"max_budget": 0,}'
|
"max_budget": 0,}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## /key/info
|
||||||
|
|
||||||
|
### Request
|
||||||
|
```shell
|
||||||
|
curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
|
||||||
|
-H "Authorization: Bearer sk-1234"
|
||||||
|
```
|
||||||
|
|
||||||
|
Request Params:
|
||||||
|
- key: str - The key you want the info for
|
||||||
|
|
||||||
|
### Response
|
||||||
|
|
||||||
|
`token` is the hashed key (The DB stores the hashed key for security)
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"key": "sk-02Wr4IAlN3NvPXvL5JVvDA",
|
||||||
|
"info": {
|
||||||
|
"token": "80321a12d03412c527f2bd9db5fabd746abead2e1d50b435a534432fbaca9ef5",
|
||||||
|
"spend": 0.0,
|
||||||
|
"expires": "2024-01-18T23:52:09.125000+00:00",
|
||||||
|
"models": ["azure-gpt-3.5", "azure-embedding-model"],
|
||||||
|
"aliases": {},
|
||||||
|
"config": {},
|
||||||
|
"user_id": "ishaan2@berri.ai",
|
||||||
|
"team_id": "None",
|
||||||
|
"max_parallel_requests": null,
|
||||||
|
"metadata": {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## /key/update
|
||||||
|
|
||||||
|
### Request
|
||||||
|
```shell
|
||||||
|
curl 'http://0.0.0.0:8000/key/update' \
|
||||||
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data-raw '{
|
||||||
|
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
|
||||||
|
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||||
|
"metadata": {"user": "ishaan@berri.ai"},
|
||||||
|
"team_id": "core-infra"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Request Params:
|
||||||
|
- key: str - The key that needs to be updated.
|
||||||
|
|
||||||
|
- models: list or null (optional) - Specify the models a token has access to. If null, then the token has access to all models on the server.
|
||||||
|
|
||||||
|
- metadata: dict or null (optional) - Pass metadata for the updated token. If null, defaults to an empty dictionary.
|
||||||
|
|
||||||
|
- team_id: str or null (optional) - Specify the team_id for the associated key.
|
||||||
|
|
||||||
|
### Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
|
||||||
|
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||||
|
"metadata": {
|
||||||
|
"user": "ishaan@berri.ai"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## /key/delete
|
||||||
|
|
||||||
|
### Request
|
||||||
|
```shell
|
||||||
|
curl 'http://0.0.0.0:8000/key/delete' \
|
||||||
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data-raw '{
|
||||||
|
"keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Request Params:
|
||||||
|
- keys: List[str] - List of keys to delete
|
||||||
|
|
||||||
|
### Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Default /key/generate params
|
||||||
|
Use this, if you need to control the default `max_budget` or any `key/generate` param per key.
|
||||||
|
|
||||||
|
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
|
||||||
|
|
||||||
|
Set `litellm_settings:default_key_generate_params`:
|
||||||
|
```yaml
|
||||||
|
litellm_settings:
|
||||||
|
default_key_generate_params:
|
||||||
|
max_budget: 1.5000
|
||||||
|
models: ["azure-gpt-3.5"]
|
||||||
|
duration: # blank means `null`
|
||||||
|
metadata: {"setting":"default"}
|
||||||
|
team_id: "core-infra"
|
||||||
|
```
|
||||||
|
## Set Budgets - Per Key
|
||||||
|
|
||||||
|
Set the `max_budget` param (in USD $) in the `key/generate` request. By default, `max_budget` is set to `null` and is not checked for keys.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl 'http://0.0.0.0:8000/key/generate' \
|
||||||
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data-raw '{
|
||||||
|
"metadata": {"user": "ishaan@berri.ai"},
|
||||||
|
"team_id": "core-infra",
|
||||||
|
"max_budget": 10
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Expected Behaviour
|
||||||
|
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
|
||||||
|
- After the key crosses its `max_budget`, requests fail
|
||||||
|
|
||||||
|
Example Request to `/chat/completions` when key has crossed budget
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
|
||||||
|
--data ' {
|
||||||
|
"model": "azure-gpt-3.5",
|
||||||
|
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "respond in 50 lines"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
Expected Response from `/chat/completions` when key has crossed budget
|
||||||
|
```shell
|
||||||
|
{
|
||||||
|
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Set Budgets - Per User
|
||||||
|
|
||||||
|
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
||||||
|
|
||||||
|
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl --location 'http://localhost:8000/user/new' \
|
||||||
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||||
|
```
|
||||||
|
The request is a normal `/key/generate` request body + a `max_budget` field.
|
||||||
|
|
||||||
|
**Sample Response**
|
||||||
|
|
||||||
|
```shell
|
||||||
|
{
|
||||||
|
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
|
||||||
|
"expires": "2023-12-22T09:53:13.861000Z",
|
||||||
|
"user_id": "krrish3@berri.ai",
|
||||||
|
"max_budget": 0.0
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Tracking Spend
|
## Tracking Spend
|
||||||
|
|
||||||
You can get spend for a key by using the `/key/info` endpoint.
|
You can get spend for a key by using the `/key/info` endpoint.
|
||||||
|
@ -200,32 +399,6 @@ This is automatically updated (in USD) when calls are made to /completions, /cha
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Set Budgets
|
|
||||||
|
|
||||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
|
||||||
|
|
||||||
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl --location 'http://localhost:8000/user/new' \
|
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
|
||||||
--header 'Content-Type: application/json' \
|
|
||||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
|
||||||
```
|
|
||||||
The request is a normal `/key/generate` request body + a `max_budget` field.
|
|
||||||
|
|
||||||
**Sample Response**
|
|
||||||
|
|
||||||
```shell
|
|
||||||
{
|
|
||||||
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
|
|
||||||
"expires": "2023-12-22T09:53:13.861000Z",
|
|
||||||
"user_id": "krrish3@berri.ai",
|
|
||||||
"max_budget": 0.0
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Custom Auth
|
## Custom Auth
|
||||||
|
|
||||||
You can now override the default api key auth.
|
You can now override the default api key auth.
|
||||||
|
@ -275,6 +448,97 @@ general_settings:
|
||||||
$ litellm --config /path/to/config.yaml
|
$ litellm --config /path/to/config.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Custom /key/generate
|
||||||
|
|
||||||
|
If you need to add custom logic before generating a Proxy API Key (Example Validating `team_id`)
|
||||||
|
|
||||||
|
### 1. Write a custom `custom_generate_key_fn`
|
||||||
|
|
||||||
|
|
||||||
|
The input to the custom_generate_key_fn function is a single parameter: `data` [(Type: GenerateKeyRequest)](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/_types.py#L125)
|
||||||
|
|
||||||
|
The output of your `custom_generate_key_fn` should be a dictionary with the following structure
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"decision": False,
|
||||||
|
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
- decision (Type: bool): A boolean value indicating whether the key generation is allowed (True) or not (False).
|
||||||
|
|
||||||
|
- message (Type: str, Optional): An optional message providing additional information about the decision. This field is included when the decision is False.
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def custom_generate_key_fn(data: GenerateKeyRequest)-> dict:
|
||||||
|
"""
|
||||||
|
Asynchronous function for generating a key based on the input data.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data (GenerateKeyRequest): The input data for key generation.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: A dictionary containing the decision and an optional message.
|
||||||
|
{
|
||||||
|
"decision": False,
|
||||||
|
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
# decide if a key should be generated or not
|
||||||
|
print("using custom auth function!")
|
||||||
|
data_json = data.json() # type: ignore
|
||||||
|
|
||||||
|
# Unpacking variables
|
||||||
|
team_id = data_json.get("team_id")
|
||||||
|
duration = data_json.get("duration")
|
||||||
|
models = data_json.get("models")
|
||||||
|
aliases = data_json.get("aliases")
|
||||||
|
config = data_json.get("config")
|
||||||
|
spend = data_json.get("spend")
|
||||||
|
user_id = data_json.get("user_id")
|
||||||
|
max_parallel_requests = data_json.get("max_parallel_requests")
|
||||||
|
metadata = data_json.get("metadata")
|
||||||
|
tpm_limit = data_json.get("tpm_limit")
|
||||||
|
rpm_limit = data_json.get("rpm_limit")
|
||||||
|
|
||||||
|
if team_id is not None and team_id == "litellm-core-infra@gmail.com":
|
||||||
|
# only team_id="litellm-core-infra@gmail.com" can make keys
|
||||||
|
return {
|
||||||
|
"decision": True,
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
print("Failed custom auth")
|
||||||
|
return {
|
||||||
|
"decision": False,
|
||||||
|
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### 2. Pass the filepath (relative to the config.yaml)
|
||||||
|
|
||||||
|
Pass the filepath to the config.yaml
|
||||||
|
|
||||||
|
e.g. if they're both in the same dir - `./config.yaml` and `./custom_auth.py`, this is what it looks like:
|
||||||
|
```yaml
|
||||||
|
model_list:
|
||||||
|
- model_name: "openai-model"
|
||||||
|
litellm_params:
|
||||||
|
model: "gpt-3.5-turbo"
|
||||||
|
|
||||||
|
litellm_settings:
|
||||||
|
drop_params: True
|
||||||
|
set_verbose: True
|
||||||
|
|
||||||
|
general_settings:
|
||||||
|
custom_key_generate: custom_auth.custom_generate_key_fn
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## [BETA] Dynamo DB
|
## [BETA] Dynamo DB
|
||||||
|
|
||||||
|
|
|
@ -302,6 +302,7 @@ asyncio.run(router_acompletion())
|
||||||
|
|
||||||
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
|
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
|
||||||
|
|
||||||
|
**Global Timeouts**
|
||||||
```python
|
```python
|
||||||
from litellm import Router
|
from litellm import Router
|
||||||
|
|
||||||
|
@ -313,6 +314,36 @@ router = Router(model_list=model_list,
|
||||||
print(response)
|
print(response)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Timeouts per model**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from litellm import Router
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
model_list = [{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "azure/chatgpt-v-2",
|
||||||
|
"api_key": os.getenv("AZURE_API_KEY"),
|
||||||
|
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||||
|
"api_base": os.getenv("AZURE_API_BASE"),
|
||||||
|
"timeout": 300, # sets a 5 minute timeout
|
||||||
|
"stream_timeout": 30 # sets a 30s timeout for streaming calls
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
|
||||||
|
# init router
|
||||||
|
router = Router(model_list=model_list, routing_strategy="least-busy")
|
||||||
|
async def router_acompletion():
|
||||||
|
response = await router.acompletion(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages=[{"role": "user", "content": "Hey, how's it going?"}]
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
return response
|
||||||
|
|
||||||
|
asyncio.run(router_acompletion())
|
||||||
|
```
|
||||||
### Cooldowns
|
### Cooldowns
|
||||||
|
|
||||||
Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute.
|
Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute.
|
||||||
|
@ -574,6 +605,49 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
|
||||||
print(f"response: {response}")
|
print(f"response: {response}")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Custom Callbacks - Track API Key, API Endpoint, Model Used
|
||||||
|
|
||||||
|
If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback)
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
import litellm
|
||||||
|
from litellm.integrations.custom_logger import CustomLogger
|
||||||
|
|
||||||
|
class MyCustomHandler(CustomLogger):
|
||||||
|
def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||||
|
print(f"On Success")
|
||||||
|
print("kwargs=", kwargs)
|
||||||
|
litellm_params= kwargs.get("litellm_params")
|
||||||
|
api_key = litellm_params.get("api_key")
|
||||||
|
api_base = litellm_params.get("api_base")
|
||||||
|
custom_llm_provider= litellm_params.get("custom_llm_provider")
|
||||||
|
response_cost = kwargs.get("response_cost")
|
||||||
|
|
||||||
|
# print the values
|
||||||
|
print("api_key=", api_key)
|
||||||
|
print("api_base=", api_base)
|
||||||
|
print("custom_llm_provider=", custom_llm_provider)
|
||||||
|
print("response_cost=", response_cost)
|
||||||
|
|
||||||
|
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||||
|
print(f"On Failure")
|
||||||
|
print("kwargs=")
|
||||||
|
|
||||||
|
customHandler = MyCustomHandler()
|
||||||
|
|
||||||
|
litellm.callbacks = [customHandler]
|
||||||
|
|
||||||
|
# Init Router
|
||||||
|
router = Router(model_list=model_list, routing_strategy="simple-shuffle")
|
||||||
|
|
||||||
|
# router completion call
|
||||||
|
response = router.completion(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages=[{ "role": "user", "content": "Hi who are you"}]
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
## Deploy Router
|
## Deploy Router
|
||||||
|
|
||||||
|
@ -602,17 +676,63 @@ def __init__(
|
||||||
num_retries: int = 0,
|
num_retries: int = 0,
|
||||||
timeout: Optional[float] = None,
|
timeout: Optional[float] = None,
|
||||||
default_litellm_params={}, # default params for Router.chat.completion.create
|
default_litellm_params={}, # default params for Router.chat.completion.create
|
||||||
set_verbose: bool = False,
|
|
||||||
fallbacks: List = [],
|
fallbacks: List = [],
|
||||||
allowed_fails: Optional[int] = None,
|
allowed_fails: Optional[int] = None, # Number of times a deployment can fail before being added to cooldown
|
||||||
|
cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
|
||||||
context_window_fallbacks: List = [],
|
context_window_fallbacks: List = [],
|
||||||
model_group_alias: Optional[dict] = {},
|
model_group_alias: Optional[dict] = {},
|
||||||
retry_after: int = 0, # min time to wait before retrying a failed request
|
retry_after: int = 0, # minimum time to wait before retrying a failed request
|
||||||
routing_strategy: Literal[
|
routing_strategy: Literal[
|
||||||
"simple-shuffle",
|
"simple-shuffle",
|
||||||
"least-busy",
|
"least-busy",
|
||||||
"usage-based-routing",
|
"usage-based-routing",
|
||||||
"latency-based-routing",
|
"latency-based-routing",
|
||||||
] = "simple-shuffle",
|
] = "simple-shuffle",
|
||||||
|
|
||||||
|
## DEBUGGING ##
|
||||||
|
set_verbose: bool = False, # set this to True for seeing logs
|
||||||
|
debug_level: Literal["DEBUG", "INFO"] = "INFO", # set this to "DEBUG" for detailed debugging
|
||||||
):
|
):
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Debugging Router
|
||||||
|
### Basic Debugging
|
||||||
|
Set `Router(set_verbose=True)`
|
||||||
|
|
||||||
|
```python
|
||||||
|
from litellm import Router
|
||||||
|
|
||||||
|
router = Router(
|
||||||
|
model_list=model_list,
|
||||||
|
set_verbose=True
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Detailed Debugging
|
||||||
|
Set `Router(set_verbose=True,debug_level="DEBUG")`
|
||||||
|
|
||||||
|
```python
|
||||||
|
from litellm import Router
|
||||||
|
|
||||||
|
router = Router(
|
||||||
|
model_list=model_list,
|
||||||
|
set_verbose=True,
|
||||||
|
debug_level="DEBUG" # defaults to INFO
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Very Detailed Debugging
|
||||||
|
Set `litellm.set_verbose=True` and `Router(set_verbose=True,debug_level="DEBUG")`
|
||||||
|
|
||||||
|
```python
|
||||||
|
from litellm import Router
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
litellm.set_verbose = True
|
||||||
|
|
||||||
|
router = Router(
|
||||||
|
model_list=model_list,
|
||||||
|
set_verbose=True,
|
||||||
|
debug_level="DEBUG" # defaults to INFO
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
BIN
docs/my-website/img/admin_ui_2.png
Normal file
After Width: | Height: | Size: 159 KiB |
BIN
docs/my-website/img/google_oauth2.png
Normal file
After Width: | Height: | Size: 351 KiB |
BIN
docs/my-website/img/google_redirect.png
Normal file
After Width: | Height: | Size: 297 KiB |
BIN
docs/my-website/img/spend_logs_table.png
Normal file
After Width: | Height: | Size: 189 KiB |
BIN
docs/my-website/img/spend_per_api_key.png
Normal file
After Width: | Height: | Size: 468 KiB |
BIN
docs/my-website/img/spend_per_user.png
Normal file
After Width: | Height: | Size: 249 KiB |
BIN
docs/my-website/img/ui_link.png
Normal file
After Width: | Height: | Size: 69 KiB |
|
@ -104,24 +104,49 @@ const sidebars = {
|
||||||
items: [
|
items: [
|
||||||
"proxy/quick_start",
|
"proxy/quick_start",
|
||||||
"proxy/configs",
|
"proxy/configs",
|
||||||
|
{
|
||||||
|
type: 'link',
|
||||||
|
label: '📖 All Endpoints',
|
||||||
|
href: 'https://litellm-api.up.railway.app/',
|
||||||
|
},
|
||||||
"proxy/user_keys",
|
"proxy/user_keys",
|
||||||
"proxy/load_balancing",
|
|
||||||
"proxy/virtual_keys",
|
"proxy/virtual_keys",
|
||||||
"proxy/users",
|
"proxy/users",
|
||||||
"proxy/ui",
|
"proxy/ui",
|
||||||
"proxy/model_management",
|
"proxy/model_management",
|
||||||
"proxy/reliability",
|
|
||||||
"proxy/caching",
|
|
||||||
"proxy/logging",
|
|
||||||
"proxy/health",
|
"proxy/health",
|
||||||
"proxy/call_hooks",
|
"proxy/debugging",
|
||||||
"proxy/rules",
|
{
|
||||||
"proxy/alerting",
|
"type": "category",
|
||||||
"proxy/streaming_logging",
|
"label": "🔥 Load Balancing",
|
||||||
|
"items": [
|
||||||
|
"proxy/load_balancing",
|
||||||
|
"proxy/reliability",
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "category",
|
||||||
|
"label": "Logging, Alerting, Caching",
|
||||||
|
"items": [
|
||||||
|
"proxy/logging",
|
||||||
|
"proxy/alerting",
|
||||||
|
"proxy/streaming_logging",
|
||||||
|
"proxy/caching",
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "category",
|
||||||
|
"label": "Admin Controls",
|
||||||
|
"items": [
|
||||||
|
"proxy/call_hooks",
|
||||||
|
"proxy/rules",
|
||||||
|
]
|
||||||
|
},
|
||||||
"proxy/deploy",
|
"proxy/deploy",
|
||||||
"proxy/cli",
|
"proxy/cli",
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
"proxy/custom_pricing",
|
||||||
"routing",
|
"routing",
|
||||||
"rules",
|
"rules",
|
||||||
"set_keys",
|
"set_keys",
|
||||||
|
|
|
@ -2,10 +2,14 @@
|
||||||
import threading, requests
|
import threading, requests
|
||||||
from typing import Callable, List, Optional, Dict, Union, Any
|
from typing import Callable, List, Optional, Dict, Union, Any
|
||||||
from litellm.caching import Cache
|
from litellm.caching import Cache
|
||||||
from litellm._logging import set_verbose
|
from litellm._logging import set_verbose, _turn_on_debug
|
||||||
from litellm.proxy._types import KeyManagementSystem
|
from litellm.proxy._types import KeyManagementSystem
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
#############################################
|
||||||
|
if set_verbose == True:
|
||||||
|
_turn_on_debug()
|
||||||
|
#############################################
|
||||||
input_callback: List[Union[str, Callable]] = []
|
input_callback: List[Union[str, Callable]] = []
|
||||||
success_callback: List[Union[str, Callable]] = []
|
success_callback: List[Union[str, Callable]] = []
|
||||||
failure_callback: List[Union[str, Callable]] = []
|
failure_callback: List[Union[str, Callable]] = []
|
||||||
|
@ -58,6 +62,9 @@ cache: Optional[
|
||||||
model_alias_map: Dict[str, str] = {}
|
model_alias_map: Dict[str, str] = {}
|
||||||
model_group_alias_map: Dict[str, str] = {}
|
model_group_alias_map: Dict[str, str] = {}
|
||||||
max_budget: float = 0.0 # set the max budget across all providers
|
max_budget: float = 0.0 # set the max budget across all providers
|
||||||
|
budget_duration: Optional[
|
||||||
|
str
|
||||||
|
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||||
_openai_completion_params = [
|
_openai_completion_params = [
|
||||||
"functions",
|
"functions",
|
||||||
"function_call",
|
"function_call",
|
||||||
|
@ -136,6 +143,7 @@ model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/mai
|
||||||
suppress_debug_info = False
|
suppress_debug_info = False
|
||||||
dynamodb_table_name: Optional[str] = None
|
dynamodb_table_name: Optional[str] = None
|
||||||
s3_callback_params: Optional[Dict] = None
|
s3_callback_params: Optional[Dict] = None
|
||||||
|
default_key_generate_params: Optional[Dict] = None
|
||||||
#### RELIABILITY ####
|
#### RELIABILITY ####
|
||||||
request_timeout: Optional[float] = 6000
|
request_timeout: Optional[float] = 6000
|
||||||
num_retries: Optional[int] = None # per model endpoint
|
num_retries: Optional[int] = None # per model endpoint
|
||||||
|
|
|
@ -7,20 +7,14 @@ handler = logging.StreamHandler()
|
||||||
handler.setLevel(logging.DEBUG)
|
handler.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
# Create a formatter and set it for the handler
|
# Create a formatter and set it for the handler
|
||||||
|
formatter = logging.Formatter(
|
||||||
|
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(message)s",
|
||||||
|
datefmt="%H:%M:%S",
|
||||||
|
)
|
||||||
|
|
||||||
formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
|
|
||||||
|
|
||||||
handler.setFormatter(formatter)
|
handler.setFormatter(formatter)
|
||||||
|
|
||||||
|
|
||||||
def print_verbose(print_statement):
|
|
||||||
try:
|
|
||||||
if set_verbose:
|
|
||||||
print(print_statement) # noqa
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
verbose_proxy_logger = logging.getLogger("LiteLLM Proxy")
|
verbose_proxy_logger = logging.getLogger("LiteLLM Proxy")
|
||||||
verbose_router_logger = logging.getLogger("LiteLLM Router")
|
verbose_router_logger = logging.getLogger("LiteLLM Router")
|
||||||
verbose_logger = logging.getLogger("LiteLLM")
|
verbose_logger = logging.getLogger("LiteLLM")
|
||||||
|
@ -28,3 +22,18 @@ verbose_logger = logging.getLogger("LiteLLM")
|
||||||
# Add the handler to the logger
|
# Add the handler to the logger
|
||||||
verbose_router_logger.addHandler(handler)
|
verbose_router_logger.addHandler(handler)
|
||||||
verbose_proxy_logger.addHandler(handler)
|
verbose_proxy_logger.addHandler(handler)
|
||||||
|
verbose_logger.addHandler(handler)
|
||||||
|
|
||||||
|
|
||||||
|
def _turn_on_debug():
|
||||||
|
verbose_logger.setLevel(level=logging.DEBUG) # set package log to debug
|
||||||
|
verbose_router_logger.setLevel(level=logging.DEBUG) # set router logs to debug
|
||||||
|
verbose_proxy_logger.setLevel(level=logging.DEBUG) # set proxy logs to debug
|
||||||
|
|
||||||
|
|
||||||
|
def print_verbose(print_statement):
|
||||||
|
try:
|
||||||
|
if set_verbose:
|
||||||
|
print(print_statement) # noqa
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
|
@ -1,3 +1,12 @@
|
||||||
|
# +-----------------------------------------------+
|
||||||
|
# | |
|
||||||
|
# | NOT PROXY BUDGET MANAGER |
|
||||||
|
# | proxy budget manager is in proxy_server.py |
|
||||||
|
# | |
|
||||||
|
# +-----------------------------------------------+
|
||||||
|
#
|
||||||
|
# Thank you users! We ❤️ you! - Krrish & Ishaan
|
||||||
|
|
||||||
import os, json, time
|
import os, json, time
|
||||||
import litellm
|
import litellm
|
||||||
from litellm.utils import ModelResponse
|
from litellm.utils import ModelResponse
|
||||||
|
@ -11,10 +20,12 @@ class BudgetManager:
|
||||||
project_name: str,
|
project_name: str,
|
||||||
client_type: str = "local",
|
client_type: str = "local",
|
||||||
api_base: Optional[str] = None,
|
api_base: Optional[str] = None,
|
||||||
|
headers: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
self.client_type = client_type
|
self.client_type = client_type
|
||||||
self.project_name = project_name
|
self.project_name = project_name
|
||||||
self.api_base = api_base or "https://api.litellm.ai"
|
self.api_base = api_base or "https://api.litellm.ai"
|
||||||
|
self.headers = headers or {"Content-Type": "application/json"}
|
||||||
## load the data or init the initial dictionaries
|
## load the data or init the initial dictionaries
|
||||||
self.load_data()
|
self.load_data()
|
||||||
|
|
||||||
|
@ -43,7 +54,7 @@ class BudgetManager:
|
||||||
url = self.api_base + "/get_budget"
|
url = self.api_base + "/get_budget"
|
||||||
headers = {"Content-Type": "application/json"}
|
headers = {"Content-Type": "application/json"}
|
||||||
data = {"project_name": self.project_name}
|
data = {"project_name": self.project_name}
|
||||||
response = requests.post(url, headers=headers, json=data)
|
response = requests.post(url, headers=self.headers, json=data)
|
||||||
response = response.json()
|
response = response.json()
|
||||||
if response["status"] == "error":
|
if response["status"] == "error":
|
||||||
self.user_dict = (
|
self.user_dict = (
|
||||||
|
@ -201,6 +212,6 @@ class BudgetManager:
|
||||||
url = self.api_base + "/set_budget"
|
url = self.api_base + "/set_budget"
|
||||||
headers = {"Content-Type": "application/json"}
|
headers = {"Content-Type": "application/json"}
|
||||||
data = {"project_name": self.project_name, "user_dict": self.user_dict}
|
data = {"project_name": self.project_name, "user_dict": self.user_dict}
|
||||||
response = requests.post(url, headers=headers, json=data)
|
response = requests.post(url, headers=self.headers, json=data)
|
||||||
response = response.json()
|
response = response.json()
|
||||||
return response
|
return response
|
||||||
|
|
|
@ -12,10 +12,12 @@ import time, logging
|
||||||
import json, traceback, ast, hashlib
|
import json, traceback, ast, hashlib
|
||||||
from typing import Optional, Literal, List, Union, Any
|
from typing import Optional, Literal, List, Union, Any
|
||||||
from openai._models import BaseModel as OpenAIObject
|
from openai._models import BaseModel as OpenAIObject
|
||||||
|
from litellm._logging import verbose_logger
|
||||||
|
|
||||||
|
|
||||||
def print_verbose(print_statement):
|
def print_verbose(print_statement):
|
||||||
try:
|
try:
|
||||||
|
verbose_logger.debug(print_statement)
|
||||||
if litellm.set_verbose:
|
if litellm.set_verbose:
|
||||||
print(print_statement) # noqa
|
print(print_statement) # noqa
|
||||||
except:
|
except:
|
||||||
|
@ -129,11 +131,13 @@ class S3Cache(BaseCache):
|
||||||
s3_aws_secret_access_key=None,
|
s3_aws_secret_access_key=None,
|
||||||
s3_aws_session_token=None,
|
s3_aws_session_token=None,
|
||||||
s3_config=None,
|
s3_config=None,
|
||||||
|
s3_path=None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
import boto3
|
import boto3
|
||||||
|
|
||||||
self.bucket_name = s3_bucket_name
|
self.bucket_name = s3_bucket_name
|
||||||
|
self.key_prefix = s3_path.rstrip("/") + "/" if s3_path else ""
|
||||||
# Create an S3 client with custom endpoint URL
|
# Create an S3 client with custom endpoint URL
|
||||||
self.s3_client = boto3.client(
|
self.s3_client = boto3.client(
|
||||||
"s3",
|
"s3",
|
||||||
|
@ -155,6 +159,8 @@ class S3Cache(BaseCache):
|
||||||
ttl = kwargs.get("ttl", None)
|
ttl = kwargs.get("ttl", None)
|
||||||
# Convert value to JSON before storing in S3
|
# Convert value to JSON before storing in S3
|
||||||
serialized_value = json.dumps(value)
|
serialized_value = json.dumps(value)
|
||||||
|
key = self.key_prefix + key
|
||||||
|
|
||||||
if ttl is not None:
|
if ttl is not None:
|
||||||
cache_control = f"immutable, max-age={ttl}, s-maxage={ttl}"
|
cache_control = f"immutable, max-age={ttl}, s-maxage={ttl}"
|
||||||
import datetime
|
import datetime
|
||||||
|
@ -171,7 +177,7 @@ class S3Cache(BaseCache):
|
||||||
CacheControl=cache_control,
|
CacheControl=cache_control,
|
||||||
ContentType="application/json",
|
ContentType="application/json",
|
||||||
ContentLanguage="en",
|
ContentLanguage="en",
|
||||||
ContentDisposition=f"inline; filename=\"{key}.json\""
|
ContentDisposition=f'inline; filename="{key}.json"',
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
cache_control = "immutable, max-age=31536000, s-maxage=31536000"
|
cache_control = "immutable, max-age=31536000, s-maxage=31536000"
|
||||||
|
@ -183,7 +189,7 @@ class S3Cache(BaseCache):
|
||||||
CacheControl=cache_control,
|
CacheControl=cache_control,
|
||||||
ContentType="application/json",
|
ContentType="application/json",
|
||||||
ContentLanguage="en",
|
ContentLanguage="en",
|
||||||
ContentDisposition=f"inline; filename=\"{key}.json\""
|
ContentDisposition=f'inline; filename="{key}.json"',
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# NON blocking - notify users S3 is throwing an exception
|
# NON blocking - notify users S3 is throwing an exception
|
||||||
|
@ -193,6 +199,8 @@ class S3Cache(BaseCache):
|
||||||
import boto3, botocore
|
import boto3, botocore
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
key = self.key_prefix + key
|
||||||
|
|
||||||
print_verbose(f"Get S3 Cache: key: {key}")
|
print_verbose(f"Get S3 Cache: key: {key}")
|
||||||
# Download the data from S3
|
# Download the data from S3
|
||||||
cached_response = self.s3_client.get_object(
|
cached_response = self.s3_client.get_object(
|
||||||
|
|
|
@ -8,6 +8,8 @@ from datetime import datetime
|
||||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||||
import traceback
|
import traceback
|
||||||
from packaging.version import Version
|
from packaging.version import Version
|
||||||
|
from litellm._logging import verbose_logger
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
|
||||||
class LangFuseLogger:
|
class LangFuseLogger:
|
||||||
|
@ -33,6 +35,26 @@ class LangFuseLogger:
|
||||||
debug=self.langfuse_debug,
|
debug=self.langfuse_debug,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if os.getenv("UPSTREAM_LANGFUSE_SECRET_KEY") is not None:
|
||||||
|
self.upstream_langfuse_secret_key = os.getenv(
|
||||||
|
"UPSTREAM_LANGFUSE_SECRET_KEY"
|
||||||
|
)
|
||||||
|
self.upstream_langfuse_public_key = os.getenv(
|
||||||
|
"UPSTREAM_LANGFUSE_PUBLIC_KEY"
|
||||||
|
)
|
||||||
|
self.upstream_langfuse_host = os.getenv("UPSTREAM_LANGFUSE_HOST")
|
||||||
|
self.upstream_langfuse_release = os.getenv("UPSTREAM_LANGFUSE_RELEASE")
|
||||||
|
self.upstream_langfuse_debug = os.getenv("UPSTREAM_LANGFUSE_DEBUG")
|
||||||
|
self.upstream_langfuse = Langfuse(
|
||||||
|
public_key=self.upstream_langfuse_public_key,
|
||||||
|
secret_key=self.upstream_langfuse_secret_key,
|
||||||
|
host=self.upstream_langfuse_host,
|
||||||
|
release=self.upstream_langfuse_release,
|
||||||
|
debug=self.upstream_langfuse_debug,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.upstream_langfuse = None
|
||||||
|
|
||||||
def log_event(
|
def log_event(
|
||||||
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
|
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
|
||||||
):
|
):
|
||||||
|
@ -62,11 +84,15 @@ class LangFuseLogger:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# end of processing langfuse ########################
|
# end of processing langfuse ########################
|
||||||
input = prompt
|
if kwargs.get("call_type", None) == "embedding" or isinstance(
|
||||||
output = response_obj["choices"][0]["message"].json()
|
response_obj, litellm.EmbeddingResponse
|
||||||
print_verbose(
|
):
|
||||||
f"OUTPUT IN LANGFUSE: {output}; original: {response_obj['choices'][0]['message']}"
|
input = prompt
|
||||||
)
|
output = response_obj["data"]
|
||||||
|
else:
|
||||||
|
input = prompt
|
||||||
|
output = response_obj["choices"][0]["message"].json()
|
||||||
|
print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
|
||||||
self._log_langfuse_v2(
|
self._log_langfuse_v2(
|
||||||
user_id,
|
user_id,
|
||||||
metadata,
|
metadata,
|
||||||
|
@ -77,6 +103,7 @@ class LangFuseLogger:
|
||||||
optional_params,
|
optional_params,
|
||||||
input,
|
input,
|
||||||
response_obj,
|
response_obj,
|
||||||
|
print_verbose,
|
||||||
) if self._is_langfuse_v2() else self._log_langfuse_v1(
|
) if self._is_langfuse_v2() else self._log_langfuse_v1(
|
||||||
user_id,
|
user_id,
|
||||||
metadata,
|
metadata,
|
||||||
|
@ -93,6 +120,7 @@ class LangFuseLogger:
|
||||||
print_verbose(
|
print_verbose(
|
||||||
f"Langfuse Layer Logging - final response object: {response_obj}"
|
f"Langfuse Layer Logging - final response object: {response_obj}"
|
||||||
)
|
)
|
||||||
|
verbose_logger.info(f"Langfuse Layer Logging - logging success")
|
||||||
except:
|
except:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}")
|
print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||||
|
@ -165,28 +193,39 @@ class LangFuseLogger:
|
||||||
optional_params,
|
optional_params,
|
||||||
input,
|
input,
|
||||||
response_obj,
|
response_obj,
|
||||||
|
print_verbose,
|
||||||
):
|
):
|
||||||
import langfuse
|
import langfuse
|
||||||
|
|
||||||
tags = []
|
tags = []
|
||||||
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
|
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
|
||||||
|
supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")
|
||||||
|
|
||||||
|
print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")
|
||||||
|
|
||||||
|
generation_name = metadata.get("generation_name", None)
|
||||||
|
if generation_name is None:
|
||||||
|
# just log `litellm-{call_type}` as the generation name
|
||||||
|
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||||
trace_params = {
|
trace_params = {
|
||||||
"name": metadata.get("generation_name", "litellm-completion"),
|
"name": generation_name,
|
||||||
"input": input,
|
"input": input,
|
||||||
"output": output,
|
"output": output,
|
||||||
"user_id": metadata.get("trace_user_id", user_id),
|
"user_id": metadata.get("trace_user_id", user_id),
|
||||||
"id": metadata.get("trace_id", None),
|
"id": metadata.get("trace_id", None),
|
||||||
}
|
}
|
||||||
|
cost = kwargs["response_cost"]
|
||||||
|
print_verbose(f"trace: {cost}")
|
||||||
if supports_tags:
|
if supports_tags:
|
||||||
for key, value in metadata.items():
|
for key, value in metadata.items():
|
||||||
tags.append(f"{key}:{value}")
|
tags.append(f"{key}:{value}")
|
||||||
|
if "cache_hit" in kwargs:
|
||||||
|
tags.append(f"cache_hit:{kwargs['cache_hit']}")
|
||||||
trace_params.update({"tags": tags})
|
trace_params.update({"tags": tags})
|
||||||
|
|
||||||
trace = self.Langfuse.trace(**trace_params)
|
trace = self.Langfuse.trace(**trace_params)
|
||||||
|
|
||||||
trace.generation(
|
trace.generation(
|
||||||
name=metadata.get("generation_name", "litellm-completion"),
|
name=generation_name,
|
||||||
id=metadata.get("generation_id", None),
|
id=metadata.get("generation_id", None),
|
||||||
startTime=start_time,
|
startTime=start_time,
|
||||||
endTime=end_time,
|
endTime=end_time,
|
||||||
|
@ -197,6 +236,30 @@ class LangFuseLogger:
|
||||||
usage={
|
usage={
|
||||||
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
|
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
|
||||||
"completion_tokens": response_obj["usage"]["completion_tokens"],
|
"completion_tokens": response_obj["usage"]["completion_tokens"],
|
||||||
|
"total_cost": cost if supports_costs else None,
|
||||||
},
|
},
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if self.upstream_langfuse:
|
||||||
|
# user wants to log RAW LLM API call in 2nd langfuse project
|
||||||
|
# key change - model=response_obj["model"], instead of input model used
|
||||||
|
# this is useful for litellm proxy, where users need to see analytics on their LLM Endpoints
|
||||||
|
|
||||||
|
trace = self.upstream_langfuse.trace(**trace_params)
|
||||||
|
|
||||||
|
trace.generation(
|
||||||
|
name=generation_name,
|
||||||
|
id=metadata.get("generation_id", None),
|
||||||
|
startTime=start_time,
|
||||||
|
endTime=end_time,
|
||||||
|
model=response_obj["model"],
|
||||||
|
modelParameters=optional_params,
|
||||||
|
input=input,
|
||||||
|
output=output,
|
||||||
|
usage={
|
||||||
|
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
|
||||||
|
"completion_tokens": response_obj["usage"]["completion_tokens"],
|
||||||
|
},
|
||||||
|
metadata=metadata,
|
||||||
|
)
|
||||||
|
|
|
@ -13,19 +13,22 @@ class LangsmithLogger:
|
||||||
# Class variables or attributes
|
# Class variables or attributes
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
|
self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
|
||||||
|
self.langsmith_project = os.getenv("LANGSMITH_PROJECT", "litellm-completion")
|
||||||
|
self.langsmith_default_run_name = os.getenv(
|
||||||
|
"LANGSMITH_DEFAULT_RUN_NAME", "LLMRun"
|
||||||
|
)
|
||||||
|
|
||||||
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
||||||
# Method definition
|
# Method definition
|
||||||
# inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb
|
# inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb
|
||||||
metadata = {}
|
metadata = kwargs.get('litellm_params', {}).get("metadata", {}) or {} # if metadata is None
|
||||||
if "litellm_params" in kwargs:
|
|
||||||
metadata = kwargs["litellm_params"].get("metadata", {})
|
|
||||||
# set project name and run_name for langsmith logging
|
# set project name and run_name for langsmith logging
|
||||||
# users can pass project_name and run name to litellm.completion()
|
# users can pass project_name and run name to litellm.completion()
|
||||||
# Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"})
|
# Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"})
|
||||||
# if not set litellm will use default project_name = litellm-completion, run_name = LLMRun
|
# if not set litellm will fallback to the environment variable LANGSMITH_PROJECT, then to the default project_name = litellm-completion, run_name = LLMRun
|
||||||
project_name = metadata.get("project_name", "litellm-completion")
|
project_name = metadata.get("project_name", self.langsmith_project)
|
||||||
run_name = metadata.get("run_name", "LLMRun")
|
run_name = metadata.get("run_name", self.langsmith_default_run_name)
|
||||||
print_verbose(
|
print_verbose(
|
||||||
f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
|
f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
|
||||||
)
|
)
|
||||||
|
|
|
@ -8,7 +8,7 @@ dotenv.load_dotenv() # Loading env variables using dotenv
|
||||||
import traceback
|
import traceback
|
||||||
import datetime, subprocess, sys
|
import datetime, subprocess, sys
|
||||||
import litellm, uuid
|
import litellm, uuid
|
||||||
from litellm._logging import print_verbose
|
from litellm._logging import print_verbose, verbose_logger
|
||||||
|
|
||||||
|
|
||||||
class S3Logger:
|
class S3Logger:
|
||||||
|
@ -31,7 +31,9 @@ class S3Logger:
|
||||||
import boto3
|
import boto3
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print_verbose("in init s3 logger")
|
verbose_logger.debug(
|
||||||
|
f"in init s3 logger - s3_callback_params {litellm.s3_callback_params}"
|
||||||
|
)
|
||||||
|
|
||||||
if litellm.s3_callback_params is not None:
|
if litellm.s3_callback_params is not None:
|
||||||
# read in .env variables - example os.environ/AWS_BUCKET_NAME
|
# read in .env variables - example os.environ/AWS_BUCKET_NAME
|
||||||
|
@ -42,7 +44,7 @@ class S3Logger:
|
||||||
s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name")
|
s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name")
|
||||||
s3_region_name = litellm.s3_callback_params.get("s3_region_name")
|
s3_region_name = litellm.s3_callback_params.get("s3_region_name")
|
||||||
s3_api_version = litellm.s3_callback_params.get("s3_api_version")
|
s3_api_version = litellm.s3_callback_params.get("s3_api_version")
|
||||||
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl")
|
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl", True)
|
||||||
s3_verify = litellm.s3_callback_params.get("s3_verify")
|
s3_verify = litellm.s3_callback_params.get("s3_verify")
|
||||||
s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url")
|
s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url")
|
||||||
s3_aws_access_key_id = litellm.s3_callback_params.get(
|
s3_aws_access_key_id = litellm.s3_callback_params.get(
|
||||||
|
@ -59,6 +61,7 @@ class S3Logger:
|
||||||
|
|
||||||
self.bucket_name = s3_bucket_name
|
self.bucket_name = s3_bucket_name
|
||||||
self.s3_path = s3_path
|
self.s3_path = s3_path
|
||||||
|
verbose_logger.debug(f"s3 logger using endpoint url {s3_endpoint_url}")
|
||||||
# Create an S3 client with custom endpoint URL
|
# Create an S3 client with custom endpoint URL
|
||||||
self.s3_client = boto3.client(
|
self.s3_client = boto3.client(
|
||||||
"s3",
|
"s3",
|
||||||
|
@ -84,7 +87,9 @@ class S3Logger:
|
||||||
|
|
||||||
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
||||||
try:
|
try:
|
||||||
print_verbose(f"s3 Logging - Enters logging function for model {kwargs}")
|
verbose_logger.debug(
|
||||||
|
f"s3 Logging - Enters logging function for model {kwargs}"
|
||||||
|
)
|
||||||
|
|
||||||
# construct payload to send to s3
|
# construct payload to send to s3
|
||||||
# follows the same params as langfuse.py
|
# follows the same params as langfuse.py
|
||||||
|
@ -129,6 +134,7 @@ class S3Logger:
|
||||||
+ "-time="
|
+ "-time="
|
||||||
+ str(start_time)
|
+ str(start_time)
|
||||||
) # we need the s3 key to include the time, so we log cache hits too
|
) # we need the s3 key to include the time, so we log cache hits too
|
||||||
|
s3_object_key += ".json"
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
@ -151,5 +157,5 @@ class S3Logger:
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
print_verbose(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
|
verbose_logger.debug(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -78,7 +78,7 @@ class AnthropicConfig:
|
||||||
|
|
||||||
|
|
||||||
# makes headers for API call
|
# makes headers for API call
|
||||||
def validate_environment(api_key):
|
def validate_environment(api_key, user_headers):
|
||||||
if api_key is None:
|
if api_key is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params"
|
"Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params"
|
||||||
|
@ -89,6 +89,8 @@ def validate_environment(api_key):
|
||||||
"content-type": "application/json",
|
"content-type": "application/json",
|
||||||
"x-api-key": api_key,
|
"x-api-key": api_key,
|
||||||
}
|
}
|
||||||
|
if user_headers is not None and isinstance(user_headers, dict):
|
||||||
|
headers = {**headers, **user_headers}
|
||||||
return headers
|
return headers
|
||||||
|
|
||||||
|
|
||||||
|
@ -105,8 +107,9 @@ def completion(
|
||||||
optional_params=None,
|
optional_params=None,
|
||||||
litellm_params=None,
|
litellm_params=None,
|
||||||
logger_fn=None,
|
logger_fn=None,
|
||||||
|
headers={},
|
||||||
):
|
):
|
||||||
headers = validate_environment(api_key)
|
headers = validate_environment(api_key, headers)
|
||||||
if model in custom_prompt_dict:
|
if model in custom_prompt_dict:
|
||||||
# check if the model has a registered custom prompt
|
# check if the model has a registered custom prompt
|
||||||
model_prompt_details = custom_prompt_dict[model]
|
model_prompt_details = custom_prompt_dict[model]
|
||||||
|
@ -139,7 +142,11 @@ def completion(
|
||||||
logging_obj.pre_call(
|
logging_obj.pre_call(
|
||||||
input=prompt,
|
input=prompt,
|
||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
additional_args={"complete_input_dict": data, "api_base": api_base},
|
additional_args={
|
||||||
|
"complete_input_dict": data,
|
||||||
|
"api_base": api_base,
|
||||||
|
"headers": headers,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
## COMPLETION CALL
|
## COMPLETION CALL
|
||||||
|
|
|
@ -629,12 +629,23 @@ class AzureChatCompletion(BaseLLM):
|
||||||
client_session = litellm.aclient_session or httpx.AsyncClient(
|
client_session = litellm.aclient_session or httpx.AsyncClient(
|
||||||
transport=AsyncCustomHTTPTransport(),
|
transport=AsyncCustomHTTPTransport(),
|
||||||
)
|
)
|
||||||
openai_aclient = AsyncAzureOpenAI(
|
azure_client = AsyncAzureOpenAI(
|
||||||
http_client=client_session, **azure_client_params
|
http_client=client_session, **azure_client_params
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
openai_aclient = client
|
azure_client = client
|
||||||
response = await openai_aclient.images.generate(**data, timeout=timeout)
|
## LOGGING
|
||||||
|
logging_obj.pre_call(
|
||||||
|
input=data["prompt"],
|
||||||
|
api_key=azure_client.api_key,
|
||||||
|
additional_args={
|
||||||
|
"headers": {"api_key": azure_client.api_key},
|
||||||
|
"api_base": azure_client._base_url._uri_reference,
|
||||||
|
"acompletion": True,
|
||||||
|
"complete_input_dict": data,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
response = await azure_client.images.generate(**data, timeout=timeout)
|
||||||
stringified_response = response.model_dump()
|
stringified_response = response.model_dump()
|
||||||
## LOGGING
|
## LOGGING
|
||||||
logging_obj.post_call(
|
logging_obj.post_call(
|
||||||
|
@ -719,7 +730,7 @@ class AzureChatCompletion(BaseLLM):
|
||||||
input=prompt,
|
input=prompt,
|
||||||
api_key=azure_client.api_key,
|
api_key=azure_client.api_key,
|
||||||
additional_args={
|
additional_args={
|
||||||
"headers": {"Authorization": f"Bearer {azure_client.api_key}"},
|
"headers": {"api_key": azure_client.api_key},
|
||||||
"api_base": azure_client._base_url._uri_reference,
|
"api_base": azure_client._base_url._uri_reference,
|
||||||
"acompletion": False,
|
"acompletion": False,
|
||||||
"complete_input_dict": data,
|
"complete_input_dict": data,
|
||||||
|
|
|
@ -659,9 +659,16 @@ def completion(
|
||||||
)
|
)
|
||||||
|
|
||||||
## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
|
## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
|
||||||
prompt_tokens = len(encoding.encode(prompt))
|
prompt_tokens = response_metadata.get(
|
||||||
completion_tokens = len(
|
"x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
|
||||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
)
|
||||||
|
completion_tokens = response_metadata.get(
|
||||||
|
"x-amzn-bedrock-output-token-count",
|
||||||
|
len(
|
||||||
|
encoding.encode(
|
||||||
|
model_response["choices"][0]["message"].get("content", "")
|
||||||
|
)
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
model_response["created"] = int(time.time())
|
model_response["created"] = int(time.time())
|
||||||
|
@ -672,6 +679,8 @@ def completion(
|
||||||
total_tokens=prompt_tokens + completion_tokens,
|
total_tokens=prompt_tokens + completion_tokens,
|
||||||
)
|
)
|
||||||
model_response.usage = usage
|
model_response.usage = usage
|
||||||
|
model_response._hidden_params["region_name"] = client.meta.region_name
|
||||||
|
print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
|
||||||
return model_response
|
return model_response
|
||||||
except BedrockError as e:
|
except BedrockError as e:
|
||||||
exception_mapping_worked = True
|
exception_mapping_worked = True
|
||||||
|
@ -693,6 +702,11 @@ def _embedding_func_single(
|
||||||
encoding=None,
|
encoding=None,
|
||||||
logging_obj=None,
|
logging_obj=None,
|
||||||
):
|
):
|
||||||
|
if isinstance(input, str) is False:
|
||||||
|
raise BedrockError(
|
||||||
|
message="Bedrock Embedding API input must be type str | List[str]",
|
||||||
|
status_code=400,
|
||||||
|
)
|
||||||
# logic for parsing in - calling - parsing out model embedding calls
|
# logic for parsing in - calling - parsing out model embedding calls
|
||||||
## FORMAT EMBEDDING INPUT ##
|
## FORMAT EMBEDDING INPUT ##
|
||||||
provider = model.split(".")[0]
|
provider = model.split(".")[0]
|
||||||
|
@ -786,7 +800,8 @@ def embedding(
|
||||||
aws_role_name=aws_role_name,
|
aws_role_name=aws_role_name,
|
||||||
aws_session_name=aws_session_name,
|
aws_session_name=aws_session_name,
|
||||||
)
|
)
|
||||||
if type(input) == str:
|
if isinstance(input, str):
|
||||||
|
## Embedding Call
|
||||||
embeddings = [
|
embeddings = [
|
||||||
_embedding_func_single(
|
_embedding_func_single(
|
||||||
model,
|
model,
|
||||||
|
@ -796,8 +811,8 @@ def embedding(
|
||||||
logging_obj=logging_obj,
|
logging_obj=logging_obj,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
else:
|
elif isinstance(input, list):
|
||||||
## Embedding Call
|
## Embedding Call - assuming this is a List[str]
|
||||||
embeddings = [
|
embeddings = [
|
||||||
_embedding_func_single(
|
_embedding_func_single(
|
||||||
model,
|
model,
|
||||||
|
@ -808,6 +823,12 @@ def embedding(
|
||||||
)
|
)
|
||||||
for i in input
|
for i in input
|
||||||
] # [TODO]: make these parallel calls
|
] # [TODO]: make these parallel calls
|
||||||
|
else:
|
||||||
|
# enters this branch if input = int, ex. input=2
|
||||||
|
raise BedrockError(
|
||||||
|
message="Bedrock Embedding API input must be type str | List[str]",
|
||||||
|
status_code=400,
|
||||||
|
)
|
||||||
|
|
||||||
## Populate OpenAI compliant dictionary
|
## Populate OpenAI compliant dictionary
|
||||||
embedding_response = []
|
embedding_response = []
|
||||||
|
|
|
@ -43,7 +43,7 @@ class AsyncCustomHTTPTransport(httpx.AsyncHTTPTransport):
|
||||||
request=request,
|
request=request,
|
||||||
)
|
)
|
||||||
|
|
||||||
time.sleep(int(response.headers.get("retry-after")) or 10)
|
await asyncio.sleep(int(response.headers.get("retry-after") or 10))
|
||||||
response = await super().handle_async_request(request)
|
response = await super().handle_async_request(request)
|
||||||
await response.aread()
|
await response.aread()
|
||||||
|
|
||||||
|
@ -95,7 +95,6 @@ class CustomHTTPTransport(httpx.HTTPTransport):
|
||||||
request.method = "GET"
|
request.method = "GET"
|
||||||
response = super().handle_request(request)
|
response = super().handle_request(request)
|
||||||
response.read()
|
response.read()
|
||||||
|
|
||||||
timeout_secs: int = 120
|
timeout_secs: int = 120
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
while response.json()["status"] not in ["succeeded", "failed"]:
|
while response.json()["status"] not in ["succeeded", "failed"]:
|
||||||
|
@ -112,11 +111,9 @@ class CustomHTTPTransport(httpx.HTTPTransport):
|
||||||
content=json.dumps(timeout).encode("utf-8"),
|
content=json.dumps(timeout).encode("utf-8"),
|
||||||
request=request,
|
request=request,
|
||||||
)
|
)
|
||||||
|
time.sleep(int(response.headers.get("retry-after", None) or 10))
|
||||||
time.sleep(int(response.headers.get("retry-after")) or 10)
|
|
||||||
response = super().handle_request(request)
|
response = super().handle_request(request)
|
||||||
response.read()
|
response.read()
|
||||||
|
|
||||||
if response.json()["status"] == "failed":
|
if response.json()["status"] == "failed":
|
||||||
error_data = response.json()
|
error_data = response.json()
|
||||||
return httpx.Response(
|
return httpx.Response(
|
||||||
|
|
|
@ -120,9 +120,7 @@ def completion(
|
||||||
|
|
||||||
## Load Config
|
## Load Config
|
||||||
inference_params = copy.deepcopy(optional_params)
|
inference_params = copy.deepcopy(optional_params)
|
||||||
inference_params.pop(
|
stream = inference_params.pop("stream", None)
|
||||||
"stream", None
|
|
||||||
) # palm does not support streaming, so we handle this by fake streaming in main.py
|
|
||||||
config = litellm.GeminiConfig.get_config()
|
config = litellm.GeminiConfig.get_config()
|
||||||
for k, v in config.items():
|
for k, v in config.items():
|
||||||
if (
|
if (
|
||||||
|
@ -139,10 +137,18 @@ def completion(
|
||||||
## COMPLETION CALL
|
## COMPLETION CALL
|
||||||
try:
|
try:
|
||||||
_model = genai.GenerativeModel(f"models/{model}")
|
_model = genai.GenerativeModel(f"models/{model}")
|
||||||
response = _model.generate_content(
|
if stream != True:
|
||||||
contents=prompt,
|
response = _model.generate_content(
|
||||||
generation_config=genai.types.GenerationConfig(**inference_params),
|
contents=prompt,
|
||||||
)
|
generation_config=genai.types.GenerationConfig(**inference_params),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
response = _model.generate_content(
|
||||||
|
contents=prompt,
|
||||||
|
generation_config=genai.types.GenerationConfig(**inference_params),
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise GeminiError(
|
raise GeminiError(
|
||||||
message=str(e),
|
message=str(e),
|
||||||
|
@ -177,16 +183,20 @@ def completion(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
completion_response = model_response["choices"][0]["message"].get("content")
|
completion_response = model_response["choices"][0]["message"].get("content")
|
||||||
if completion_response is None:
|
if completion_response is None:
|
||||||
raise Exception
|
raise Exception
|
||||||
except:
|
except:
|
||||||
original_response = f"response: {response}"
|
original_response = f"response: {response}"
|
||||||
if hasattr(response, "candidates"):
|
if hasattr(response, "candidates"):
|
||||||
original_response = f"response: {response.candidates}"
|
original_response = f"response: {response.candidates}"
|
||||||
if "SAFETY" in original_response:
|
if "SAFETY" in original_response:
|
||||||
original_response += "\nThe candidate content was flagged for safety reasons."
|
original_response += (
|
||||||
|
"\nThe candidate content was flagged for safety reasons."
|
||||||
|
)
|
||||||
elif "RECITATION" in original_response:
|
elif "RECITATION" in original_response:
|
||||||
original_response += "\nThe candidate content was flagged for recitation reasons."
|
original_response += (
|
||||||
|
"\nThe candidate content was flagged for recitation reasons."
|
||||||
|
)
|
||||||
raise GeminiError(
|
raise GeminiError(
|
||||||
status_code=400,
|
status_code=400,
|
||||||
message=f"No response received. Original response - {original_response}",
|
message=f"No response received. Original response - {original_response}",
|
||||||
|
|
|
@ -145,8 +145,8 @@ def get_ollama_response(
|
||||||
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
|
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||||
optional_params[k] = v
|
optional_params[k] = v
|
||||||
|
|
||||||
optional_params["stream"] = optional_params.get("stream", False)
|
stream = optional_params.pop("stream", False)
|
||||||
data = {"model": model, "messages": messages, **optional_params}
|
data = {"model": model, "messages": messages, "options": optional_params}
|
||||||
## LOGGING
|
## LOGGING
|
||||||
logging_obj.pre_call(
|
logging_obj.pre_call(
|
||||||
input=None,
|
input=None,
|
||||||
|
@ -159,7 +159,7 @@ def get_ollama_response(
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
if acompletion is True:
|
if acompletion is True:
|
||||||
if optional_params.get("stream", False) == True:
|
if stream == True:
|
||||||
response = ollama_async_streaming(
|
response = ollama_async_streaming(
|
||||||
url=url,
|
url=url,
|
||||||
data=data,
|
data=data,
|
||||||
|
@ -176,7 +176,7 @@ def get_ollama_response(
|
||||||
logging_obj=logging_obj,
|
logging_obj=logging_obj,
|
||||||
)
|
)
|
||||||
return response
|
return response
|
||||||
elif optional_params.get("stream", False) == True:
|
elif stream == True:
|
||||||
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
|
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
|
||||||
|
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
|
@ -220,8 +220,10 @@ def get_ollama_response(
|
||||||
model_response["choices"][0]["message"] = response_json["message"]
|
model_response["choices"][0]["message"] = response_json["message"]
|
||||||
model_response["created"] = int(time.time())
|
model_response["created"] = int(time.time())
|
||||||
model_response["model"] = "ollama/" + model
|
model_response["model"] = "ollama/" + model
|
||||||
prompt_tokens = response_json["prompt_eval_count"] # type: ignore
|
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages)) # type: ignore
|
||||||
completion_tokens = response_json["eval_count"]
|
completion_tokens = response_json.get(
|
||||||
|
"eval_count", litellm.token_counter(text=response_json["message"])
|
||||||
|
)
|
||||||
model_response["usage"] = litellm.Usage(
|
model_response["usage"] = litellm.Usage(
|
||||||
prompt_tokens=prompt_tokens,
|
prompt_tokens=prompt_tokens,
|
||||||
completion_tokens=completion_tokens,
|
completion_tokens=completion_tokens,
|
||||||
|
@ -320,8 +322,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
|
||||||
model_response["choices"][0]["message"] = response_json["message"]
|
model_response["choices"][0]["message"] = response_json["message"]
|
||||||
model_response["created"] = int(time.time())
|
model_response["created"] = int(time.time())
|
||||||
model_response["model"] = "ollama/" + data["model"]
|
model_response["model"] = "ollama/" + data["model"]
|
||||||
prompt_tokens = response_json["prompt_eval_count"] # type: ignore
|
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore
|
||||||
completion_tokens = response_json["eval_count"]
|
completion_tokens = response_json.get(
|
||||||
|
"eval_count", litellm.token_counter(text=response_json["message"])
|
||||||
|
)
|
||||||
model_response["usage"] = litellm.Usage(
|
model_response["usage"] = litellm.Usage(
|
||||||
prompt_tokens=prompt_tokens,
|
prompt_tokens=prompt_tokens,
|
||||||
completion_tokens=completion_tokens,
|
completion_tokens=completion_tokens,
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from typing import Optional, Union, Any
|
from typing import Optional, Union, Any
|
||||||
import types, time, json
|
import types, time, json, traceback
|
||||||
import httpx
|
import httpx
|
||||||
from .base import BaseLLM
|
from .base import BaseLLM
|
||||||
from litellm.utils import (
|
from litellm.utils import (
|
||||||
|
@ -221,6 +221,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
headers: Optional[dict] = None,
|
headers: Optional[dict] = None,
|
||||||
custom_prompt_dict: dict = {},
|
custom_prompt_dict: dict = {},
|
||||||
client=None,
|
client=None,
|
||||||
|
organization: Optional[str] = None,
|
||||||
):
|
):
|
||||||
super().completion()
|
super().completion()
|
||||||
exception_mapping_worked = False
|
exception_mapping_worked = False
|
||||||
|
@ -254,6 +255,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
client=client,
|
client=client,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
|
organization=organization,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return self.acompletion(
|
return self.acompletion(
|
||||||
|
@ -266,6 +268,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
client=client,
|
client=client,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
|
organization=organization,
|
||||||
)
|
)
|
||||||
elif optional_params.get("stream", False):
|
elif optional_params.get("stream", False):
|
||||||
return self.streaming(
|
return self.streaming(
|
||||||
|
@ -278,6 +281,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
client=client,
|
client=client,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
|
organization=organization,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if not isinstance(max_retries, int):
|
if not isinstance(max_retries, int):
|
||||||
|
@ -291,6 +295,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
http_client=litellm.client_session,
|
http_client=litellm.client_session,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
|
organization=organization,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
openai_client = client
|
openai_client = client
|
||||||
|
@ -349,7 +354,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
if hasattr(e, "status_code"):
|
if hasattr(e, "status_code"):
|
||||||
raise OpenAIError(status_code=e.status_code, message=str(e))
|
raise OpenAIError(status_code=e.status_code, message=str(e))
|
||||||
else:
|
else:
|
||||||
raise OpenAIError(status_code=500, message=str(e))
|
raise OpenAIError(status_code=500, message=traceback.format_exc())
|
||||||
|
|
||||||
async def acompletion(
|
async def acompletion(
|
||||||
self,
|
self,
|
||||||
|
@ -358,6 +363,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
timeout: float,
|
timeout: float,
|
||||||
api_key: Optional[str] = None,
|
api_key: Optional[str] = None,
|
||||||
api_base: Optional[str] = None,
|
api_base: Optional[str] = None,
|
||||||
|
organization: Optional[str] = None,
|
||||||
client=None,
|
client=None,
|
||||||
max_retries=None,
|
max_retries=None,
|
||||||
logging_obj=None,
|
logging_obj=None,
|
||||||
|
@ -372,6 +378,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
http_client=litellm.aclient_session,
|
http_client=litellm.aclient_session,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
|
organization=organization,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
openai_aclient = client
|
openai_aclient = client
|
||||||
|
@ -412,6 +419,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
model: str,
|
model: str,
|
||||||
api_key: Optional[str] = None,
|
api_key: Optional[str] = None,
|
||||||
api_base: Optional[str] = None,
|
api_base: Optional[str] = None,
|
||||||
|
organization: Optional[str] = None,
|
||||||
client=None,
|
client=None,
|
||||||
max_retries=None,
|
max_retries=None,
|
||||||
headers=None,
|
headers=None,
|
||||||
|
@ -423,6 +431,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
http_client=litellm.client_session,
|
http_client=litellm.client_session,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
|
organization=organization,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
openai_client = client
|
openai_client = client
|
||||||
|
@ -454,6 +463,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
model: str,
|
model: str,
|
||||||
api_key: Optional[str] = None,
|
api_key: Optional[str] = None,
|
||||||
api_base: Optional[str] = None,
|
api_base: Optional[str] = None,
|
||||||
|
organization: Optional[str] = None,
|
||||||
client=None,
|
client=None,
|
||||||
max_retries=None,
|
max_retries=None,
|
||||||
headers=None,
|
headers=None,
|
||||||
|
@ -467,6 +477,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
http_client=litellm.aclient_session,
|
http_client=litellm.aclient_session,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
|
organization=organization,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
openai_aclient = client
|
openai_aclient = client
|
||||||
|
@ -706,19 +717,34 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
|
|
||||||
## COMPLETION CALL
|
## COMPLETION CALL
|
||||||
response = openai_client.images.generate(**data, timeout=timeout) # type: ignore
|
response = openai_client.images.generate(**data, timeout=timeout) # type: ignore
|
||||||
|
response = response.model_dump() # type: ignore
|
||||||
## LOGGING
|
## LOGGING
|
||||||
logging_obj.post_call(
|
logging_obj.post_call(
|
||||||
input=input,
|
input=prompt,
|
||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
additional_args={"complete_input_dict": data},
|
additional_args={"complete_input_dict": data},
|
||||||
original_response=response,
|
original_response=response,
|
||||||
)
|
)
|
||||||
# return response
|
# return response
|
||||||
return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="image_generation") # type: ignore
|
return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation") # type: ignore
|
||||||
except OpenAIError as e:
|
except OpenAIError as e:
|
||||||
exception_mapping_worked = True
|
exception_mapping_worked = True
|
||||||
|
## LOGGING
|
||||||
|
logging_obj.post_call(
|
||||||
|
input=prompt,
|
||||||
|
api_key=api_key,
|
||||||
|
additional_args={"complete_input_dict": data},
|
||||||
|
original_response=str(e),
|
||||||
|
)
|
||||||
raise e
|
raise e
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
## LOGGING
|
||||||
|
logging_obj.post_call(
|
||||||
|
input=prompt,
|
||||||
|
api_key=api_key,
|
||||||
|
additional_args={"complete_input_dict": data},
|
||||||
|
original_response=str(e),
|
||||||
|
)
|
||||||
if hasattr(e, "status_code"):
|
if hasattr(e, "status_code"):
|
||||||
raise OpenAIError(status_code=e.status_code, message=str(e))
|
raise OpenAIError(status_code=e.status_code, message=str(e))
|
||||||
else:
|
else:
|
||||||
|
@ -733,8 +759,11 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
messages: Optional[list] = None,
|
messages: Optional[list] = None,
|
||||||
input: Optional[list] = None,
|
input: Optional[list] = None,
|
||||||
prompt: Optional[str] = None,
|
prompt: Optional[str] = None,
|
||||||
|
organization: Optional[str] = None,
|
||||||
):
|
):
|
||||||
client = AsyncOpenAI(api_key=api_key, timeout=timeout)
|
client = AsyncOpenAI(
|
||||||
|
api_key=api_key, timeout=timeout, organization=organization
|
||||||
|
)
|
||||||
if model is None and mode != "image_generation":
|
if model is None and mode != "image_generation":
|
||||||
raise Exception("model is not set")
|
raise Exception("model is not set")
|
||||||
|
|
||||||
|
|
|
@ -99,12 +99,16 @@ def ollama_pt(
|
||||||
|
|
||||||
|
|
||||||
def mistral_instruct_pt(messages):
|
def mistral_instruct_pt(messages):
|
||||||
|
# Following the Mistral example's https://huggingface.co/docs/transformers/main/chat_templating
|
||||||
prompt = custom_prompt(
|
prompt = custom_prompt(
|
||||||
initial_prompt_value="<s>",
|
initial_prompt_value="<s>",
|
||||||
role_dict={
|
role_dict={
|
||||||
"system": {"pre_message": "[INST]", "post_message": "[/INST]"},
|
"system": {
|
||||||
"user": {"pre_message": "[INST]", "post_message": "[/INST]"},
|
"pre_message": "[INST] \n",
|
||||||
"assistant": {"pre_message": "[INST]", "post_message": "[/INST]"},
|
"post_message": " [/INST]\n",
|
||||||
|
},
|
||||||
|
"user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
|
||||||
|
"assistant": {"pre_message": " ", "post_message": " "},
|
||||||
},
|
},
|
||||||
final_prompt_value="</s>",
|
final_prompt_value="</s>",
|
||||||
messages=messages,
|
messages=messages,
|
||||||
|
@ -372,6 +376,7 @@ def anthropic_pt(
|
||||||
You can "put words in Claude's mouth" by ending with an assistant message.
|
You can "put words in Claude's mouth" by ending with an assistant message.
|
||||||
See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth
|
See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth
|
||||||
"""
|
"""
|
||||||
|
|
||||||
class AnthropicConstants(Enum):
|
class AnthropicConstants(Enum):
|
||||||
HUMAN_PROMPT = "\n\nHuman: "
|
HUMAN_PROMPT = "\n\nHuman: "
|
||||||
AI_PROMPT = "\n\nAssistant: "
|
AI_PROMPT = "\n\nAssistant: "
|
||||||
|
@ -394,32 +399,35 @@ def anthropic_pt(
|
||||||
prompt += f"{AnthropicConstants.AI_PROMPT.value}"
|
prompt += f"{AnthropicConstants.AI_PROMPT.value}"
|
||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
|
|
||||||
def _load_image_from_url(image_url):
|
def _load_image_from_url(image_url):
|
||||||
try:
|
try:
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
except:
|
except:
|
||||||
raise Exception("gemini image conversion failed please run `pip install Pillow`")
|
raise Exception(
|
||||||
|
"gemini image conversion failed please run `pip install Pillow`"
|
||||||
|
)
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Send a GET request to the image URL
|
# Send a GET request to the image URL
|
||||||
response = requests.get(image_url)
|
response = requests.get(image_url)
|
||||||
response.raise_for_status() # Raise an exception for HTTP errors
|
response.raise_for_status() # Raise an exception for HTTP errors
|
||||||
|
|
||||||
# Check the response's content type to ensure it is an image
|
# Check the response's content type to ensure it is an image
|
||||||
content_type = response.headers.get('content-type')
|
content_type = response.headers.get("content-type")
|
||||||
if not content_type or 'image' not in content_type:
|
if not content_type or "image" not in content_type:
|
||||||
raise ValueError(f"URL does not point to a valid image (content-type: {content_type})")
|
raise ValueError(
|
||||||
|
f"URL does not point to a valid image (content-type: {content_type})"
|
||||||
|
)
|
||||||
|
|
||||||
# Load the image from the response content
|
# Load the image from the response content
|
||||||
return Image.open(BytesIO(response.content))
|
return Image.open(BytesIO(response.content))
|
||||||
|
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
print(f"Request failed: {e}")
|
raise Exception(f"Request failed: {e}")
|
||||||
except UnidentifiedImageError:
|
except Exception as e:
|
||||||
print("Cannot identify image file (it may not be a supported image format or might be corrupted).")
|
raise e
|
||||||
except ValueError as e:
|
|
||||||
print(e)
|
|
||||||
|
|
||||||
|
|
||||||
def _gemini_vision_convert_messages(messages: list):
|
def _gemini_vision_convert_messages(messages: list):
|
||||||
|
@ -437,10 +445,11 @@ def _gemini_vision_convert_messages(messages: list):
|
||||||
try:
|
try:
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
except:
|
except:
|
||||||
raise Exception("gemini image conversion failed please run `pip install Pillow`")
|
raise Exception(
|
||||||
|
"gemini image conversion failed please run `pip install Pillow`"
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
# given messages for gpt-4 vision, convert them for gemini
|
# given messages for gpt-4 vision, convert them for gemini
|
||||||
# https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb
|
# https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb
|
||||||
prompt = ""
|
prompt = ""
|
||||||
|
@ -589,7 +598,7 @@ def prompt_factory(
|
||||||
if custom_llm_provider == "ollama":
|
if custom_llm_provider == "ollama":
|
||||||
return ollama_pt(model=model, messages=messages)
|
return ollama_pt(model=model, messages=messages)
|
||||||
elif custom_llm_provider == "anthropic":
|
elif custom_llm_provider == "anthropic":
|
||||||
if any(_ in model for _ in ["claude-2.1","claude-v2:1"]):
|
if any(_ in model for _ in ["claude-2.1", "claude-v2:1"]):
|
||||||
return claude_2_1_pt(messages=messages)
|
return claude_2_1_pt(messages=messages)
|
||||||
else:
|
else:
|
||||||
return anthropic_pt(messages=messages)
|
return anthropic_pt(messages=messages)
|
||||||
|
|
|
@ -25,6 +25,46 @@ class SagemakerError(Exception):
|
||||||
) # Call the base class constructor with the parameters it needs
|
) # Call the base class constructor with the parameters it needs
|
||||||
|
|
||||||
|
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
class TokenIterator:
    """Iterate over a streamed response, yielding one parsed token per line.

    ``stream`` is any iterable of event dicts that carry raw bytes under
    ``event["PayloadPart"]["Bytes"]``. The bytes are accumulated in an
    in-memory buffer and consumed one newline-terminated line at a time, so a
    line may be split across several events.

    Each non-empty line is expected to be a JSON object, optionally prefixed
    with ``data:``, containing ``{"token": {"text": ...}}``. A line that also
    has a non-null ``"generated_text"`` key marks the final token.

    Yields:
        dict: ``{"text": <token text>, "is_finished": <bool>}`` for every
        parsed line.
        str: the sentinel ``"data: [DONE]"`` exactly once, if the underlying
        stream is exhausted before a line with ``"generated_text"`` was seen.

    Raises:
        json.JSONDecodeError: if a non-empty line is not valid JSON.
        KeyError: if a parsed line has no ``["token"]["text"]`` entry.
    """

    # Literal prefix removed from each line before JSON decoding.
    _DATA_PREFIX = "data:"

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        self.read_pos = 0  # offset of the first unconsumed byte in ``buffer``
        self.end_of_data = False  # True once the final token / sentinel was emitted

    def __iter__(self):
        return self

    def __next__(self):
        try:
            while True:
                self.buffer.seek(self.read_pos)
                line = self.buffer.readline()
                # Only consume complete lines; a partial line stays buffered
                # until the rest of it arrives in a later event.
                if line and line[-1] == ord("\n"):
                    # Advance by exactly what was read. Blank separator lines
                    # are skipped explicitly below instead of blindly jumping
                    # over one extra byte after every line.
                    self.read_pos += len(line)
                    payload = line.decode("utf-8").strip()
                    # Remove the literal prefix. str.lstrip("data:") would
                    # strip a *set of characters*, not the prefix.
                    if payload.startswith(self._DATA_PREFIX):
                        payload = payload[len(self._DATA_PREFIX) :].strip()
                    if not payload:
                        continue  # blank separator / keep-alive line
                    line_data = json.loads(payload)
                    response_obj = {"text": "", "is_finished": False}
                    if line_data.get("generated_text", None) is not None:
                        self.end_of_data = True
                        response_obj["is_finished"] = True
                    response_obj["text"] = line_data["token"]["text"]
                    return response_obj
                # No complete line buffered yet: pull the next event and
                # append its bytes at the end of the buffer.
                chunk = next(self.byte_iterator)
                self.buffer.seek(0, io.SEEK_END)
                self.buffer.write(chunk["PayloadPart"]["Bytes"])
        except StopIteration:
            if self.end_of_data:
                raise  # already finished: propagate end of iteration
            # Stream ended without a final token: emit the sentinel once.
            self.end_of_data = True
            return "data: [DONE]"
|
||||||
|
|
||||||
|
|
||||||
class SagemakerConfig:
|
class SagemakerConfig:
|
||||||
"""
|
"""
|
||||||
Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb
|
Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb
|
||||||
|
@ -121,7 +161,6 @@ def completion(
|
||||||
|
|
||||||
# pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
|
# pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
|
||||||
inference_params = deepcopy(optional_params)
|
inference_params = deepcopy(optional_params)
|
||||||
inference_params.pop("stream", None)
|
|
||||||
|
|
||||||
## Load Config
|
## Load Config
|
||||||
config = litellm.SagemakerConfig.get_config()
|
config = litellm.SagemakerConfig.get_config()
|
||||||
|
@ -152,6 +191,28 @@ def completion(
|
||||||
hf_model_name or model
|
hf_model_name or model
|
||||||
) # pass in hf model name for pulling it's prompt template - (e.g. `hf_model_name="meta-llama/Llama-2-7b-chat-hf` applies the llama2 chat template to the prompt)
|
) # pass in hf model name for pulling it's prompt template - (e.g. `hf_model_name="meta-llama/Llama-2-7b-chat-hf` applies the llama2 chat template to the prompt)
|
||||||
prompt = prompt_factory(model=hf_model_name, messages=messages)
|
prompt = prompt_factory(model=hf_model_name, messages=messages)
|
||||||
|
stream = inference_params.pop("stream", None)
|
||||||
|
if stream == True:
|
||||||
|
data = json.dumps(
|
||||||
|
{"inputs": prompt, "parameters": inference_params, "stream": True}
|
||||||
|
).encode("utf-8")
|
||||||
|
## LOGGING
|
||||||
|
request_str = f"""
|
||||||
|
response = client.invoke_endpoint_with_response_stream(
|
||||||
|
EndpointName={model},
|
||||||
|
ContentType="application/json",
|
||||||
|
Body={data},
|
||||||
|
CustomAttributes="accept_eula=true",
|
||||||
|
)
|
||||||
|
""" # type: ignore
|
||||||
|
response = client.invoke_endpoint_with_response_stream(
|
||||||
|
EndpointName=model,
|
||||||
|
ContentType="application/json",
|
||||||
|
Body=data,
|
||||||
|
CustomAttributes="accept_eula=true",
|
||||||
|
)
|
||||||
|
|
||||||
|
return response["Body"]
|
||||||
|
|
||||||
data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode(
|
data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode(
|
||||||
"utf-8"
|
"utf-8"
|
||||||
|
|
0
litellm/llms/tokenizers/__init__.py
Normal file
|
@ -237,8 +237,11 @@ def completion(
|
||||||
GenerationConfig,
|
GenerationConfig,
|
||||||
)
|
)
|
||||||
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types
|
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types
|
||||||
|
import google.auth
|
||||||
|
|
||||||
vertexai.init(project=vertex_project, location=vertex_location)
|
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
|
||||||
|
creds, _ = google.auth.default(quota_project_id=vertex_project)
|
||||||
|
vertexai.init(project=vertex_project, location=vertex_location, credentials=creds)
|
||||||
|
|
||||||
## Load Config
|
## Load Config
|
||||||
config = litellm.VertexAIConfig.get_config()
|
config = litellm.VertexAIConfig.get_config()
|
||||||
|
|
187
litellm/main.py
|
@ -10,12 +10,11 @@
|
||||||
import os, openai, sys, json, inspect, uuid, datetime, threading
|
import os, openai, sys, json, inspect, uuid, datetime, threading
|
||||||
from typing import Any, Literal, Union
|
from typing import Any, Literal, Union
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
import dotenv, traceback, random, asyncio, time, contextvars
|
import dotenv, traceback, random, asyncio, time, contextvars
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
import httpx
|
import httpx
|
||||||
import litellm
|
import litellm
|
||||||
|
from ._logging import verbose_logger
|
||||||
from litellm import ( # type: ignore
|
from litellm import ( # type: ignore
|
||||||
client,
|
client,
|
||||||
exception_type,
|
exception_type,
|
||||||
|
@ -83,6 +82,7 @@ from litellm.utils import (
|
||||||
TextCompletionResponse,
|
TextCompletionResponse,
|
||||||
TextChoices,
|
TextChoices,
|
||||||
EmbeddingResponse,
|
EmbeddingResponse,
|
||||||
|
ImageResponse,
|
||||||
read_config_args,
|
read_config_args,
|
||||||
Choices,
|
Choices,
|
||||||
Message,
|
Message,
|
||||||
|
@ -273,14 +273,10 @@ async def acompletion(
|
||||||
else:
|
else:
|
||||||
# Call the synchronous function using run_in_executor
|
# Call the synchronous function using run_in_executor
|
||||||
response = await loop.run_in_executor(None, func_with_context) # type: ignore
|
response = await loop.run_in_executor(None, func_with_context) # type: ignore
|
||||||
# if kwargs.get("stream", False): # return an async generator
|
if isinstance(response, CustomStreamWrapper):
|
||||||
# return _async_streaming(
|
response.set_logging_event_loop(
|
||||||
# response=response,
|
loop=loop
|
||||||
# model=model,
|
) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
|
||||||
# custom_llm_provider=custom_llm_provider,
|
|
||||||
# args=args,
|
|
||||||
# )
|
|
||||||
# else:
|
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
custom_llm_provider = custom_llm_provider or "openai"
|
custom_llm_provider = custom_llm_provider or "openai"
|
||||||
|
@ -343,6 +339,18 @@ def mock_completion(
|
||||||
model_response["choices"][0]["message"]["content"] = mock_response
|
model_response["choices"][0]["message"]["content"] = mock_response
|
||||||
model_response["created"] = int(time.time())
|
model_response["created"] = int(time.time())
|
||||||
model_response["model"] = model
|
model_response["model"] = model
|
||||||
|
|
||||||
|
model_response.usage = Usage(
|
||||||
|
prompt_tokens=10, completion_tokens=20, total_tokens=30
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
_, custom_llm_provider, _, _ = litellm.utils.get_llm_provider(model=model)
|
||||||
|
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
|
||||||
|
except:
|
||||||
|
# dont let setting a hidden param block a mock_respose
|
||||||
|
pass
|
||||||
|
|
||||||
return model_response
|
return model_response
|
||||||
|
|
||||||
except:
|
except:
|
||||||
|
@ -442,9 +450,12 @@ def completion(
|
||||||
num_retries = kwargs.get("num_retries", None) ## deprecated
|
num_retries = kwargs.get("num_retries", None) ## deprecated
|
||||||
max_retries = kwargs.get("max_retries", None)
|
max_retries = kwargs.get("max_retries", None)
|
||||||
context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None)
|
context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None)
|
||||||
|
organization = kwargs.get("organization", None)
|
||||||
### CUSTOM MODEL COST ###
|
### CUSTOM MODEL COST ###
|
||||||
input_cost_per_token = kwargs.get("input_cost_per_token", None)
|
input_cost_per_token = kwargs.get("input_cost_per_token", None)
|
||||||
output_cost_per_token = kwargs.get("output_cost_per_token", None)
|
output_cost_per_token = kwargs.get("output_cost_per_token", None)
|
||||||
|
input_cost_per_second = kwargs.get("input_cost_per_second", None)
|
||||||
|
output_cost_per_second = kwargs.get("output_cost_per_second", None)
|
||||||
### CUSTOM PROMPT TEMPLATE ###
|
### CUSTOM PROMPT TEMPLATE ###
|
||||||
initial_prompt_value = kwargs.get("initial_prompt_value", None)
|
initial_prompt_value = kwargs.get("initial_prompt_value", None)
|
||||||
roles = kwargs.get("roles", None)
|
roles = kwargs.get("roles", None)
|
||||||
|
@ -522,6 +533,8 @@ def completion(
|
||||||
"tpm",
|
"tpm",
|
||||||
"input_cost_per_token",
|
"input_cost_per_token",
|
||||||
"output_cost_per_token",
|
"output_cost_per_token",
|
||||||
|
"input_cost_per_second",
|
||||||
|
"output_cost_per_second",
|
||||||
"hf_model_name",
|
"hf_model_name",
|
||||||
"model_info",
|
"model_info",
|
||||||
"proxy_server_request",
|
"proxy_server_request",
|
||||||
|
@ -534,10 +547,6 @@ def completion(
|
||||||
non_default_params = {
|
non_default_params = {
|
||||||
k: v for k, v in kwargs.items() if k not in default_params
|
k: v for k, v in kwargs.items() if k not in default_params
|
||||||
} # model-specific params - pass them straight to the model/provider
|
} # model-specific params - pass them straight to the model/provider
|
||||||
if mock_response:
|
|
||||||
return mock_completion(
|
|
||||||
model, messages, stream=stream, mock_response=mock_response
|
|
||||||
)
|
|
||||||
if timeout is None:
|
if timeout is None:
|
||||||
timeout = (
|
timeout = (
|
||||||
kwargs.get("request_timeout", None) or 600
|
kwargs.get("request_timeout", None) or 600
|
||||||
|
@ -577,15 +586,43 @@ def completion(
|
||||||
)
|
)
|
||||||
if model_response is not None and hasattr(model_response, "_hidden_params"):
|
if model_response is not None and hasattr(model_response, "_hidden_params"):
|
||||||
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
|
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
|
||||||
|
model_response._hidden_params["region_name"] = kwargs.get(
|
||||||
|
"aws_region_name", None
|
||||||
|
) # support region-based pricing for bedrock
|
||||||
|
|
||||||
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
|
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
|
||||||
if input_cost_per_token is not None and output_cost_per_token is not None:
|
if input_cost_per_token is not None and output_cost_per_token is not None:
|
||||||
|
print_verbose(f"Registering model={model} in model cost map")
|
||||||
litellm.register_model(
|
litellm.register_model(
|
||||||
{
|
{
|
||||||
|
f"{custom_llm_provider}/{model}": {
|
||||||
|
"input_cost_per_token": input_cost_per_token,
|
||||||
|
"output_cost_per_token": output_cost_per_token,
|
||||||
|
"litellm_provider": custom_llm_provider,
|
||||||
|
},
|
||||||
model: {
|
model: {
|
||||||
"input_cost_per_token": input_cost_per_token,
|
"input_cost_per_token": input_cost_per_token,
|
||||||
"output_cost_per_token": output_cost_per_token,
|
"output_cost_per_token": output_cost_per_token,
|
||||||
"litellm_provider": custom_llm_provider,
|
"litellm_provider": custom_llm_provider,
|
||||||
}
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
elif (
|
||||||
|
input_cost_per_second is not None
|
||||||
|
): # time based pricing just needs cost in place
|
||||||
|
output_cost_per_second = output_cost_per_second or 0.0
|
||||||
|
litellm.register_model(
|
||||||
|
{
|
||||||
|
f"{custom_llm_provider}/{model}": {
|
||||||
|
"input_cost_per_second": input_cost_per_second,
|
||||||
|
"output_cost_per_second": output_cost_per_second,
|
||||||
|
"litellm_provider": custom_llm_provider,
|
||||||
|
},
|
||||||
|
model: {
|
||||||
|
"input_cost_per_second": input_cost_per_second,
|
||||||
|
"output_cost_per_second": output_cost_per_second,
|
||||||
|
"litellm_provider": custom_llm_provider,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
|
### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
|
||||||
|
@ -674,6 +711,10 @@ def completion(
|
||||||
optional_params=optional_params,
|
optional_params=optional_params,
|
||||||
litellm_params=litellm_params,
|
litellm_params=litellm_params,
|
||||||
)
|
)
|
||||||
|
if mock_response:
|
||||||
|
return mock_completion(
|
||||||
|
model, messages, stream=stream, mock_response=mock_response
|
||||||
|
)
|
||||||
if custom_llm_provider == "azure":
|
if custom_llm_provider == "azure":
|
||||||
# azure configs
|
# azure configs
|
||||||
api_type = get_secret("AZURE_API_TYPE") or "azure"
|
api_type = get_secret("AZURE_API_TYPE") or "azure"
|
||||||
|
@ -692,9 +733,9 @@ def completion(
|
||||||
or get_secret("AZURE_API_KEY")
|
or get_secret("AZURE_API_KEY")
|
||||||
)
|
)
|
||||||
|
|
||||||
azure_ad_token = optional_params.pop("azure_ad_token", None) or get_secret(
|
azure_ad_token = optional_params.get("extra_body", {}).pop(
|
||||||
"AZURE_AD_TOKEN"
|
"azure_ad_token", None
|
||||||
)
|
) or get_secret("AZURE_AD_TOKEN")
|
||||||
|
|
||||||
headers = headers or litellm.headers
|
headers = headers or litellm.headers
|
||||||
|
|
||||||
|
@ -758,7 +799,8 @@ def completion(
|
||||||
or "https://api.openai.com/v1"
|
or "https://api.openai.com/v1"
|
||||||
)
|
)
|
||||||
openai.organization = (
|
openai.organization = (
|
||||||
litellm.organization
|
organization
|
||||||
|
or litellm.organization
|
||||||
or get_secret("OPENAI_ORGANIZATION")
|
or get_secret("OPENAI_ORGANIZATION")
|
||||||
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
|
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
|
||||||
)
|
)
|
||||||
|
@ -798,6 +840,7 @@ def completion(
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
custom_prompt_dict=custom_prompt_dict,
|
custom_prompt_dict=custom_prompt_dict,
|
||||||
client=client, # pass AsyncOpenAI, OpenAI client
|
client=client, # pass AsyncOpenAI, OpenAI client
|
||||||
|
organization=organization,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
## LOGGING - log the original exception returned
|
## LOGGING - log the original exception returned
|
||||||
|
@ -967,6 +1010,7 @@ def completion(
|
||||||
encoding=encoding, # for calculating input/output tokens
|
encoding=encoding, # for calculating input/output tokens
|
||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
logging_obj=logging,
|
logging_obj=logging,
|
||||||
|
headers=headers,
|
||||||
)
|
)
|
||||||
if "stream" in optional_params and optional_params["stream"] == True:
|
if "stream" in optional_params and optional_params["stream"] == True:
|
||||||
# don't try to access stream object,
|
# don't try to access stream object,
|
||||||
|
@ -1376,11 +1420,29 @@ def completion(
|
||||||
acompletion=acompletion,
|
acompletion=acompletion,
|
||||||
custom_prompt_dict=custom_prompt_dict,
|
custom_prompt_dict=custom_prompt_dict,
|
||||||
)
|
)
|
||||||
|
if (
|
||||||
|
"stream" in optional_params
|
||||||
|
and optional_params["stream"] == True
|
||||||
|
and acompletion == False
|
||||||
|
):
|
||||||
|
response = CustomStreamWrapper(
|
||||||
|
iter(model_response),
|
||||||
|
model,
|
||||||
|
custom_llm_provider="gemini",
|
||||||
|
logging_obj=logging,
|
||||||
|
)
|
||||||
|
return response
|
||||||
response = model_response
|
response = model_response
|
||||||
elif custom_llm_provider == "vertex_ai":
|
elif custom_llm_provider == "vertex_ai":
|
||||||
vertex_ai_project = litellm.vertex_project or get_secret("VERTEXAI_PROJECT")
|
vertex_ai_project = (
|
||||||
vertex_ai_location = litellm.vertex_location or get_secret(
|
optional_params.pop("vertex_ai_project", None)
|
||||||
"VERTEXAI_LOCATION"
|
or litellm.vertex_project
|
||||||
|
or get_secret("VERTEXAI_PROJECT")
|
||||||
|
)
|
||||||
|
vertex_ai_location = (
|
||||||
|
optional_params.pop("vertex_ai_location", None)
|
||||||
|
or litellm.vertex_location
|
||||||
|
or get_secret("VERTEXAI_LOCATION")
|
||||||
)
|
)
|
||||||
|
|
||||||
model_response = vertex_ai.completion(
|
model_response = vertex_ai.completion(
|
||||||
|
@ -1471,19 +1533,22 @@ def completion(
|
||||||
if (
|
if (
|
||||||
"stream" in optional_params and optional_params["stream"] == True
|
"stream" in optional_params and optional_params["stream"] == True
|
||||||
): ## [BETA]
|
): ## [BETA]
|
||||||
# sagemaker does not support streaming as of now so we're faking streaming:
|
|
||||||
# https://discuss.huggingface.co/t/streaming-output-text-when-deploying-on-sagemaker/39611
|
|
||||||
# "SageMaker is currently not supporting streaming responses."
|
|
||||||
|
|
||||||
# fake streaming for sagemaker
|
|
||||||
print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
|
print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
|
||||||
resp_string = model_response["choices"][0]["message"]["content"]
|
from .llms.sagemaker import TokenIterator
|
||||||
|
|
||||||
|
tokenIterator = TokenIterator(model_response)
|
||||||
response = CustomStreamWrapper(
|
response = CustomStreamWrapper(
|
||||||
resp_string,
|
completion_stream=tokenIterator,
|
||||||
model,
|
model=model,
|
||||||
custom_llm_provider="sagemaker",
|
custom_llm_provider="sagemaker",
|
||||||
logging_obj=logging,
|
logging_obj=logging,
|
||||||
)
|
)
|
||||||
|
## LOGGING
|
||||||
|
logging.post_call(
|
||||||
|
input=messages,
|
||||||
|
api_key=None,
|
||||||
|
original_response=response,
|
||||||
|
)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
## RESPONSE OBJECT
|
## RESPONSE OBJECT
|
||||||
|
@ -2176,6 +2241,7 @@ def embedding(
|
||||||
model,
|
model,
|
||||||
input=[],
|
input=[],
|
||||||
# Optional params
|
# Optional params
|
||||||
|
dimensions: Optional[int] = None,
|
||||||
timeout=600, # default to 10 minutes
|
timeout=600, # default to 10 minutes
|
||||||
# set api_base, api_version, api_key
|
# set api_base, api_version, api_key
|
||||||
api_base: Optional[str] = None,
|
api_base: Optional[str] = None,
|
||||||
|
@ -2196,6 +2262,7 @@ def embedding(
|
||||||
Parameters:
|
Parameters:
|
||||||
- model: The embedding model to use.
|
- model: The embedding model to use.
|
||||||
- input: The input for which embeddings are to be generated.
|
- input: The input for which embeddings are to be generated.
|
||||||
|
- dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
|
||||||
- timeout: The timeout value for the API call, default 10 mins
|
- timeout: The timeout value for the API call, default 10 mins
|
||||||
- litellm_call_id: The call ID for litellm logging.
|
- litellm_call_id: The call ID for litellm logging.
|
||||||
- litellm_logging_obj: The litellm logging object.
|
- litellm_logging_obj: The litellm logging object.
|
||||||
|
@ -2222,8 +2289,14 @@ def embedding(
|
||||||
encoding_format = kwargs.get("encoding_format", None)
|
encoding_format = kwargs.get("encoding_format", None)
|
||||||
proxy_server_request = kwargs.get("proxy_server_request", None)
|
proxy_server_request = kwargs.get("proxy_server_request", None)
|
||||||
aembedding = kwargs.get("aembedding", None)
|
aembedding = kwargs.get("aembedding", None)
|
||||||
|
### CUSTOM MODEL COST ###
|
||||||
|
input_cost_per_token = kwargs.get("input_cost_per_token", None)
|
||||||
|
output_cost_per_token = kwargs.get("output_cost_per_token", None)
|
||||||
|
input_cost_per_second = kwargs.get("input_cost_per_second", None)
|
||||||
|
output_cost_per_second = kwargs.get("output_cost_per_second", None)
|
||||||
openai_params = [
|
openai_params = [
|
||||||
"user",
|
"user",
|
||||||
|
"dimensions",
|
||||||
"request_timeout",
|
"request_timeout",
|
||||||
"api_base",
|
"api_base",
|
||||||
"api_version",
|
"api_version",
|
||||||
|
@ -2270,6 +2343,8 @@ def embedding(
|
||||||
"tpm",
|
"tpm",
|
||||||
"input_cost_per_token",
|
"input_cost_per_token",
|
||||||
"output_cost_per_token",
|
"output_cost_per_token",
|
||||||
|
"input_cost_per_second",
|
||||||
|
"output_cost_per_second",
|
||||||
"hf_model_name",
|
"hf_model_name",
|
||||||
"proxy_server_request",
|
"proxy_server_request",
|
||||||
"model_info",
|
"model_info",
|
||||||
|
@ -2290,11 +2365,35 @@ def embedding(
|
||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
)
|
)
|
||||||
optional_params = get_optional_params_embeddings(
|
optional_params = get_optional_params_embeddings(
|
||||||
|
model=model,
|
||||||
user=user,
|
user=user,
|
||||||
|
dimensions=dimensions,
|
||||||
encoding_format=encoding_format,
|
encoding_format=encoding_format,
|
||||||
custom_llm_provider=custom_llm_provider,
|
custom_llm_provider=custom_llm_provider,
|
||||||
**non_default_params,
|
**non_default_params,
|
||||||
)
|
)
|
||||||
|
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
|
||||||
|
if input_cost_per_token is not None and output_cost_per_token is not None:
|
||||||
|
litellm.register_model(
|
||||||
|
{
|
||||||
|
model: {
|
||||||
|
"input_cost_per_token": input_cost_per_token,
|
||||||
|
"output_cost_per_token": output_cost_per_token,
|
||||||
|
"litellm_provider": custom_llm_provider,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
if input_cost_per_second is not None: # time based pricing just needs cost in place
|
||||||
|
output_cost_per_second = output_cost_per_second or 0.0
|
||||||
|
litellm.register_model(
|
||||||
|
{
|
||||||
|
model: {
|
||||||
|
"input_cost_per_second": input_cost_per_second,
|
||||||
|
"output_cost_per_second": output_cost_per_second,
|
||||||
|
"litellm_provider": custom_llm_provider,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
response = None
|
response = None
|
||||||
logging = litellm_logging_obj
|
logging = litellm_logging_obj
|
||||||
|
@ -2916,6 +3015,7 @@ def image_generation(
|
||||||
else:
|
else:
|
||||||
model = "dall-e-2"
|
model = "dall-e-2"
|
||||||
custom_llm_provider = "openai" # default to dall-e-2 on openai
|
custom_llm_provider = "openai" # default to dall-e-2 on openai
|
||||||
|
model_response._hidden_params["model"] = model
|
||||||
openai_params = [
|
openai_params = [
|
||||||
"user",
|
"user",
|
||||||
"request_timeout",
|
"request_timeout",
|
||||||
|
@ -2989,7 +3089,7 @@ def image_generation(
|
||||||
custom_llm_provider=custom_llm_provider,
|
custom_llm_provider=custom_llm_provider,
|
||||||
**non_default_params,
|
**non_default_params,
|
||||||
)
|
)
|
||||||
logging = litellm_logging_obj
|
logging: Logging = litellm_logging_obj
|
||||||
logging.update_environment_variables(
|
logging.update_environment_variables(
|
||||||
model=model,
|
model=model,
|
||||||
user=user,
|
user=user,
|
||||||
|
@ -3089,6 +3189,9 @@ async def ahealth_check(
|
||||||
if model is None:
|
if model is None:
|
||||||
raise Exception("model not set")
|
raise Exception("model not set")
|
||||||
|
|
||||||
|
if model in litellm.model_cost and mode is None:
|
||||||
|
mode = litellm.model_cost[model]["mode"]
|
||||||
|
|
||||||
model, custom_llm_provider, _, _ = get_llm_provider(model=model)
|
model, custom_llm_provider, _, _ = get_llm_provider(model=model)
|
||||||
mode = mode or "chat" # default to chat completion calls
|
mode = mode or "chat" # default to chat completion calls
|
||||||
|
|
||||||
|
@ -3135,6 +3238,7 @@ async def ahealth_check(
|
||||||
or custom_llm_provider == "text-completion-openai"
|
or custom_llm_provider == "text-completion-openai"
|
||||||
):
|
):
|
||||||
api_key = model_params.get("api_key") or get_secret("OPENAI_API_KEY")
|
api_key = model_params.get("api_key") or get_secret("OPENAI_API_KEY")
|
||||||
|
organization = model_params.get("organization")
|
||||||
|
|
||||||
timeout = (
|
timeout = (
|
||||||
model_params.get("timeout")
|
model_params.get("timeout")
|
||||||
|
@ -3152,6 +3256,7 @@ async def ahealth_check(
|
||||||
mode=mode,
|
mode=mode,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
input=input,
|
input=input,
|
||||||
|
organization=organization,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if mode == "embedding":
|
if mode == "embedding":
|
||||||
|
@ -3176,6 +3281,7 @@ async def ahealth_check(
|
||||||
## Set verbose to true -> ```litellm.set_verbose = True```
|
## Set verbose to true -> ```litellm.set_verbose = True```
|
||||||
def print_verbose(print_statement):
|
def print_verbose(print_statement):
|
||||||
try:
|
try:
|
||||||
|
verbose_logger.debug(print_statement)
|
||||||
if litellm.set_verbose:
|
if litellm.set_verbose:
|
||||||
print(print_statement) # noqa
|
print(print_statement) # noqa
|
||||||
except:
|
except:
|
||||||
|
@ -3263,8 +3369,20 @@ def stream_chunk_builder_text_completion(chunks: list, messages: Optional[List]
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
|
def stream_chunk_builder(
|
||||||
|
chunks: list, messages: Optional[list] = None, start_time=None, end_time=None
|
||||||
|
):
|
||||||
model_response = litellm.ModelResponse()
|
model_response = litellm.ModelResponse()
|
||||||
|
### SORT CHUNKS BASED ON CREATED ORDER ##
|
||||||
|
print_verbose("Goes into checking if chunk has hiddden created at param")
|
||||||
|
if chunks[0]._hidden_params.get("created_at", None):
|
||||||
|
print_verbose("Chunks have a created at hidden param")
|
||||||
|
# Sort chunks based on created_at in ascending order
|
||||||
|
chunks = sorted(
|
||||||
|
chunks, key=lambda x: x._hidden_params.get("created_at", float("inf"))
|
||||||
|
)
|
||||||
|
print_verbose("Chunks sorted")
|
||||||
|
|
||||||
# set hidden params from chunk to model_response
|
# set hidden params from chunk to model_response
|
||||||
if model_response is not None and hasattr(model_response, "_hidden_params"):
|
if model_response is not None and hasattr(model_response, "_hidden_params"):
|
||||||
model_response._hidden_params = chunks[0].get("_hidden_params", {})
|
model_response._hidden_params = chunks[0].get("_hidden_params", {})
|
||||||
|
@ -3438,5 +3556,8 @@ def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
|
||||||
response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
|
response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
|
||||||
)
|
)
|
||||||
return convert_to_model_response_object(
|
return convert_to_model_response_object(
|
||||||
response_object=response, model_response_object=model_response
|
response_object=response,
|
||||||
|
model_response_object=model_response,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
)
|
)
|
||||||
|
|
1
litellm/proxy/_experimental/out/404.html
Normal file
|
@ -0,0 +1 @@
|
||||||
|
self.__BUILD_MANIFEST={__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/_error":["static/chunks/pages/_error-9a890acb1e81c3fc.js"],sortedPages:["/_app","/_error"]},self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
|
|
@ -0,0 +1 @@
|
||||||
|
self.__SSG_MANIFEST=new Set([]);self.__SSG_MANIFEST_CB&&self.__SSG_MANIFEST_CB()
|
|
@ -0,0 +1 @@
|
||||||
|
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[165],{3155:function(e,t,n){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_not-found",function(){return n(4032)}])},4032:function(e,t,n){"use strict";Object.defineProperty(t,"__esModule",{value:!0}),Object.defineProperty(t,"default",{enumerable:!0,get:function(){return i}}),n(6921);let o=n(3827);n(4090);let r={error:{fontFamily:'system-ui,"Segoe UI",Roboto,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji"',height:"100vh",textAlign:"center",display:"flex",flexDirection:"column",alignItems:"center",justifyContent:"center"},desc:{display:"inline-block"},h1:{display:"inline-block",margin:"0 20px 0 0",padding:"0 23px 0 0",fontSize:24,fontWeight:500,verticalAlign:"top",lineHeight:"49px"},h2:{fontSize:14,fontWeight:400,lineHeight:"49px",margin:0}};function i(){return(0,o.jsxs)(o.Fragment,{children:[(0,o.jsx)("title",{children:"404: This page could not be found."}),(0,o.jsx)("div",{style:r.error,children:(0,o.jsxs)("div",{children:[(0,o.jsx)("style",{dangerouslySetInnerHTML:{__html:"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}),(0,o.jsx)("h1",{className:"next-error-h1",style:r.h1,children:"404"}),(0,o.jsx)("div",{style:r.desc,children:(0,o.jsx)("h2",{style:r.h2,children:"This page could not be found."})})]})})]})}("function"==typeof t.default||"object"==typeof t.default&&null!==t.default)&&void 0===t.default.__esModule&&(Object.defineProperty(t.default,"__esModule",{value:!0}),Object.assign(t.default,t),e.exports=t.default)}},function(e){e.O(0,[971,69,744],function(){return e(e.s=3155)}),_N_E=e.O()}]);
|
|
@ -0,0 +1 @@
|
||||||
|
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{7421:function(n,e,t){Promise.resolve().then(t.t.bind(t,9646,23)),Promise.resolve().then(t.t.bind(t,3385,23))},3385:function(){},9646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=7421)}),_N_E=n.O()}]);
|
|
@ -0,0 +1 @@
|
||||||
|
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{2028:function(e,n,t){Promise.resolve().then(t.t.bind(t,7690,23)),Promise.resolve().then(t.t.bind(t,8955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,1902,23)),Promise.resolve().then(t.t.bind(t,1778,23)),Promise.resolve().then(t.t.bind(t,7831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(5317),n(2028)}),_N_E=e.O()}]);
|
|
@ -0,0 +1 @@
|
||||||
|
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[888],{1597:function(n,_,u){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_app",function(){return u(7174)}])}},function(n){var _=function(_){return n(n.s=_)};n.O(0,[774,179],function(){return _(1597),_(4546)}),_N_E=n.O()}]);
|
|
@ -0,0 +1 @@
|
||||||
|
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[820],{1981:function(n,_,u){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_error",function(){return u(5103)}])}},function(n){n.O(0,[888,774,179],function(){return n(n.s=1981)}),_N_E=n.O()}]);
|
|
@ -0,0 +1 @@
|
||||||
|
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function s(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={exports:{}},r=!0;try{a[e](n,n.exports,s),r=!1}finally{r&&delete l[e]}return n.exports}s.m=a,e=[],s.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(s.O).every(function(e){return s.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},s.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return s.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},s.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);s.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},s.d(o,u),o},s.d=function(e,t){for(var n in t)s.o(t,n)&&!s.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},s.f={},s.e=function(e){return Promise.all(Object.keys(s.f).reduce(function(t,n){return s.f[n](e,t),t},[]))},s.u=function(e){},s.miniCssF=function(e){return"static/css/7384ba6288e79f81.css"},s.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),s.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",s.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var 
l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,s.nc&&i.setAttribute("nonce",s.nc),i.setAttribute("data-webpack",o+n),i.src=s.tu(e)),r[e]=[t];var d=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(d.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=d.bind(null,i.onerror),i.onload=d.bind(null,i.onload),c&&document.head.appendChild(i)},s.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},s.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},s.tu=function(e){return s.tt().createScriptURL(e)},s.p="/ui/_next/",i={272:0},s.f.j=function(e,t){var n=s.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=s.p+s.u(e),u=Error();s.l(o,function(t){if(s.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},s.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)s.o(u,n)&&(s.m[n]=u[n]);if(c)var a=c(s)}for(e&&e(t);f<o.length;f++)r=o[f],s.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return s.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
|
BIN
litellm/proxy/_experimental/out/favicon.ico
Normal file
After Width: | Height: | Size: 25 KiB |
1
litellm/proxy/_experimental/out/index.html
Normal file
8
litellm/proxy/_experimental/out/index.txt
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
2:"$Sreact.suspense"
|
||||||
|
3:I[5250,["291","static/chunks/291-b42f47441ebb3671.js","931","static/chunks/app/page-b376373c879283de.js"],""]
|
||||||
|
4:I[7476,["291","static/chunks/291-b42f47441ebb3671.js","931","static/chunks/app/page-b376373c879283de.js"],""]
|
||||||
|
5:I[5613,[],""]
|
||||||
|
6:I[1778,[],""]
|
||||||
|
0:["FyY3KX_tIybXFPWL2RGlt",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$2",null,{"fallback":["$","div",null,{"children":"Loading..."}],"children":["$","div",null,{"className":"flex min-h-screen flex-col items-center","children":[["$","nav",null,{"className":"left-0 right-0 top-0 flex justify-between items-center h-12","children":[["$","div",null,{"className":"text-left mx-4 my-2 absolute top-0 left-0","children":["$","div",null,{"className":"flex flex-col items-center","children":["$","$L3",null,{"href":"/","children":["$","button",null,{"className":"text-gray-800 text-2xl px-4 py-1 rounded text-center","children":"🚅 LiteLLM"}]}]}]}],["$","div",null,{"className":"text-right mx-4 my-2 absolute top-0 right-0","children":[["$","a",null,{"href":"https://docs.litellm.ai/docs/","target":"_blank","rel":"noopener noreferrer","children":["$","button",null,{"className":"border border-gray-800 rounded-lg text-gray-800 text-xl px-4 py-1 rounded p-1 mr-2 text-center","children":"Docs"}]}],["$","a",null,{"href":"https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version","target":"_blank","rel":"noopener noreferrer","children":["$","button",null,{"className":"border border-gray-800 rounded-lg text-gray-800 text-xl px-4 py-1 rounded p-1 text-center","children":"Schedule Demo"}]}]]}]]}],["$","$L4",null,{}]]}]}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L5",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L6",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be 
found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/7384ba6288e79f81.css","precedence":"next","crossOrigin":""}]],"$L7"]]]]
|
||||||
|
7:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"Create Next App"}],["$","meta","3",{"name":"description","content":"Generated by create next app"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
|
||||||
|
1:null
|
1
litellm/proxy/_experimental/out/next.svg
Normal file
|
@ -0,0 +1 @@
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 394 80"><path fill="#000" d="M262 0h68.5v12.7h-27.2v66.6h-13.6V12.7H262V0ZM149 0v12.7H94v20.4h44.3v12.6H94v21h55v12.6H80.5V0h68.7zm34.3 0h-17.8l63.8 79.4h17.9l-32-39.7 32-39.6h-17.9l-23 28.6-23-28.6zm18.3 56.7-9-11-27.1 33.7h17.8l18.3-22.7z"/><path fill="#000" d="M81 79.3 17 0H0v79.3h13.6V17l50.2 62.3H81Zm252.6-.4c-1 0-1.8-.4-2.5-1s-1.1-1.6-1.1-2.6.3-1.8 1-2.5 1.6-1 2.6-1 1.8.3 2.5 1a3.4 3.4 0 0 1 .6 4.3 3.7 3.7 0 0 1-3 1.8zm23.2-33.5h6v23.3c0 2.1-.4 4-1.3 5.5a9.1 9.1 0 0 1-3.8 3.5c-1.6.8-3.5 1.3-5.7 1.3-2 0-3.7-.4-5.3-1s-2.8-1.8-3.7-3.2c-.9-1.3-1.4-3-1.4-5h6c.1.8.3 1.6.7 2.2s1 1.2 1.6 1.5c.7.4 1.5.5 2.4.5 1 0 1.8-.2 2.4-.6a4 4 0 0 0 1.6-1.8c.3-.8.5-1.8.5-3V45.5zm30.9 9.1a4.4 4.4 0 0 0-2-3.3 7.5 7.5 0 0 0-4.3-1.1c-1.3 0-2.4.2-3.3.5-.9.4-1.6 1-2 1.6a3.5 3.5 0 0 0-.3 4c.3.5.7.9 1.3 1.2l1.8 1 2 .5 3.2.8c1.3.3 2.5.7 3.7 1.2a13 13 0 0 1 3.2 1.8 8.1 8.1 0 0 1 3 6.5c0 2-.5 3.7-1.5 5.1a10 10 0 0 1-4.4 3.5c-1.8.8-4.1 1.2-6.8 1.2-2.6 0-4.9-.4-6.8-1.2-2-.8-3.4-2-4.5-3.5a10 10 0 0 1-1.7-5.6h6a5 5 0 0 0 3.5 4.6c1 .4 2.2.6 3.4.6 1.3 0 2.5-.2 3.5-.6 1-.4 1.8-1 2.4-1.7a4 4 0 0 0 .8-2.4c0-.9-.2-1.6-.7-2.2a11 11 0 0 0-2.1-1.4l-3.2-1-3.8-1c-2.8-.7-5-1.7-6.6-3.2a7.2 7.2 0 0 1-2.4-5.7 8 8 0 0 1 1.7-5 10 10 0 0 1 4.3-3.5c2-.8 4-1.2 6.4-1.2 2.3 0 4.4.4 6.2 1.2 1.8.8 3.2 2 4.3 3.4 1 1.4 1.5 3 1.5 5h-5.8z"/></svg>
|
After Width: | Height: | Size: 1.3 KiB |
1
litellm/proxy/_experimental/out/vercel.svg
Normal file
|
@ -0,0 +1 @@
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 283 64"><path fill="black" d="M141 16c-11 0-19 7-19 18s9 18 20 18c7 0 13-3 16-7l-7-5c-2 3-6 4-9 4-5 0-9-3-10-7h28v-3c0-11-8-18-19-18zm-9 15c1-4 4-7 9-7s8 3 9 7h-18zm117-15c-11 0-19 7-19 18s9 18 20 18c6 0 12-3 16-7l-8-5c-2 3-5 4-8 4-5 0-9-3-11-7h28l1-3c0-11-8-18-19-18zm-10 15c2-4 5-7 10-7s8 3 9 7h-19zm-39 3c0 6 4 10 10 10 4 0 7-2 9-5l8 5c-3 5-9 8-17 8-11 0-19-7-19-18s8-18 19-18c8 0 14 3 17 8l-8 5c-2-3-5-5-9-5-6 0-10 4-10 10zm83-29v46h-9V5h9zM37 0l37 64H0L37 0zm92 5-27 48L74 5h10l18 30 17-30h10zm59 12v10l-3-1c-6 0-10 4-10 10v15h-9V17h9v9c0-5 6-9 13-9z"/></svg>
|
After Width: | Height: | Size: 629 B |
|
@ -1,8 +1,17 @@
|
||||||
from pydantic import BaseModel, Extra, Field, root_validator
|
from pydantic import BaseModel, Extra, Field, root_validator, Json
|
||||||
import enum
|
import enum
|
||||||
from typing import Optional, List, Union, Dict, Literal
|
from typing import Optional, List, Union, Dict, Literal, Any
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import uuid, json
|
import uuid, json, sys, os
|
||||||
|
|
||||||
|
|
||||||
|
def hash_token(token: str):
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
# Hash the string using SHA-256
|
||||||
|
hashed_token = hashlib.sha256(token.encode()).hexdigest()
|
||||||
|
|
||||||
|
return hashed_token
|
||||||
|
|
||||||
|
|
||||||
class LiteLLMBase(BaseModel):
|
class LiteLLMBase(BaseModel):
|
||||||
|
@ -13,7 +22,7 @@ class LiteLLMBase(BaseModel):
|
||||||
def json(self, **kwargs):
|
def json(self, **kwargs):
|
||||||
try:
|
try:
|
||||||
return self.model_dump() # noqa
|
return self.model_dump() # noqa
|
||||||
except:
|
except Exception as e:
|
||||||
# if using pydantic v1
|
# if using pydantic v1
|
||||||
return self.dict()
|
return self.dict()
|
||||||
|
|
||||||
|
@ -122,53 +131,64 @@ class ModelParams(LiteLLMBase):
|
||||||
return values
|
return values
|
||||||
|
|
||||||
|
|
||||||
class GenerateKeyRequest(LiteLLMBase):
|
class GenerateRequestBase(LiteLLMBase):
|
||||||
duration: Optional[str] = "1h"
|
"""
|
||||||
|
Overlapping schema between key and user generate/update requests
|
||||||
|
"""
|
||||||
|
|
||||||
models: Optional[list] = []
|
models: Optional[list] = []
|
||||||
|
spend: Optional[float] = 0
|
||||||
|
max_budget: Optional[float] = None
|
||||||
|
user_id: Optional[str] = None
|
||||||
|
team_id: Optional[str] = None
|
||||||
|
max_parallel_requests: Optional[int] = None
|
||||||
|
metadata: Optional[dict] = {}
|
||||||
|
tpm_limit: Optional[int] = None
|
||||||
|
rpm_limit: Optional[int] = None
|
||||||
|
budget_duration: Optional[str] = None
|
||||||
|
allowed_cache_controls: Optional[list] = []
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateKeyRequest(GenerateRequestBase):
|
||||||
|
key_alias: Optional[str] = None
|
||||||
|
duration: Optional[str] = None
|
||||||
aliases: Optional[dict] = {}
|
aliases: Optional[dict] = {}
|
||||||
config: Optional[dict] = {}
|
config: Optional[dict] = {}
|
||||||
spend: Optional[float] = 0
|
|
||||||
user_id: Optional[str] = None
|
|
||||||
max_parallel_requests: Optional[int] = None
|
|
||||||
metadata: Optional[dict] = {}
|
|
||||||
|
|
||||||
|
|
||||||
class UpdateKeyRequest(LiteLLMBase):
|
class GenerateKeyResponse(GenerateKeyRequest):
|
||||||
key: str
|
|
||||||
duration: Optional[str] = None
|
|
||||||
models: Optional[list] = None
|
|
||||||
aliases: Optional[dict] = None
|
|
||||||
config: Optional[dict] = None
|
|
||||||
spend: Optional[float] = None
|
|
||||||
user_id: Optional[str] = None
|
|
||||||
max_parallel_requests: Optional[int] = None
|
|
||||||
metadata: Optional[dict] = {}
|
|
||||||
|
|
||||||
|
|
||||||
class UserAPIKeyAuth(LiteLLMBase): # the expected response object for user api key auth
|
|
||||||
"""
|
|
||||||
Return the row in the db
|
|
||||||
"""
|
|
||||||
|
|
||||||
api_key: Optional[str] = None
|
|
||||||
models: list = []
|
|
||||||
aliases: dict = {}
|
|
||||||
config: dict = {}
|
|
||||||
spend: Optional[float] = 0
|
|
||||||
user_id: Optional[str] = None
|
|
||||||
max_parallel_requests: Optional[int] = None
|
|
||||||
duration: str = "1h"
|
|
||||||
metadata: dict = {}
|
|
||||||
|
|
||||||
|
|
||||||
class GenerateKeyResponse(LiteLLMBase):
|
|
||||||
key: str
|
key: str
|
||||||
|
key_name: Optional[str] = None
|
||||||
expires: Optional[datetime]
|
expires: Optional[datetime]
|
||||||
user_id: str
|
user_id: str
|
||||||
|
|
||||||
|
@root_validator(pre=True)
|
||||||
|
def set_model_info(cls, values):
|
||||||
|
if values.get("token") is not None:
|
||||||
|
values.update({"key": values.get("token")})
|
||||||
|
dict_fields = ["metadata", "aliases", "config"]
|
||||||
|
for field in dict_fields:
|
||||||
|
value = values.get(field)
|
||||||
|
if value is not None and isinstance(value, str):
|
||||||
|
try:
|
||||||
|
values[field] = json.loads(value)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
raise ValueError(f"Field {field} should be a valid dictionary")
|
||||||
|
|
||||||
|
return values
|
||||||
|
|
||||||
|
|
||||||
|
class UpdateKeyRequest(GenerateKeyRequest):
|
||||||
|
# Note: the defaults of all Params here MUST BE NONE
|
||||||
|
# else they will get overwritten
|
||||||
|
key: str
|
||||||
|
duration: Optional[str] = None
|
||||||
|
spend: Optional[float] = None
|
||||||
|
metadata: Optional[dict] = None
|
||||||
|
|
||||||
|
|
||||||
class DeleteKeyRequest(LiteLLMBase):
|
class DeleteKeyRequest(LiteLLMBase):
|
||||||
keys: List[str]
|
keys: List
|
||||||
|
|
||||||
|
|
||||||
class NewUserRequest(GenerateKeyRequest):
|
class NewUserRequest(GenerateKeyRequest):
|
||||||
|
@ -179,6 +199,14 @@ class NewUserResponse(GenerateKeyResponse):
|
||||||
max_budget: Optional[float] = None
|
max_budget: Optional[float] = None
|
||||||
|
|
||||||
|
|
||||||
|
class UpdateUserRequest(GenerateRequestBase):
|
||||||
|
# Note: the defaults of all Params here MUST BE NONE
|
||||||
|
# else they will get overwritten
|
||||||
|
user_id: str
|
||||||
|
spend: Optional[float] = None
|
||||||
|
metadata: Optional[dict] = None
|
||||||
|
|
||||||
|
|
||||||
class KeyManagementSystem(enum.Enum):
|
class KeyManagementSystem(enum.Enum):
|
||||||
GOOGLE_KMS = "google_kms"
|
GOOGLE_KMS = "google_kms"
|
||||||
AZURE_KEY_VAULT = "azure_key_vault"
|
AZURE_KEY_VAULT = "azure_key_vault"
|
||||||
|
@ -194,6 +222,7 @@ class DynamoDBArgs(LiteLLMBase):
|
||||||
user_table_name: str = "LiteLLM_UserTable"
|
user_table_name: str = "LiteLLM_UserTable"
|
||||||
key_table_name: str = "LiteLLM_VerificationToken"
|
key_table_name: str = "LiteLLM_VerificationToken"
|
||||||
config_table_name: str = "LiteLLM_Config"
|
config_table_name: str = "LiteLLM_Config"
|
||||||
|
spend_table_name: str = "LiteLLM_SpendLogs"
|
||||||
|
|
||||||
|
|
||||||
class ConfigGeneralSettings(LiteLLMBase):
|
class ConfigGeneralSettings(LiteLLMBase):
|
||||||
|
@ -282,15 +311,39 @@ class ConfigYAML(LiteLLMBase):
|
||||||
|
|
||||||
|
|
||||||
class LiteLLM_VerificationToken(LiteLLMBase):
|
class LiteLLM_VerificationToken(LiteLLMBase):
|
||||||
token: str
|
token: Optional[str] = None
|
||||||
|
key_name: Optional[str] = None
|
||||||
|
key_alias: Optional[str] = None
|
||||||
spend: float = 0.0
|
spend: float = 0.0
|
||||||
expires: Union[str, None]
|
max_budget: Optional[float] = None
|
||||||
models: List[str]
|
expires: Optional[str] = None
|
||||||
aliases: Dict[str, str] = {}
|
models: List = []
|
||||||
config: Dict[str, str] = {}
|
aliases: Dict = {}
|
||||||
user_id: Union[str, None]
|
config: Dict = {}
|
||||||
max_parallel_requests: Union[int, None]
|
user_id: Optional[str] = None
|
||||||
metadata: Dict[str, str] = {}
|
max_parallel_requests: Optional[int] = None
|
||||||
|
metadata: Dict = {}
|
||||||
|
tpm_limit: Optional[int] = None
|
||||||
|
rpm_limit: Optional[int] = None
|
||||||
|
budget_duration: Optional[str] = None
|
||||||
|
budget_reset_at: Optional[datetime] = None
|
||||||
|
allowed_cache_controls: Optional[list] = []
|
||||||
|
|
||||||
|
|
||||||
|
class UserAPIKeyAuth(
|
||||||
|
LiteLLM_VerificationToken
|
||||||
|
): # the expected response object for user api key auth
|
||||||
|
"""
|
||||||
|
Return the row in the db
|
||||||
|
"""
|
||||||
|
|
||||||
|
api_key: Optional[str] = None
|
||||||
|
|
||||||
|
@root_validator(pre=True)
|
||||||
|
def check_api_key(cls, values):
|
||||||
|
if values.get("api_key") is not None:
|
||||||
|
values.update({"token": hash_token(values.get("api_key"))})
|
||||||
|
return values
|
||||||
|
|
||||||
|
|
||||||
class LiteLLM_Config(LiteLLMBase):
|
class LiteLLM_Config(LiteLLMBase):
|
||||||
|
@ -310,5 +363,22 @@ class LiteLLM_UserTable(LiteLLMBase):
|
||||||
if values.get("spend") is None:
|
if values.get("spend") is None:
|
||||||
values.update({"spend": 0.0})
|
values.update({"spend": 0.0})
|
||||||
if values.get("models") is None:
|
if values.get("models") is None:
|
||||||
values.update({"models", []})
|
values.update({"models": []})
|
||||||
return values
|
return values
|
||||||
|
|
||||||
|
|
||||||
|
class LiteLLM_SpendLogs(LiteLLMBase):
|
||||||
|
request_id: str
|
||||||
|
api_key: str
|
||||||
|
model: Optional[str] = ""
|
||||||
|
call_type: str
|
||||||
|
spend: Optional[float] = 0.0
|
||||||
|
total_tokens: Optional[int] = 0
|
||||||
|
prompt_tokens: Optional[int] = 0
|
||||||
|
completion_tokens: Optional[int] = 0
|
||||||
|
startTime: Union[str, datetime, None]
|
||||||
|
endTime: Union[str, datetime, None]
|
||||||
|
user: Optional[str] = ""
|
||||||
|
metadata: Optional[Json] = {}
|
||||||
|
cache_hit: Optional[str] = "False"
|
||||||
|
cache_key: Optional[str] = None
|
||||||
|
|
|
@ -98,7 +98,7 @@ def list_models():
|
||||||
st.error(f"An error occurred while requesting models: {e}")
|
st.error(f"An error occurred while requesting models: {e}")
|
||||||
else:
|
else:
|
||||||
st.warning(
|
st.warning(
|
||||||
"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page."
|
f"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page. Currently set Proxy Endpoint: {st.session_state.get('api_url', None)} and Proxy Key: {st.session_state.get('proxy_key', None)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -151,7 +151,7 @@ def create_key():
|
||||||
raise e
|
raise e
|
||||||
else:
|
else:
|
||||||
st.warning(
|
st.warning(
|
||||||
"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page."
|
f"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page. Currently set Proxy Endpoint: {st.session_state.get('api_url', None)} and Proxy Key: {st.session_state.get('proxy_key', None)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,7 @@ from litellm.proxy._types import (
|
||||||
LiteLLM_Config,
|
LiteLLM_Config,
|
||||||
LiteLLM_UserTable,
|
LiteLLM_UserTable,
|
||||||
)
|
)
|
||||||
|
from litellm.proxy.utils import hash_token
|
||||||
from litellm import get_secret
|
from litellm import get_secret
|
||||||
from typing import Any, List, Literal, Optional, Union
|
from typing import Any, List, Literal, Optional, Union
|
||||||
import json
|
import json
|
||||||
|
@ -131,10 +132,27 @@ class DynamoDBWrapper(CustomDB):
|
||||||
raise Exception(
|
raise Exception(
|
||||||
f"Failed to create table - {self.database_arguments.config_table_name}.\nPlease create a new table called {self.database_arguments.config_table_name}\nAND set `hash_key` as 'param_name'"
|
f"Failed to create table - {self.database_arguments.config_table_name}.\nPlease create a new table called {self.database_arguments.config_table_name}\nAND set `hash_key` as 'param_name'"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
## Spend
|
||||||
|
try:
|
||||||
|
verbose_proxy_logger.debug("DynamoDB Wrapper - Creating Spend Table")
|
||||||
|
error_occurred = False
|
||||||
|
table = client.table(self.database_arguments.spend_table_name)
|
||||||
|
if not await table.exists():
|
||||||
|
await table.create(
|
||||||
|
self.throughput_type,
|
||||||
|
KeySchema(hash_key=KeySpec("request_id", KeyType.string)),
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
error_occurred = True
|
||||||
|
if error_occurred == True:
|
||||||
|
raise Exception(
|
||||||
|
f"Failed to create table - {self.database_arguments.key_table_name}.\nPlease create a new table called {self.database_arguments.key_table_name}\nAND set `hash_key` as 'token'"
|
||||||
|
)
|
||||||
verbose_proxy_logger.debug("DynamoDB Wrapper - Done connecting()")
|
verbose_proxy_logger.debug("DynamoDB Wrapper - Done connecting()")
|
||||||
|
|
||||||
async def insert_data(
|
async def insert_data(
|
||||||
self, value: Any, table_name: Literal["user", "key", "config"]
|
self, value: Any, table_name: Literal["user", "key", "config", "spend"]
|
||||||
):
|
):
|
||||||
from aiodynamo.client import Client
|
from aiodynamo.client import Client
|
||||||
from aiodynamo.credentials import Credentials, StaticCredentials
|
from aiodynamo.credentials import Credentials, StaticCredentials
|
||||||
|
@ -166,8 +184,13 @@ class DynamoDBWrapper(CustomDB):
|
||||||
table = client.table(self.database_arguments.key_table_name)
|
table = client.table(self.database_arguments.key_table_name)
|
||||||
elif table_name == "config":
|
elif table_name == "config":
|
||||||
table = client.table(self.database_arguments.config_table_name)
|
table = client.table(self.database_arguments.config_table_name)
|
||||||
|
elif table_name == "spend":
|
||||||
|
table = client.table(self.database_arguments.spend_table_name)
|
||||||
|
|
||||||
|
value = value.copy()
|
||||||
for k, v in value.items():
|
for k, v in value.items():
|
||||||
|
if k == "token" and value[k].startswith("sk-"):
|
||||||
|
value[k] = hash_token(token=v)
|
||||||
if isinstance(v, datetime):
|
if isinstance(v, datetime):
|
||||||
value[k] = v.isoformat()
|
value[k] = v.isoformat()
|
||||||
|
|
||||||
|
@ -224,6 +247,10 @@ class DynamoDBWrapper(CustomDB):
|
||||||
and isinstance(v, str)
|
and isinstance(v, str)
|
||||||
):
|
):
|
||||||
new_response[k] = json.loads(v)
|
new_response[k] = json.loads(v)
|
||||||
|
elif (k == "tpm_limit" or k == "rpm_limit") and isinstance(
|
||||||
|
v, float
|
||||||
|
):
|
||||||
|
new_response[k] = int(v)
|
||||||
else:
|
else:
|
||||||
new_response[k] = v
|
new_response[k] = v
|
||||||
new_response = LiteLLM_VerificationToken(**new_response)
|
new_response = LiteLLM_VerificationToken(**new_response)
|
||||||
|
@ -281,10 +308,13 @@ class DynamoDBWrapper(CustomDB):
|
||||||
# Initialize an empty UpdateExpression
|
# Initialize an empty UpdateExpression
|
||||||
|
|
||||||
actions: List = []
|
actions: List = []
|
||||||
|
value = value.copy()
|
||||||
for k, v in value.items():
|
for k, v in value.items():
|
||||||
# Convert datetime object to ISO8601 string
|
# Convert datetime object to ISO8601 string
|
||||||
if isinstance(v, datetime):
|
if isinstance(v, datetime):
|
||||||
v = v.isoformat()
|
v = v.isoformat()
|
||||||
|
if k == "token" and value[k].startswith("sk-"):
|
||||||
|
value[k] = hash_token(token=v)
|
||||||
|
|
||||||
# Accumulate updates
|
# Accumulate updates
|
||||||
actions.append((F(k), Value(value=v)))
|
actions.append((F(k), Value(value=v)))
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from litellm.proxy._types import UserAPIKeyAuth
|
from litellm.proxy._types import UserAPIKeyAuth, GenerateKeyRequest
|
||||||
from fastapi import Request
|
from fastapi import Request
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import os
|
import os
|
||||||
|
@ -14,3 +14,40 @@ async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
|
||||||
raise Exception
|
raise Exception
|
||||||
except:
|
except:
|
||||||
raise Exception
|
raise Exception
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_key_fn(data: GenerateKeyRequest):
|
||||||
|
"""
|
||||||
|
Asynchronously decides if a key should be generated or not based on the provided data.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data (GenerateKeyRequest): The data to be used for decision making.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if a key should be generated, False otherwise.
|
||||||
|
"""
|
||||||
|
# decide if a key should be generated or not
|
||||||
|
data_json = data.json() # type: ignore
|
||||||
|
|
||||||
|
# Unpacking variables
|
||||||
|
team_id = data_json.get("team_id")
|
||||||
|
duration = data_json.get("duration")
|
||||||
|
models = data_json.get("models")
|
||||||
|
aliases = data_json.get("aliases")
|
||||||
|
config = data_json.get("config")
|
||||||
|
spend = data_json.get("spend")
|
||||||
|
user_id = data_json.get("user_id")
|
||||||
|
max_parallel_requests = data_json.get("max_parallel_requests")
|
||||||
|
metadata = data_json.get("metadata")
|
||||||
|
tpm_limit = data_json.get("tpm_limit")
|
||||||
|
rpm_limit = data_json.get("rpm_limit")
|
||||||
|
|
||||||
|
if team_id is not None and len(team_id) > 0:
|
||||||
|
return {
|
||||||
|
"decision": True,
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
return {
|
||||||
|
"decision": True,
|
||||||
|
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||||
|
}
|
||||||
|
|
55
litellm/proxy/hooks/cache_control_check.py
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
# What does this do?
|
||||||
|
## Checks if key is allowed to use the cache controls passed in to the completion() call
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
import litellm
|
||||||
|
from litellm.caching import DualCache
|
||||||
|
from litellm.proxy._types import UserAPIKeyAuth
|
||||||
|
from litellm.integrations.custom_logger import CustomLogger
|
||||||
|
from fastapi import HTTPException
|
||||||
|
import json, traceback
|
||||||
|
|
||||||
|
|
||||||
|
class CacheControlCheck(CustomLogger):
|
||||||
|
# Class variables or attributes
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def print_verbose(self, print_statement):
|
||||||
|
if litellm.set_verbose is True:
|
||||||
|
print(print_statement) # noqa
|
||||||
|
|
||||||
|
async def async_pre_call_hook(
|
||||||
|
self,
|
||||||
|
user_api_key_dict: UserAPIKeyAuth,
|
||||||
|
cache: DualCache,
|
||||||
|
data: dict,
|
||||||
|
call_type: str,
|
||||||
|
):
|
||||||
|
try:
|
||||||
|
self.print_verbose(f"Inside Cache Control Check Pre-Call Hook")
|
||||||
|
allowed_cache_controls = user_api_key_dict.allowed_cache_controls
|
||||||
|
|
||||||
|
if (allowed_cache_controls is None) or (
|
||||||
|
len(allowed_cache_controls) == 0
|
||||||
|
): # assume empty list to be nullable - https://github.com/prisma/prisma/issues/847#issuecomment-546895663
|
||||||
|
return
|
||||||
|
|
||||||
|
if data.get("cache", None) is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
cache_args = data.get("cache", None)
|
||||||
|
if isinstance(cache_args, dict):
|
||||||
|
for k, v in cache_args.items():
|
||||||
|
if k not in allowed_cache_controls:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=403,
|
||||||
|
detail=f"Not allowed to set {k} as a cache control. Contact admin to change permissions.",
|
||||||
|
)
|
||||||
|
else: # invalid cache
|
||||||
|
return
|
||||||
|
|
||||||
|
except HTTPException as e:
|
||||||
|
raise e
|
||||||
|
except Exception as e:
|
||||||
|
traceback.print_exc()
|
|
@ -1,9 +1,12 @@
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import litellm
|
import litellm, traceback, sys
|
||||||
from litellm.caching import DualCache
|
from litellm.caching import DualCache
|
||||||
from litellm.proxy._types import UserAPIKeyAuth
|
from litellm.proxy._types import UserAPIKeyAuth
|
||||||
from litellm.integrations.custom_logger import CustomLogger
|
from litellm.integrations.custom_logger import CustomLogger
|
||||||
from fastapi import HTTPException
|
from fastapi import HTTPException
|
||||||
|
from litellm._logging import verbose_proxy_logger
|
||||||
|
from litellm import ModelResponse
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
class MaxParallelRequestsHandler(CustomLogger):
|
class MaxParallelRequestsHandler(CustomLogger):
|
||||||
|
@ -14,8 +17,12 @@ class MaxParallelRequestsHandler(CustomLogger):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def print_verbose(self, print_statement):
|
def print_verbose(self, print_statement):
|
||||||
if litellm.set_verbose is True:
|
try:
|
||||||
print(print_statement) # noqa
|
verbose_proxy_logger.debug(print_statement)
|
||||||
|
if litellm.set_verbose:
|
||||||
|
print(print_statement) # noqa
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
async def async_pre_call_hook(
|
async def async_pre_call_hook(
|
||||||
self,
|
self,
|
||||||
|
@ -26,25 +33,56 @@ class MaxParallelRequestsHandler(CustomLogger):
|
||||||
):
|
):
|
||||||
self.print_verbose(f"Inside Max Parallel Request Pre-Call Hook")
|
self.print_verbose(f"Inside Max Parallel Request Pre-Call Hook")
|
||||||
api_key = user_api_key_dict.api_key
|
api_key = user_api_key_dict.api_key
|
||||||
max_parallel_requests = user_api_key_dict.max_parallel_requests
|
max_parallel_requests = user_api_key_dict.max_parallel_requests or sys.maxsize
|
||||||
|
tpm_limit = user_api_key_dict.tpm_limit or sys.maxsize
|
||||||
|
rpm_limit = user_api_key_dict.rpm_limit or sys.maxsize
|
||||||
|
|
||||||
if api_key is None:
|
if api_key is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
if max_parallel_requests is None:
|
if (
|
||||||
|
max_parallel_requests == sys.maxsize
|
||||||
|
and tpm_limit == sys.maxsize
|
||||||
|
and rpm_limit == sys.maxsize
|
||||||
|
):
|
||||||
return
|
return
|
||||||
|
|
||||||
self.user_api_key_cache = cache # save the api key cache for updating the value
|
self.user_api_key_cache = cache # save the api key cache for updating the value
|
||||||
|
# ------------
|
||||||
|
# Setup values
|
||||||
|
# ------------
|
||||||
|
|
||||||
|
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||||
|
current_hour = datetime.now().strftime("%H")
|
||||||
|
current_minute = datetime.now().strftime("%M")
|
||||||
|
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||||
|
|
||||||
|
request_count_api_key = f"{api_key}::{precise_minute}::request_count"
|
||||||
|
|
||||||
# CHECK IF REQUEST ALLOWED
|
# CHECK IF REQUEST ALLOWED
|
||||||
request_count_api_key = f"{api_key}_request_count"
|
current = cache.get_cache(
|
||||||
current = cache.get_cache(key=request_count_api_key)
|
key=request_count_api_key
|
||||||
|
) # {"current_requests": 1, "current_tpm": 1, "current_rpm": 10}
|
||||||
self.print_verbose(f"current: {current}")
|
self.print_verbose(f"current: {current}")
|
||||||
if current is None:
|
if current is None:
|
||||||
cache.set_cache(request_count_api_key, 1)
|
new_val = {
|
||||||
elif int(current) < max_parallel_requests:
|
"current_requests": 1,
|
||||||
|
"current_tpm": 0,
|
||||||
|
"current_rpm": 0,
|
||||||
|
}
|
||||||
|
cache.set_cache(request_count_api_key, new_val)
|
||||||
|
elif (
|
||||||
|
int(current["current_requests"]) < max_parallel_requests
|
||||||
|
and current["current_tpm"] < tpm_limit
|
||||||
|
and current["current_rpm"] < rpm_limit
|
||||||
|
):
|
||||||
# Increase count for this token
|
# Increase count for this token
|
||||||
cache.set_cache(request_count_api_key, int(current) + 1)
|
new_val = {
|
||||||
|
"current_requests": current["current_requests"] + 1,
|
||||||
|
"current_tpm": current["current_tpm"],
|
||||||
|
"current_rpm": current["current_rpm"],
|
||||||
|
}
|
||||||
|
cache.set_cache(request_count_api_key, new_val)
|
||||||
else:
|
else:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=429, detail="Max parallel request limit reached."
|
status_code=429, detail="Max parallel request limit reached."
|
||||||
|
@ -52,7 +90,7 @@ class MaxParallelRequestsHandler(CustomLogger):
|
||||||
|
|
||||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||||
try:
|
try:
|
||||||
self.print_verbose(f"INSIDE ASYNC SUCCESS LOGGING")
|
self.print_verbose(f"INSIDE parallel request limiter ASYNC SUCCESS LOGGING")
|
||||||
user_api_key = kwargs["litellm_params"]["metadata"]["user_api_key"]
|
user_api_key = kwargs["litellm_params"]["metadata"]["user_api_key"]
|
||||||
if user_api_key is None:
|
if user_api_key is None:
|
||||||
return
|
return
|
||||||
|
@ -60,29 +98,50 @@ class MaxParallelRequestsHandler(CustomLogger):
|
||||||
if self.user_api_key_cache is None:
|
if self.user_api_key_cache is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
request_count_api_key = f"{user_api_key}_request_count"
|
# ------------
|
||||||
# check if it has collected an entire stream response
|
# Setup values
|
||||||
self.print_verbose(
|
# ------------
|
||||||
f"'complete_streaming_response' is in kwargs: {'complete_streaming_response' in kwargs}"
|
|
||||||
)
|
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||||
if "complete_streaming_response" in kwargs or kwargs["stream"] != True:
|
current_hour = datetime.now().strftime("%H")
|
||||||
# Decrease count for this token
|
current_minute = datetime.now().strftime("%M")
|
||||||
current = (
|
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||||
self.user_api_key_cache.get_cache(key=request_count_api_key) or 1
|
|
||||||
)
|
total_tokens = 0
|
||||||
new_val = current - 1
|
|
||||||
self.print_verbose(f"updated_value in success call: {new_val}")
|
if isinstance(response_obj, ModelResponse):
|
||||||
self.user_api_key_cache.set_cache(request_count_api_key, new_val)
|
total_tokens = response_obj.usage.total_tokens
|
||||||
|
|
||||||
|
request_count_api_key = f"{user_api_key}::{precise_minute}::request_count"
|
||||||
|
|
||||||
|
current = self.user_api_key_cache.get_cache(key=request_count_api_key) or {
|
||||||
|
"current_requests": 1,
|
||||||
|
"current_tpm": total_tokens,
|
||||||
|
"current_rpm": 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
# ------------
|
||||||
|
# Update usage
|
||||||
|
# ------------
|
||||||
|
|
||||||
|
new_val = {
|
||||||
|
"current_requests": current["current_requests"] - 1,
|
||||||
|
"current_tpm": current["current_tpm"] + total_tokens,
|
||||||
|
"current_rpm": current["current_rpm"] + 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
self.print_verbose(f"updated_value in success call: {new_val}")
|
||||||
|
self.user_api_key_cache.set_cache(
|
||||||
|
request_count_api_key, new_val, ttl=60
|
||||||
|
) # store in cache for 1 min.
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.print_verbose(e) # noqa
|
self.print_verbose(e) # noqa
|
||||||
|
|
||||||
async def async_log_failure_call(
|
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||||
self, user_api_key_dict: UserAPIKeyAuth, original_exception: Exception
|
|
||||||
):
|
|
||||||
try:
|
try:
|
||||||
self.print_verbose(f"Inside Max Parallel Request Failure Hook")
|
self.print_verbose(f"Inside Max Parallel Request Failure Hook")
|
||||||
api_key = user_api_key_dict.api_key
|
user_api_key = kwargs["litellm_params"]["metadata"]["user_api_key"]
|
||||||
if api_key is None:
|
if user_api_key is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.user_api_key_cache is None:
|
if self.user_api_key_cache is None:
|
||||||
|
@ -90,19 +149,46 @@ class MaxParallelRequestsHandler(CustomLogger):
|
||||||
|
|
||||||
## decrement call count if call failed
|
## decrement call count if call failed
|
||||||
if (
|
if (
|
||||||
hasattr(original_exception, "status_code")
|
hasattr(kwargs["exception"], "status_code")
|
||||||
and original_exception.status_code == 429
|
and kwargs["exception"].status_code == 429
|
||||||
and "Max parallel request limit reached" in str(original_exception)
|
and "Max parallel request limit reached" in str(kwargs["exception"])
|
||||||
):
|
):
|
||||||
pass # ignore failed calls due to max limit being reached
|
pass # ignore failed calls due to max limit being reached
|
||||||
else:
|
else:
|
||||||
request_count_api_key = f"{api_key}_request_count"
|
# ------------
|
||||||
# Decrease count for this token
|
# Setup values
|
||||||
current = (
|
# ------------
|
||||||
self.user_api_key_cache.get_cache(key=request_count_api_key) or 1
|
|
||||||
|
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||||
|
current_hour = datetime.now().strftime("%H")
|
||||||
|
current_minute = datetime.now().strftime("%M")
|
||||||
|
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||||
|
|
||||||
|
request_count_api_key = (
|
||||||
|
f"{user_api_key}::{precise_minute}::request_count"
|
||||||
)
|
)
|
||||||
new_val = current - 1
|
|
||||||
|
# ------------
|
||||||
|
# Update usage
|
||||||
|
# ------------
|
||||||
|
|
||||||
|
current = self.user_api_key_cache.get_cache(
|
||||||
|
key=request_count_api_key
|
||||||
|
) or {
|
||||||
|
"current_requests": 1,
|
||||||
|
"current_tpm": 0,
|
||||||
|
"current_rpm": 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
new_val = {
|
||||||
|
"current_requests": current["current_requests"] - 1,
|
||||||
|
"current_tpm": current["current_tpm"],
|
||||||
|
"current_rpm": current["current_rpm"],
|
||||||
|
}
|
||||||
|
|
||||||
self.print_verbose(f"updated_value in failure call: {new_val}")
|
self.print_verbose(f"updated_value in failure call: {new_val}")
|
||||||
self.user_api_key_cache.set_cache(request_count_api_key, new_val)
|
self.user_api_key_cache.set_cache(
|
||||||
|
request_count_api_key, new_val, ttl=60
|
||||||
|
) # save in cache for up to 1 min.
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.print_verbose(f"An exception occurred - {str(e)}") # noqa
|
print(f"An exception occurred - {str(e)}") # noqa
|
||||||
|
|
|
@ -157,6 +157,12 @@ def is_port_in_use(port):
|
||||||
type=int,
|
type=int,
|
||||||
help="Number of requests to hit async endpoint with",
|
help="Number of requests to hit async endpoint with",
|
||||||
)
|
)
|
||||||
|
@click.option(
|
||||||
|
"--run_gunicorn",
|
||||||
|
default=False,
|
||||||
|
is_flag=True,
|
||||||
|
help="Starts proxy via gunicorn, instead of uvicorn (better for managing multiple workers)",
|
||||||
|
)
|
||||||
@click.option("--local", is_flag=True, default=False, help="for local debugging")
|
@click.option("--local", is_flag=True, default=False, help="for local debugging")
|
||||||
def run_server(
|
def run_server(
|
||||||
host,
|
host,
|
||||||
|
@ -186,21 +192,32 @@ def run_server(
|
||||||
use_queue,
|
use_queue,
|
||||||
health,
|
health,
|
||||||
version,
|
version,
|
||||||
|
run_gunicorn,
|
||||||
):
|
):
|
||||||
global feature_telemetry
|
global feature_telemetry
|
||||||
args = locals()
|
args = locals()
|
||||||
if local:
|
if local:
|
||||||
from proxy_server import app, save_worker_config, usage_telemetry
|
from proxy_server import app, save_worker_config, usage_telemetry, ProxyConfig
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
from .proxy_server import app, save_worker_config, usage_telemetry
|
from .proxy_server import (
|
||||||
|
app,
|
||||||
|
save_worker_config,
|
||||||
|
usage_telemetry,
|
||||||
|
ProxyConfig,
|
||||||
|
)
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
if "litellm[proxy]" in str(e):
|
if "litellm[proxy]" in str(e):
|
||||||
# user is missing a proxy dependency, ask them to pip install litellm[proxy]
|
# user is missing a proxy dependency, ask them to pip install litellm[proxy]
|
||||||
raise e
|
raise e
|
||||||
else:
|
else:
|
||||||
# this is just a local/relative import error, user git cloned litellm
|
# this is just a local/relative import error, user git cloned litellm
|
||||||
from proxy_server import app, save_worker_config, usage_telemetry
|
from proxy_server import (
|
||||||
|
app,
|
||||||
|
save_worker_config,
|
||||||
|
usage_telemetry,
|
||||||
|
ProxyConfig,
|
||||||
|
)
|
||||||
feature_telemetry = usage_telemetry
|
feature_telemetry = usage_telemetry
|
||||||
if version == True:
|
if version == True:
|
||||||
pkg_version = importlib.metadata.version("litellm")
|
pkg_version = importlib.metadata.version("litellm")
|
||||||
|
@ -373,16 +390,16 @@ def run_server(
|
||||||
read from there and save it to os.env['DATABASE_URL']
|
read from there and save it to os.env['DATABASE_URL']
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
import yaml
|
import yaml, asyncio
|
||||||
except:
|
except:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"yaml needs to be imported. Run - `pip install 'litellm[proxy]'`"
|
"yaml needs to be imported. Run - `pip install 'litellm[proxy]'`"
|
||||||
)
|
)
|
||||||
|
|
||||||
if os.path.exists(config):
|
proxy_config = ProxyConfig()
|
||||||
with open(config, "r") as config_file:
|
_, _, general_settings = asyncio.run(
|
||||||
config = yaml.safe_load(config_file)
|
proxy_config.load_config(router=None, config_file_path=config)
|
||||||
general_settings = config.get("general_settings", {})
|
)
|
||||||
database_url = general_settings.get("database_url", None)
|
database_url = general_settings.get("database_url", None)
|
||||||
if database_url and database_url.startswith("os.environ/"):
|
if database_url and database_url.startswith("os.environ/"):
|
||||||
original_dir = os.getcwd()
|
original_dir = os.getcwd()
|
||||||
|
@ -418,6 +435,7 @@ def run_server(
|
||||||
break # Exit the loop if the subprocess succeeds
|
break # Exit the loop if the subprocess succeeds
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
print(f"Error: {e}")
|
print(f"Error: {e}")
|
||||||
|
time.sleep(random.randrange(start=1, stop=5))
|
||||||
finally:
|
finally:
|
||||||
os.chdir(original_dir)
|
os.chdir(original_dir)
|
||||||
else:
|
else:
|
||||||
|
@ -428,9 +446,9 @@ def run_server(
|
||||||
port = random.randint(1024, 49152)
|
port = random.randint(1024, 49152)
|
||||||
from litellm.proxy.proxy_server import app
|
from litellm.proxy.proxy_server import app
|
||||||
|
|
||||||
if os.name == "nt":
|
if run_gunicorn == False:
|
||||||
uvicorn.run(app, host=host, port=port) # run uvicorn
|
uvicorn.run(app, host=host, port=port) # run uvicorn
|
||||||
else:
|
elif run_gunicorn == True:
|
||||||
import gunicorn.app.base
|
import gunicorn.app.base
|
||||||
|
|
||||||
# Gunicorn Application Class
|
# Gunicorn Application Class
|
||||||
|
|
|
@ -11,6 +11,12 @@ model_list:
|
||||||
output_cost_per_token: 0.00003
|
output_cost_per_token: 0.00003
|
||||||
max_tokens: 4096
|
max_tokens: 4096
|
||||||
base_model: gpt-3.5-turbo
|
base_model: gpt-3.5-turbo
|
||||||
|
- model_name: gpt-4
|
||||||
|
litellm_params:
|
||||||
|
model: azure/chatgpt-v-2
|
||||||
|
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
|
||||||
|
api_version: "2023-05-15"
|
||||||
|
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
|
||||||
- model_name: gpt-vision
|
- model_name: gpt-vision
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: azure/gpt-4-vision
|
model: azure/gpt-4-vision
|
||||||
|
@ -25,6 +31,9 @@ model_list:
|
||||||
- model_name: BEDROCK_GROUP
|
- model_name: BEDROCK_GROUP
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: bedrock/cohere.command-text-v14
|
model: bedrock/cohere.command-text-v14
|
||||||
|
- model_name: tg-ai
|
||||||
|
litellm_params:
|
||||||
|
model: together_ai/mistralai/Mistral-7B-Instruct-v0.1
|
||||||
- model_name: sagemaker
|
- model_name: sagemaker
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
|
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
|
||||||
|
@ -57,12 +66,21 @@ model_list:
|
||||||
mode: embedding
|
mode: embedding
|
||||||
litellm_settings:
|
litellm_settings:
|
||||||
fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
|
fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
|
||||||
|
success_callback: ['langfuse']
|
||||||
|
max_budget: 10 # global budget for proxy
|
||||||
|
budget_duration: 30d # global budget duration, will reset after 30d
|
||||||
|
default_key_generate_params:
|
||||||
|
max_budget: 1.5000
|
||||||
|
models: ["azure-gpt-3.5"]
|
||||||
|
duration: None
|
||||||
# cache: True
|
# cache: True
|
||||||
# setting callback class
|
# setting callback class
|
||||||
# callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
|
# callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
|
||||||
|
|
||||||
# general_settings:
|
general_settings:
|
||||||
# master_key: sk-1234
|
master_key: sk-1234
|
||||||
|
alerting: ["slack"]
|
||||||
|
alerting_threshold: 10 # sends alerts if requests hang for 10 seconds
|
||||||
# database_type: "dynamo_db"
|
# database_type: "dynamo_db"
|
||||||
# database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
|
# database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
|
||||||
# "billing_mode": "PAY_PER_REQUEST",
|
# "billing_mode": "PAY_PER_REQUEST",
|
||||||
|
|
|
@ -7,28 +7,64 @@ generator client {
|
||||||
provider = "prisma-client-py"
|
provider = "prisma-client-py"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Track spend, rate limit, budget Users
|
||||||
model LiteLLM_UserTable {
|
model LiteLLM_UserTable {
|
||||||
user_id String @unique
|
user_id String @unique
|
||||||
|
team_id String?
|
||||||
max_budget Float?
|
max_budget Float?
|
||||||
spend Float @default(0.0)
|
spend Float @default(0.0)
|
||||||
user_email String?
|
user_email String?
|
||||||
models String[] @default([])
|
models String[]
|
||||||
|
max_parallel_requests Int?
|
||||||
|
tpm_limit BigInt?
|
||||||
|
rpm_limit BigInt?
|
||||||
|
budget_duration String?
|
||||||
|
budget_reset_at DateTime?
|
||||||
|
allowed_cache_controls String[] @default([])
|
||||||
}
|
}
|
||||||
|
|
||||||
// required for token gen
|
// Generate Tokens for Proxy
|
||||||
model LiteLLM_VerificationToken {
|
model LiteLLM_VerificationToken {
|
||||||
token String @unique
|
token String @unique
|
||||||
|
key_name String?
|
||||||
|
key_alias String?
|
||||||
spend Float @default(0.0)
|
spend Float @default(0.0)
|
||||||
expires DateTime?
|
expires DateTime?
|
||||||
models String[] @default([])
|
models String[]
|
||||||
aliases Json @default("{}")
|
aliases Json @default("{}")
|
||||||
config Json @default("{}")
|
config Json @default("{}")
|
||||||
user_id String?
|
user_id String?
|
||||||
|
team_id String?
|
||||||
max_parallel_requests Int?
|
max_parallel_requests Int?
|
||||||
metadata Json @default("{}")
|
metadata Json @default("{}")
|
||||||
|
tpm_limit BigInt?
|
||||||
|
rpm_limit BigInt?
|
||||||
|
max_budget Float?
|
||||||
|
budget_duration String?
|
||||||
|
budget_reset_at DateTime?
|
||||||
|
allowed_cache_controls String[] @default([])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// store proxy config.yaml
|
||||||
model LiteLLM_Config {
|
model LiteLLM_Config {
|
||||||
param_name String @id
|
param_name String @id
|
||||||
param_value Json?
|
param_value Json?
|
||||||
|
}
|
||||||
|
|
||||||
|
// View spend, model, api_key per request
|
||||||
|
model LiteLLM_SpendLogs {
|
||||||
|
request_id String @unique
|
||||||
|
call_type String
|
||||||
|
api_key String @default ("")
|
||||||
|
spend Float @default(0.0)
|
||||||
|
total_tokens Int @default(0)
|
||||||
|
prompt_tokens Int @default(0)
|
||||||
|
completion_tokens Int @default(0)
|
||||||
|
startTime DateTime // Assuming start_time is a DateTime field
|
||||||
|
endTime DateTime // Assuming end_time is a DateTime field
|
||||||
|
model String @default("")
|
||||||
|
user String @default("")
|
||||||
|
metadata Json @default("{}")
|
||||||
|
cache_hit String @default("")
|
||||||
|
cache_key String @default("")
|
||||||
}
|
}
|
|
@ -11,12 +11,10 @@ async def litellm_completion():
|
||||||
# Your existing code for litellm_completion goes here
|
# Your existing code for litellm_completion goes here
|
||||||
try:
|
try:
|
||||||
response = await litellm_client.chat.completions.create(
|
response = await litellm_client.chat.completions.create(
|
||||||
model="Azure OpenAI GPT-4 Canada-East (External)",
|
model="azure-gpt-3.5",
|
||||||
stream=True,
|
|
||||||
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
|
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
|
||||||
)
|
)
|
||||||
async for chunk in response:
|
print(response)
|
||||||
print(chunk)
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -27,9 +25,9 @@ async def litellm_completion():
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
for i in range(1000000):
|
for i in range(150):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
n = 1000 # Number of concurrent tasks
|
n = 150 # Number of concurrent tasks
|
||||||
tasks = [litellm_completion() for _ in range(n)]
|
tasks = [litellm_completion() for _ in range(n)]
|
||||||
|
|
||||||
chat_completions = await asyncio.gather(*tasks)
|
chat_completions = await asyncio.gather(*tasks)
|
||||||
|
|
|
@ -4,22 +4,28 @@ const openai = require('openai');
|
||||||
process.env.DEBUG=false;
|
process.env.DEBUG=false;
|
||||||
async function runOpenAI() {
|
async function runOpenAI() {
|
||||||
const client = new openai.OpenAI({
|
const client = new openai.OpenAI({
|
||||||
apiKey: 'your_api_key_here',
|
apiKey: 'sk-JkKeNi6WpWDngBsghJ6B9g',
|
||||||
baseURL: 'http://0.0.0.0:8000'
|
baseURL: 'http://0.0.0.0:8000'
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await client.chat.completions.create({
|
const response = await client.chat.completions.create({
|
||||||
model: 'azure-gpt-3.5',
|
model: 'sagemaker',
|
||||||
|
stream: true,
|
||||||
|
max_tokens: 1000,
|
||||||
messages: [
|
messages: [
|
||||||
{
|
{
|
||||||
role: 'user',
|
role: 'user',
|
||||||
content: 'this is a test request, write a short poem'.repeat(2000),
|
content: 'write a 20 pg essay about YC ',
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response);
|
console.log(response);
|
||||||
|
for await (const chunk of response) {
|
||||||
|
console.log(chunk);
|
||||||
|
console.log(chunk.choices[0].delta.content);
|
||||||
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.log("got this exception from server");
|
console.log("got this exception from server");
|
||||||
console.error(error);
|
console.error(error);
|
||||||
|
|
|
@ -1,21 +1,28 @@
|
||||||
from typing import Optional, List, Any, Literal, Union
|
from typing import Optional, List, Any, Literal, Union
|
||||||
import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx
|
import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx
|
||||||
import litellm, backoff
|
import litellm, backoff
|
||||||
from litellm.proxy._types import UserAPIKeyAuth, DynamoDBArgs, LiteLLM_VerificationToken
|
from litellm.proxy._types import (
|
||||||
|
UserAPIKeyAuth,
|
||||||
|
DynamoDBArgs,
|
||||||
|
LiteLLM_VerificationToken,
|
||||||
|
LiteLLM_SpendLogs,
|
||||||
|
)
|
||||||
from litellm.caching import DualCache
|
from litellm.caching import DualCache
|
||||||
from litellm.proxy.hooks.parallel_request_limiter import MaxParallelRequestsHandler
|
from litellm.proxy.hooks.parallel_request_limiter import MaxParallelRequestsHandler
|
||||||
from litellm.proxy.hooks.max_budget_limiter import MaxBudgetLimiter
|
from litellm.proxy.hooks.max_budget_limiter import MaxBudgetLimiter
|
||||||
|
from litellm.proxy.hooks.cache_control_check import CacheControlCheck
|
||||||
from litellm.integrations.custom_logger import CustomLogger
|
from litellm.integrations.custom_logger import CustomLogger
|
||||||
from litellm.proxy.db.base_client import CustomDB
|
from litellm.proxy.db.base_client import CustomDB
|
||||||
from litellm._logging import verbose_proxy_logger
|
from litellm._logging import verbose_proxy_logger
|
||||||
from fastapi import HTTPException, status
|
from fastapi import HTTPException, status
|
||||||
import smtplib
|
import smtplib, re
|
||||||
from email.mime.text import MIMEText
|
from email.mime.text import MIMEText
|
||||||
from email.mime.multipart import MIMEMultipart
|
from email.mime.multipart import MIMEMultipart
|
||||||
from datetime import datetime
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
|
||||||
def print_verbose(print_statement):
|
def print_verbose(print_statement):
|
||||||
|
verbose_proxy_logger.debug(print_statement)
|
||||||
if litellm.set_verbose:
|
if litellm.set_verbose:
|
||||||
print(f"LiteLLM Proxy: {print_statement}") # noqa
|
print(f"LiteLLM Proxy: {print_statement}") # noqa
|
||||||
|
|
||||||
|
@ -36,6 +43,7 @@ class ProxyLogging:
|
||||||
self.call_details["user_api_key_cache"] = user_api_key_cache
|
self.call_details["user_api_key_cache"] = user_api_key_cache
|
||||||
self.max_parallel_request_limiter = MaxParallelRequestsHandler()
|
self.max_parallel_request_limiter = MaxParallelRequestsHandler()
|
||||||
self.max_budget_limiter = MaxBudgetLimiter()
|
self.max_budget_limiter = MaxBudgetLimiter()
|
||||||
|
self.cache_control_check = CacheControlCheck()
|
||||||
self.alerting: Optional[List] = None
|
self.alerting: Optional[List] = None
|
||||||
self.alerting_threshold: float = 300 # default to 5 min. threshold
|
self.alerting_threshold: float = 300 # default to 5 min. threshold
|
||||||
pass
|
pass
|
||||||
|
@ -51,6 +59,7 @@ class ProxyLogging:
|
||||||
print_verbose(f"INITIALIZING LITELLM CALLBACKS!")
|
print_verbose(f"INITIALIZING LITELLM CALLBACKS!")
|
||||||
litellm.callbacks.append(self.max_parallel_request_limiter)
|
litellm.callbacks.append(self.max_parallel_request_limiter)
|
||||||
litellm.callbacks.append(self.max_budget_limiter)
|
litellm.callbacks.append(self.max_budget_limiter)
|
||||||
|
litellm.callbacks.append(self.cache_control_check)
|
||||||
for callback in litellm.callbacks:
|
for callback in litellm.callbacks:
|
||||||
if callback not in litellm.input_callback:
|
if callback not in litellm.input_callback:
|
||||||
litellm.input_callback.append(callback)
|
litellm.input_callback.append(callback)
|
||||||
|
@ -91,8 +100,9 @@ class ProxyLogging:
|
||||||
2. /embeddings
|
2. /embeddings
|
||||||
3. /image/generation
|
3. /image/generation
|
||||||
"""
|
"""
|
||||||
|
print_verbose(f"Inside Proxy Logging Pre-call hook!")
|
||||||
### ALERTING ###
|
### ALERTING ###
|
||||||
asyncio.create_task(self.response_taking_too_long())
|
asyncio.create_task(self.response_taking_too_long(request_data=data))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for callback in litellm.callbacks:
|
for callback in litellm.callbacks:
|
||||||
|
@ -132,27 +142,113 @@ class ProxyLogging:
|
||||||
start_time: Optional[float] = None,
|
start_time: Optional[float] = None,
|
||||||
end_time: Optional[float] = None,
|
end_time: Optional[float] = None,
|
||||||
type: Literal["hanging_request", "slow_response"] = "hanging_request",
|
type: Literal["hanging_request", "slow_response"] = "hanging_request",
|
||||||
|
request_data: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
|
if request_data is not None:
|
||||||
|
model = request_data.get("model", "")
|
||||||
|
messages = request_data.get("messages", "")
|
||||||
|
# try casting messages to str and keep the first 10000 characters, else mark as None
|
||||||
|
try:
|
||||||
|
messages = str(messages)
|
||||||
|
messages = messages[:10000]
|
||||||
|
except:
|
||||||
|
messages = None
|
||||||
|
|
||||||
|
request_info = f"\nRequest Model: {model}\nMessages: {messages}"
|
||||||
|
else:
|
||||||
|
request_info = ""
|
||||||
|
|
||||||
if type == "hanging_request":
|
if type == "hanging_request":
|
||||||
# Simulate a long-running operation that could take more than 5 minutes
|
# Simulate a long-running operation that could take more than 5 minutes
|
||||||
await asyncio.sleep(
|
await asyncio.sleep(
|
||||||
self.alerting_threshold
|
self.alerting_threshold
|
||||||
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
|
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
|
||||||
|
if (
|
||||||
await self.alerting_handler(
|
request_data is not None
|
||||||
message=f"Requests are hanging - {self.alerting_threshold}s+ request time",
|
and request_data.get("litellm_status", "") != "success"
|
||||||
level="Medium",
|
):
|
||||||
)
|
# only alert hanging responses if they have not been marked as success
|
||||||
|
alerting_message = (
|
||||||
|
f"Requests are hanging - {self.alerting_threshold}s+ request time"
|
||||||
|
)
|
||||||
|
await self.alerting_handler(
|
||||||
|
message=alerting_message + request_info,
|
||||||
|
level="Medium",
|
||||||
|
)
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
type == "slow_response" and start_time is not None and end_time is not None
|
type == "slow_response" and start_time is not None and end_time is not None
|
||||||
):
|
):
|
||||||
|
slow_message = f"Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s"
|
||||||
if end_time - start_time > self.alerting_threshold:
|
if end_time - start_time > self.alerting_threshold:
|
||||||
await self.alerting_handler(
|
await self.alerting_handler(
|
||||||
message=f"Responses are slow - {round(end_time-start_time,2)}s response time",
|
message=slow_message + request_info,
|
||||||
level="Low",
|
level="Low",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def budget_alerts(
|
||||||
|
self,
|
||||||
|
type: Literal["token_budget", "user_budget", "user_and_proxy_budget"],
|
||||||
|
user_max_budget: float,
|
||||||
|
user_current_spend: float,
|
||||||
|
user_info=None,
|
||||||
|
):
|
||||||
|
if self.alerting is None:
|
||||||
|
# do nothing if alerting is not switched on
|
||||||
|
return
|
||||||
|
|
||||||
|
if type == "user_and_proxy_budget":
|
||||||
|
user_info = dict(user_info)
|
||||||
|
user_id = user_info["user_id"]
|
||||||
|
max_budget = user_info["max_budget"]
|
||||||
|
spend = user_info["spend"]
|
||||||
|
user_email = user_info["user_email"]
|
||||||
|
user_info = f"""\nUser ID: {user_id}\nMax Budget: ${max_budget}\nSpend: ${spend}\nUser Email: {user_email}"""
|
||||||
|
elif type == "token_budget":
|
||||||
|
token_info = dict(user_info)
|
||||||
|
token = token_info["token"]
|
||||||
|
spend = token_info["spend"]
|
||||||
|
max_budget = token_info["max_budget"]
|
||||||
|
user_id = token_info["user_id"]
|
||||||
|
user_info = f"""\nToken: {token}\nSpend: ${spend}\nMax Budget: ${max_budget}\nUser ID: {user_id}"""
|
||||||
|
else:
|
||||||
|
user_info = str(user_info)
|
||||||
|
# percent of max_budget left to spend
|
||||||
|
percent_left = (user_max_budget - user_current_spend) / user_max_budget
|
||||||
|
verbose_proxy_logger.debug(
|
||||||
|
f"Budget Alerts: Percent left: {percent_left} for {user_info}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# check if crossed budget
|
||||||
|
if user_current_spend >= user_max_budget:
|
||||||
|
verbose_proxy_logger.debug(f"Budget Crossed for {user_info}")
|
||||||
|
message = "Budget Crossed for" + user_info
|
||||||
|
await self.alerting_handler(
|
||||||
|
message=message,
|
||||||
|
level="High",
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
# check if 5% of max budget is left
|
||||||
|
if percent_left <= 0.05:
|
||||||
|
message = "5% budget left for" + user_info
|
||||||
|
await self.alerting_handler(
|
||||||
|
message=message,
|
||||||
|
level="Medium",
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
# check if 15% of max budget is left
|
||||||
|
if percent_left <= 0.15:
|
||||||
|
message = "15% budget left for" + user_info
|
||||||
|
await self.alerting_handler(
|
||||||
|
message=message,
|
||||||
|
level="Low",
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
async def alerting_handler(
|
async def alerting_handler(
|
||||||
self, message: str, level: Literal["Low", "Medium", "High"]
|
self, message: str, level: Literal["Low", "Medium", "High"]
|
||||||
):
|
):
|
||||||
|
@ -163,12 +259,20 @@ class ProxyLogging:
|
||||||
- Requests are hanging
|
- Requests are hanging
|
||||||
- Calls are failing
|
- Calls are failing
|
||||||
- DB Read/Writes are failing
|
- DB Read/Writes are failing
|
||||||
|
- Proxy Close to max budget
|
||||||
|
- Key Close to max budget
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); 'Low' is used for slow-response alerts.
|
level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); 'Low' is used for slow-response alerts.
|
||||||
message: str - what is the alert about
|
message: str - what is the alert about
|
||||||
"""
|
"""
|
||||||
formatted_message = f"Level: {level}\n\nMessage: {message}"
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Get the current timestamp
|
||||||
|
current_time = datetime.now().strftime("%H:%M:%S")
|
||||||
|
formatted_message = (
|
||||||
|
f"Level: {level}\nTimestamp: {current_time}\n\nMessage: {message}"
|
||||||
|
)
|
||||||
if self.alerting is None:
|
if self.alerting is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -179,7 +283,9 @@ class ProxyLogging:
|
||||||
raise Exception("Missing SLACK_WEBHOOK_URL from environment")
|
raise Exception("Missing SLACK_WEBHOOK_URL from environment")
|
||||||
payload = {"text": formatted_message}
|
payload = {"text": formatted_message}
|
||||||
headers = {"Content-type": "application/json"}
|
headers = {"Content-type": "application/json"}
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(
|
||||||
|
connector=aiohttp.TCPConnector(ssl=False)
|
||||||
|
) as session:
|
||||||
async with session.post(
|
async with session.post(
|
||||||
slack_webhook_url, json=payload, headers=headers
|
slack_webhook_url, json=payload, headers=headers
|
||||||
) as response:
|
) as response:
|
||||||
|
@ -316,7 +422,7 @@ class PrismaClient:
|
||||||
self,
|
self,
|
||||||
key: str,
|
key: str,
|
||||||
value: Any,
|
value: Any,
|
||||||
table_name: Literal["users", "keys", "config"],
|
table_name: Literal["users", "keys", "config", "spend"],
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Generic implementation of get data
|
Generic implementation of get data
|
||||||
|
@ -334,6 +440,10 @@ class PrismaClient:
|
||||||
response = await self.db.litellm_config.find_first( # type: ignore
|
response = await self.db.litellm_config.find_first( # type: ignore
|
||||||
where={key: value} # type: ignore
|
where={key: value} # type: ignore
|
||||||
)
|
)
|
||||||
|
elif table_name == "spend":
|
||||||
|
response = await self.db.l.find_first( # type: ignore
|
||||||
|
where={key: value} # type: ignore
|
||||||
|
)
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
|
@ -352,8 +462,12 @@ class PrismaClient:
|
||||||
self,
|
self,
|
||||||
token: Optional[str] = None,
|
token: Optional[str] = None,
|
||||||
user_id: Optional[str] = None,
|
user_id: Optional[str] = None,
|
||||||
table_name: Optional[Literal["user", "key", "config"]] = None,
|
user_id_list: Optional[list] = None,
|
||||||
|
key_val: Optional[dict] = None,
|
||||||
|
table_name: Optional[Literal["user", "key", "config", "spend"]] = None,
|
||||||
query_type: Literal["find_unique", "find_all"] = "find_unique",
|
query_type: Literal["find_unique", "find_all"] = "find_unique",
|
||||||
|
expires: Optional[datetime] = None,
|
||||||
|
reset_at: Optional[datetime] = None,
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
print_verbose("PrismaClient: get_data")
|
print_verbose("PrismaClient: get_data")
|
||||||
|
@ -365,14 +479,18 @@ class PrismaClient:
|
||||||
hashed_token = token
|
hashed_token = token
|
||||||
if token.startswith("sk-"):
|
if token.startswith("sk-"):
|
||||||
hashed_token = self.hash_token(token=token)
|
hashed_token = self.hash_token(token=token)
|
||||||
print_verbose("PrismaClient: find_unique")
|
verbose_proxy_logger.debug(
|
||||||
|
f"PrismaClient: find_unique for token: {hashed_token}"
|
||||||
|
)
|
||||||
if query_type == "find_unique":
|
if query_type == "find_unique":
|
||||||
response = await self.db.litellm_verificationtoken.find_unique(
|
response = await self.db.litellm_verificationtoken.find_unique(
|
||||||
where={"token": hashed_token}
|
where={"token": hashed_token}
|
||||||
)
|
)
|
||||||
if response is not None:
|
if response is not None:
|
||||||
# for prisma we need to cast the expires time to str
|
# for prisma we need to cast the expires time to str
|
||||||
if isinstance(response.expires, datetime):
|
if response.expires is not None and isinstance(
|
||||||
|
response.expires, datetime
|
||||||
|
):
|
||||||
response.expires = response.expires.isoformat()
|
response.expires = response.expires.isoformat()
|
||||||
elif query_type == "find_all" and user_id is not None:
|
elif query_type == "find_all" and user_id is not None:
|
||||||
response = await self.db.litellm_verificationtoken.find_many(
|
response = await self.db.litellm_verificationtoken.find_many(
|
||||||
|
@ -382,6 +500,28 @@ class PrismaClient:
|
||||||
for r in response:
|
for r in response:
|
||||||
if isinstance(r.expires, datetime):
|
if isinstance(r.expires, datetime):
|
||||||
r.expires = r.expires.isoformat()
|
r.expires = r.expires.isoformat()
|
||||||
|
elif (
|
||||||
|
query_type == "find_all"
|
||||||
|
and expires is not None
|
||||||
|
and reset_at is not None
|
||||||
|
):
|
||||||
|
response = await self.db.litellm_verificationtoken.find_many(
|
||||||
|
where={ # type:ignore
|
||||||
|
"OR": [
|
||||||
|
{"expires": None},
|
||||||
|
{"expires": {"gt": expires}},
|
||||||
|
],
|
||||||
|
"budget_reset_at": {"lt": reset_at},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
if response is not None and len(response) > 0:
|
||||||
|
for r in response:
|
||||||
|
if isinstance(r.expires, datetime):
|
||||||
|
r.expires = r.expires.isoformat()
|
||||||
|
elif query_type == "find_all":
|
||||||
|
response = await self.db.litellm_verificationtoken.find_many(
|
||||||
|
order={"spend": "desc"},
|
||||||
|
)
|
||||||
print_verbose(f"PrismaClient: response={response}")
|
print_verbose(f"PrismaClient: response={response}")
|
||||||
if response is not None:
|
if response is not None:
|
||||||
return response
|
return response
|
||||||
|
@ -391,13 +531,61 @@ class PrismaClient:
|
||||||
status_code=status.HTTP_401_UNAUTHORIZED,
|
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||||
detail="Authentication Error: invalid user key - token does not exist",
|
detail="Authentication Error: invalid user key - token does not exist",
|
||||||
)
|
)
|
||||||
elif user_id is not None:
|
elif user_id is not None or (
|
||||||
response = await self.db.litellm_usertable.find_unique( # type: ignore
|
table_name is not None and table_name == "user"
|
||||||
where={
|
):
|
||||||
"user_id": user_id,
|
if query_type == "find_unique":
|
||||||
}
|
response = await self.db.litellm_usertable.find_unique( # type: ignore
|
||||||
)
|
where={
|
||||||
|
"user_id": user_id, # type: ignore
|
||||||
|
}
|
||||||
|
)
|
||||||
|
elif query_type == "find_all" and reset_at is not None:
|
||||||
|
response = await self.db.litellm_usertable.find_many(
|
||||||
|
where={ # type:ignore
|
||||||
|
"budget_reset_at": {"lt": reset_at},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
elif query_type == "find_all" and user_id_list is not None:
|
||||||
|
user_id_values = str(tuple(user_id_list))
|
||||||
|
sql_query = f"""
|
||||||
|
SELECT *
|
||||||
|
FROM "LiteLLM_UserTable"
|
||||||
|
WHERE "user_id" IN {user_id_values}
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Execute the raw query
|
||||||
|
# user_id_values is already interpolated into the SQL text above, so no separate query arguments are passed
|
||||||
|
response = await self.db.query_raw(sql_query)
|
||||||
|
elif query_type == "find_all":
|
||||||
|
response = await self.db.litellm_usertable.find_many( # type: ignore
|
||||||
|
order={"spend": "desc"},
|
||||||
|
)
|
||||||
return response
|
return response
|
||||||
|
elif table_name == "spend":
|
||||||
|
verbose_proxy_logger.debug(
|
||||||
|
f"PrismaClient: get_data: table_name == 'spend'"
|
||||||
|
)
|
||||||
|
if key_val is not None:
|
||||||
|
if query_type == "find_unique":
|
||||||
|
response = await self.db.litellm_spendlogs.find_unique( # type: ignore
|
||||||
|
where={ # type: ignore
|
||||||
|
key_val["key"]: key_val["value"], # type: ignore
|
||||||
|
}
|
||||||
|
)
|
||||||
|
elif query_type == "find_all":
|
||||||
|
response = await self.db.litellm_spendlogs.find_many( # type: ignore
|
||||||
|
where={
|
||||||
|
key_val["key"]: key_val["value"], # type: ignore
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return response
|
||||||
|
else:
|
||||||
|
response = await self.db.litellm_spendlogs.find_many( # type: ignore
|
||||||
|
order={"startTime": "desc"},
|
||||||
|
)
|
||||||
|
return response
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print_verbose(f"LiteLLM Prisma Client Exception: {e}")
|
print_verbose(f"LiteLLM Prisma Client Exception: {e}")
|
||||||
import traceback
|
import traceback
|
||||||
|
@ -417,7 +605,7 @@ class PrismaClient:
|
||||||
on_backoff=on_backoff, # specifying the function to call on backoff
|
on_backoff=on_backoff, # specifying the function to call on backoff
|
||||||
)
|
)
|
||||||
async def insert_data(
|
async def insert_data(
|
||||||
self, data: dict, table_name: Literal["user", "key", "config"]
|
self, data: dict, table_name: Literal["user", "key", "config", "spend"]
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Add a key to the database. If it already exists, do nothing.
|
Add a key to the database. If it already exists, do nothing.
|
||||||
|
@ -440,6 +628,7 @@ class PrismaClient:
|
||||||
"update": {}, # don't do anything if it already exists
|
"update": {}, # don't do anything if it already exists
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
verbose_proxy_logger.info(f"Data Inserted into Keys Table")
|
||||||
return new_verification_token
|
return new_verification_token
|
||||||
elif table_name == "user":
|
elif table_name == "user":
|
||||||
db_data = self.jsonify_object(data=data)
|
db_data = self.jsonify_object(data=data)
|
||||||
|
@ -450,6 +639,7 @@ class PrismaClient:
|
||||||
"update": {}, # don't do anything if it already exists
|
"update": {}, # don't do anything if it already exists
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
verbose_proxy_logger.info(f"Data Inserted into User Table")
|
||||||
return new_user_row
|
return new_user_row
|
||||||
elif table_name == "config":
|
elif table_name == "config":
|
||||||
"""
|
"""
|
||||||
|
@ -473,8 +663,20 @@ class PrismaClient:
|
||||||
)
|
)
|
||||||
|
|
||||||
tasks.append(updated_table_row)
|
tasks.append(updated_table_row)
|
||||||
|
|
||||||
await asyncio.gather(*tasks)
|
await asyncio.gather(*tasks)
|
||||||
|
verbose_proxy_logger.info(f"Data Inserted into Config Table")
|
||||||
|
elif table_name == "spend":
|
||||||
|
db_data = self.jsonify_object(data=data)
|
||||||
|
new_spend_row = await self.db.litellm_spendlogs.upsert(
|
||||||
|
where={"request_id": data["request_id"]},
|
||||||
|
data={
|
||||||
|
"create": {**db_data}, # type: ignore
|
||||||
|
"update": {}, # don't do anything if it already exists
|
||||||
|
},
|
||||||
|
)
|
||||||
|
verbose_proxy_logger.info(f"Data Inserted into Spend Table")
|
||||||
|
return new_spend_row
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print_verbose(f"LiteLLM Prisma Client Exception: {e}")
|
print_verbose(f"LiteLLM Prisma Client Exception: {e}")
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
|
@ -494,7 +696,11 @@ class PrismaClient:
|
||||||
self,
|
self,
|
||||||
token: Optional[str] = None,
|
token: Optional[str] = None,
|
||||||
data: dict = {},
|
data: dict = {},
|
||||||
|
data_list: Optional[List] = None,
|
||||||
user_id: Optional[str] = None,
|
user_id: Optional[str] = None,
|
||||||
|
query_type: Literal["update", "update_many"] = "update",
|
||||||
|
table_name: Optional[Literal["user", "key", "config", "spend"]] = None,
|
||||||
|
update_key_values: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Update existing data
|
Update existing data
|
||||||
|
@ -511,17 +717,95 @@ class PrismaClient:
|
||||||
where={"token": token}, # type: ignore
|
where={"token": token}, # type: ignore
|
||||||
data={**db_data}, # type: ignore
|
data={**db_data}, # type: ignore
|
||||||
)
|
)
|
||||||
print_verbose("\033[91m" + f"DB write succeeded {response}" + "\033[0m")
|
verbose_proxy_logger.debug(
|
||||||
|
"\033[91m"
|
||||||
|
+ f"DB Token Table update succeeded {response}"
|
||||||
|
+ "\033[0m"
|
||||||
|
)
|
||||||
return {"token": token, "data": db_data}
|
return {"token": token, "data": db_data}
|
||||||
elif user_id is not None:
|
elif (
|
||||||
|
user_id is not None
|
||||||
|
or (table_name is not None and table_name == "user")
|
||||||
|
and query_type == "update"
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
If data['spend'] + data['user'], update the user table with spend info as well
|
If data['spend'] + data['user'], update the user table with spend info as well
|
||||||
"""
|
"""
|
||||||
update_user_row = await self.db.litellm_usertable.update(
|
if user_id is None:
|
||||||
|
user_id = db_data["user_id"]
|
||||||
|
if update_key_values is None:
|
||||||
|
update_key_values = db_data
|
||||||
|
update_user_row = await self.db.litellm_usertable.upsert(
|
||||||
where={"user_id": user_id}, # type: ignore
|
where={"user_id": user_id}, # type: ignore
|
||||||
data={**db_data}, # type: ignore
|
data={
|
||||||
|
"create": {**db_data}, # type: ignore
|
||||||
|
"update": {
|
||||||
|
**update_key_values # type: ignore
|
||||||
|
}, # just update user-specified values, if it already exists
|
||||||
|
},
|
||||||
|
)
|
||||||
|
verbose_proxy_logger.info(
|
||||||
|
"\033[91m"
|
||||||
|
+ f"DB User Table - update succeeded {update_user_row}"
|
||||||
|
+ "\033[0m"
|
||||||
)
|
)
|
||||||
return {"user_id": user_id, "data": db_data}
|
return {"user_id": user_id, "data": db_data}
|
||||||
|
elif (
|
||||||
|
table_name is not None
|
||||||
|
and table_name == "key"
|
||||||
|
and query_type == "update_many"
|
||||||
|
and data_list is not None
|
||||||
|
and isinstance(data_list, list)
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Batch write update queries
|
||||||
|
"""
|
||||||
|
batcher = self.db.batch_()
|
||||||
|
for idx, t in enumerate(data_list):
|
||||||
|
# check if plain text or hash
|
||||||
|
if t.token.startswith("sk-"): # type: ignore
|
||||||
|
t.token = self.hash_token(token=t.token) # type: ignore
|
||||||
|
try:
|
||||||
|
data_json = self.jsonify_object(data=t.model_dump())
|
||||||
|
except:
|
||||||
|
data_json = self.jsonify_object(data=t.dict())
|
||||||
|
batcher.litellm_verificationtoken.update(
|
||||||
|
where={"token": t.token}, # type: ignore
|
||||||
|
data={**data_json}, # type: ignore
|
||||||
|
)
|
||||||
|
await batcher.commit()
|
||||||
|
print_verbose(
|
||||||
|
"\033[91m" + f"DB Token Table update succeeded" + "\033[0m"
|
||||||
|
)
|
||||||
|
elif (
|
||||||
|
table_name is not None
|
||||||
|
and table_name == "user"
|
||||||
|
and query_type == "update_many"
|
||||||
|
and data_list is not None
|
||||||
|
and isinstance(data_list, list)
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Batch write update queries
|
||||||
|
"""
|
||||||
|
batcher = self.db.batch_()
|
||||||
|
for idx, user in enumerate(data_list):
|
||||||
|
try:
|
||||||
|
data_json = self.jsonify_object(data=user.model_dump())
|
||||||
|
except:
|
||||||
|
data_json = self.jsonify_object(data=user.dict())
|
||||||
|
batcher.litellm_usertable.upsert(
|
||||||
|
where={"user_id": user.user_id}, # type: ignore
|
||||||
|
data={
|
||||||
|
"create": {**data_json}, # type: ignore
|
||||||
|
"update": {
|
||||||
|
**data_json # type: ignore
|
||||||
|
}, # just update user-specified values, if it already exists
|
||||||
|
},
|
||||||
|
)
|
||||||
|
await batcher.commit()
|
||||||
|
verbose_proxy_logger.info(
|
||||||
|
"\033[91m" + f"DB User Table Batch update succeeded" + "\033[0m"
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self.proxy_logging_obj.failure_handler(original_exception=e)
|
self.proxy_logging_obj.failure_handler(original_exception=e)
|
||||||
|
@ -542,7 +826,13 @@ class PrismaClient:
|
||||||
Allow user to delete a key(s)
|
Allow user to delete a key(s)
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
hashed_tokens = [self.hash_token(token=token) for token in tokens]
|
hashed_tokens = []
|
||||||
|
for token in tokens:
|
||||||
|
if isinstance(token, str) and token.startswith("sk-"):
|
||||||
|
hashed_token = self.hash_token(token=token)
|
||||||
|
else:
|
||||||
|
hashed_token = token
|
||||||
|
hashed_tokens.append(hashed_token)
|
||||||
await self.db.litellm_verificationtoken.delete_many(
|
await self.db.litellm_verificationtoken.delete_many(
|
||||||
where={"token": {"in": hashed_tokens}}
|
where={"token": {"in": hashed_tokens}}
|
||||||
)
|
)
|
||||||
|
@ -750,7 +1040,8 @@ async def send_email(sender_name, sender_email, receiver_email, subject, html):
|
||||||
print_verbose(f"SMTP Connection Init")
|
print_verbose(f"SMTP Connection Init")
|
||||||
# Establish a secure connection with the SMTP server
|
# Establish a secure connection with the SMTP server
|
||||||
with smtplib.SMTP(smtp_host, smtp_port) as server:
|
with smtplib.SMTP(smtp_host, smtp_port) as server:
|
||||||
server.starttls()
|
if os.getenv("SMTP_TLS", "True") != "False":
|
||||||
|
server.starttls()
|
||||||
|
|
||||||
# Login to your email account
|
# Login to your email account
|
||||||
server.login(smtp_username, smtp_password)
|
server.login(smtp_username, smtp_password)
|
||||||
|
@ -759,4 +1050,228 @@ async def send_email(sender_name, sender_email, receiver_email, subject, html):
|
||||||
server.send_message(email_message)
|
server.send_message(email_message)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print_verbose("An error occurred while sending the email:", str(e))
|
print_verbose("An error occurred while sending the email:" + str(e))
|
||||||
|
|
||||||
|
|
||||||
|
def hash_token(token: str):
    """
    Return the SHA-256 digest of ``token`` as a lowercase hex string.

    The token is encoded with ``str.encode()`` (UTF-8 by default) before hashing.
    """
    from hashlib import sha256

    digest = sha256(token.encode())
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def get_logging_payload(kwargs, response_obj, start_time, end_time):
    """
    Build the spend-log payload dict for one completed call.

    Args:
        kwargs: call keyword arguments (``None`` is treated as ``{}``). Read for
            ``litellm_params`` (and its ``metadata``), ``call_type``,
            ``cache_hit``, ``model`` and ``user``.
        response_obj: response object; must support ``response_obj["usage"]``
            and ``response_obj.get("id", ...)``.
        start_time: stored unchanged under ``"startTime"``.
        end_time: stored unchanged under ``"endTime"``.

    Returns:
        dict with the keys ``request_id``, ``call_type``, ``api_key``,
        ``cache_hit``, ``startTime``, ``endTime``, ``model``, ``user``,
        ``metadata``, ``cache_key``, ``total_tokens``, ``prompt_tokens`` and
        ``completion_tokens``. Fields annotated ``Json`` on
        ``LiteLLM_SpendLogs`` are serialized to JSON strings; fields annotated
        ``str`` are coerced with ``str()``.

    Side effects:
        Removes ``metadata["headers"]["authorization"]`` in place when present,
        so the raw key is not carried into the payload.
    """
    from litellm.proxy._types import LiteLLM_SpendLogs
    from pydantic import Json
    import uuid

    verbose_proxy_logger.debug(
        f"SpendTable: get_logging_payload - kwargs: {kwargs}\n\n"
    )

    if kwargs is None:
        kwargs = {}
    # standardize this function to be used across, s3, dynamoDB, langfuse logging
    litellm_params = kwargs.get("litellm_params", {})
    metadata = (
        litellm_params.get("metadata", {}) or {}
    )  # if litellm_params['metadata'] == None
    call_type = kwargs.get("call_type", "litellm.completion")
    cache_hit = kwargs.get("cache_hit", False)

    usage = response_obj["usage"]
    if type(usage) == litellm.Usage:
        usage = dict(usage)

    request_id = response_obj.get("id", str(uuid.uuid4()))

    api_key = metadata.get("user_api_key", "")
    if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
        # hash the api_key
        api_key = hash_token(api_key)
    if "headers" in metadata and "authorization" in metadata["headers"]:
        metadata["headers"].pop(
            "authorization"
        )  # do not store the original `sk-..` api key in the db

    if litellm.cache is not None:
        cache_key = litellm.cache.get_cache_key(**kwargs)
    else:
        cache_key = "Cache OFF"

    if cache_hit == True:
        import time

        # SpendLogs does not allow duplicate request_id
        request_id = f"{request_id}_cache_hit{time.time()}"

    payload = {
        "request_id": request_id,
        "call_type": call_type,
        "api_key": api_key,
        "cache_hit": cache_hit,
        "startTime": start_time,
        "endTime": end_time,
        "model": kwargs.get("model", ""),
        "user": kwargs.get("user", ""),
        "metadata": metadata,
        "cache_key": cache_key,
        "total_tokens": usage.get("total_tokens", 0),
        "prompt_tokens": usage.get("prompt_tokens", 0),
        "completion_tokens": usage.get("completion_tokens", 0),
    }

    annotations = LiteLLM_SpendLogs.__annotations__
    json_fields = [
        field
        for field, field_type in annotations.items()
        if field_type == Json or field_type == Optional[Json]
    ]
    str_fields = [
        field
        for field, field_type in annotations.items()
        if field_type == str or field_type == Optional[str]
    ]

    for param in json_fields:
        if param in payload and type(payload[param]) != Json:
            # Response objects serialize themselves; everything else goes
            # through json.dumps. This is a single if/else so an already
            # serialized response is not JSON-encoded a second time.
            if type(payload[param]) in (
                litellm.ModelResponse,
                litellm.EmbeddingResponse,
            ):
                payload[param] = payload[param].model_dump_json()
            else:
                payload[param] = json.dumps(payload[param])

    for param in str_fields:
        if param in payload and type(payload[param]) != str:
            payload[param] = str(payload[param])

    return payload
|
||||||
|
|
||||||
|
|
||||||
|
def _duration_in_seconds(duration: str):
|
||||||
|
match = re.match(r"(\d+)([smhd]?)", duration)
|
||||||
|
if not match:
|
||||||
|
raise ValueError("Invalid duration format")
|
||||||
|
|
||||||
|
value, unit = match.groups()
|
||||||
|
value = int(value)
|
||||||
|
|
||||||
|
if unit == "s":
|
||||||
|
return value
|
||||||
|
elif unit == "m":
|
||||||
|
return value * 60
|
||||||
|
elif unit == "h":
|
||||||
|
return value * 3600
|
||||||
|
elif unit == "d":
|
||||||
|
return value * 86400
|
||||||
|
else:
|
||||||
|
raise ValueError("Unsupported duration unit")
|
||||||
|
|
||||||
|
|
||||||
|
async def reset_budget(prisma_client: PrismaClient):
    """
    Zero out spend for keys and users returned by the db and schedule each
    row's next reset.

    For both the "key" and "user" tables: fetch rows via
    ``prisma_client.get_data(..., query_type="find_all", reset_at=now)`` (keys
    are also queried with ``expires=now``), set ``spend`` to 0.0, set
    ``budget_reset_at`` to ``now`` plus the row's ``budget_duration``, and
    write the rows back with ``update_many``.

    Does nothing when ``prisma_client`` is None.
    """
    if prisma_client is None:
        return

    ### RESET KEY BUDGET ###
    now = datetime.utcnow()
    keys_to_reset = await prisma_client.get_data(
        table_name="key", query_type="find_all", expires=now, reset_at=now
    )

    if keys_to_reset is not None and len(keys_to_reset) > 0:
        for key_row in keys_to_reset:
            key_row.spend = 0.0
            key_row.budget_reset_at = now + timedelta(
                seconds=_duration_in_seconds(duration=key_row.budget_duration)
            )

        await prisma_client.update_data(
            query_type="update_many", data_list=keys_to_reset, table_name="key"
        )

    ### RESET USER BUDGET ###
    now = datetime.utcnow()
    users_to_reset = await prisma_client.get_data(
        table_name="user", query_type="find_all", reset_at=now
    )

    verbose_proxy_logger.debug(f"users_to_reset from get_data: {users_to_reset}")

    if users_to_reset is not None and len(users_to_reset) > 0:
        for user_row in users_to_reset:
            user_row.spend = 0.0
            user_row.budget_reset_at = now + timedelta(
                seconds=_duration_in_seconds(duration=user_row.budget_duration)
            )

        await prisma_client.update_data(
            query_type="update_many", data_list=users_to_reset, table_name="user"
        )
|
||||||
|
|
||||||
|
|
||||||
|
# LiteLLM Admin UI - HTML login form (username/password) used for non-SSO login
|
||||||
|
html_form = """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>LiteLLM Login</title>
|
||||||
|
<style>
|
||||||
|
body {
|
||||||
|
font-family: Arial, sans-serif;
|
||||||
|
background-color: #f4f4f4;
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
display: flex;
|
||||||
|
justify-content: center;
|
||||||
|
align-items: center;
|
||||||
|
height: 100vh;
|
||||||
|
}
|
||||||
|
|
||||||
|
form {
|
||||||
|
background-color: #fff;
|
||||||
|
padding: 20px;
|
||||||
|
border-radius: 8px;
|
||||||
|
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
|
||||||
|
}
|
||||||
|
|
||||||
|
label {
|
||||||
|
display: block;
|
||||||
|
margin-bottom: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
input {
|
||||||
|
width: 100%;
|
||||||
|
padding: 8px;
|
||||||
|
margin-bottom: 16px;
|
||||||
|
box-sizing: border-box;
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
border-radius: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
input[type="submit"] {
|
||||||
|
background-color: #4caf50;
|
||||||
|
color: #fff;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
input[type="submit"]:hover {
|
||||||
|
background-color: #45a049;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<form action="/login" method="post">
|
||||||
|
<h2>LiteLLM Login</h2>
|
||||||
|
<label for="username">Username:</label>
|
||||||
|
<input type="text" id="username" name="username" required>
|
||||||
|
<label for="password">Password:</label>
|
||||||
|
<input type="password" id="password" name="password" required>
|
||||||
|
<input type="submit" value="Submit">
|
||||||
|
</form>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
|
@ -94,11 +94,15 @@ class Router:
|
||||||
timeout: Optional[float] = None,
|
timeout: Optional[float] = None,
|
||||||
default_litellm_params={}, # default params for Router.chat.completion.create
|
default_litellm_params={}, # default params for Router.chat.completion.create
|
||||||
set_verbose: bool = False,
|
set_verbose: bool = False,
|
||||||
|
debug_level: Literal["DEBUG", "INFO"] = "INFO",
|
||||||
fallbacks: List = [],
|
fallbacks: List = [],
|
||||||
allowed_fails: Optional[int] = None,
|
|
||||||
context_window_fallbacks: List = [],
|
context_window_fallbacks: List = [],
|
||||||
model_group_alias: Optional[dict] = {},
|
model_group_alias: Optional[dict] = {},
|
||||||
retry_after: int = 0, # min time to wait before retrying a failed request
|
retry_after: int = 0, # min time to wait before retrying a failed request
|
||||||
|
allowed_fails: Optional[
|
||||||
|
int
|
||||||
|
] = None, # Number of times a deployment can fail before being added to cooldown
|
||||||
|
cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
|
||||||
routing_strategy: Literal[
|
routing_strategy: Literal[
|
||||||
"simple-shuffle",
|
"simple-shuffle",
|
||||||
"least-busy",
|
"least-busy",
|
||||||
|
@ -107,7 +111,42 @@ class Router:
|
||||||
] = "simple-shuffle",
|
] = "simple-shuffle",
|
||||||
routing_strategy_args: dict = {}, # just for latency-based routing
|
routing_strategy_args: dict = {}, # just for latency-based routing
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""
|
||||||
|
Initialize the Router class with the given parameters for caching, reliability, and routing strategy.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model_list (Optional[list]): List of models to be used. Defaults to None.
|
||||||
|
redis_url (Optional[str]): URL of the Redis server. Defaults to None.
|
||||||
|
redis_host (Optional[str]): Hostname of the Redis server. Defaults to None.
|
||||||
|
redis_port (Optional[int]): Port of the Redis server. Defaults to None.
|
||||||
|
redis_password (Optional[str]): Password of the Redis server. Defaults to None.
|
||||||
|
cache_responses (Optional[bool]): Flag to enable caching of responses. Defaults to False.
|
||||||
|
cache_kwargs (dict): Additional kwargs to pass to RedisCache. Defaults to {}.
|
||||||
|
caching_groups (Optional[List[tuple]]): List of model groups for caching across model groups. Defaults to None.
|
||||||
|
client_ttl (int): Time-to-live for cached clients in seconds. Defaults to 3600.
|
||||||
|
num_retries (int): Number of retries for failed requests. Defaults to 0.
|
||||||
|
timeout (Optional[float]): Timeout for requests. Defaults to None.
|
||||||
|
default_litellm_params (dict): Default parameters for Router.chat.completion.create. Defaults to {}.
|
||||||
|
set_verbose (bool): Flag to set verbose mode. Defaults to False.
|
||||||
|
debug_level (Literal["DEBUG", "INFO"]): Debug level for logging. Defaults to "INFO".
|
||||||
|
fallbacks (List): List of fallback options. Defaults to [].
|
||||||
|
context_window_fallbacks (List): List of context window fallback options. Defaults to [].
|
||||||
|
model_group_alias (Optional[dict]): Alias for model groups. Defaults to {}.
|
||||||
|
retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
|
||||||
|
allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
|
||||||
|
cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1.
|
||||||
|
routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
|
||||||
|
routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Router: An instance of the litellm.Router class.
|
||||||
|
"""
|
||||||
self.set_verbose = set_verbose
|
self.set_verbose = set_verbose
|
||||||
|
if self.set_verbose:
|
||||||
|
if debug_level == "INFO":
|
||||||
|
verbose_router_logger.setLevel(logging.INFO)
|
||||||
|
elif debug_level == "DEBUG":
|
||||||
|
verbose_router_logger.setLevel(logging.DEBUG)
|
||||||
self.deployment_names: List = (
|
self.deployment_names: List = (
|
||||||
[]
|
[]
|
||||||
) # names of models under litellm_params. ex. azure/chatgpt-v-2
|
) # names of models under litellm_params. ex. azure/chatgpt-v-2
|
||||||
|
@ -157,6 +196,7 @@ class Router:
|
||||||
self.deployment_latency_map[m["litellm_params"]["model"]] = 0
|
self.deployment_latency_map[m["litellm_params"]["model"]] = 0
|
||||||
|
|
||||||
self.allowed_fails = allowed_fails or litellm.allowed_fails
|
self.allowed_fails = allowed_fails or litellm.allowed_fails
|
||||||
|
self.cooldown_time = cooldown_time or 1
|
||||||
self.failed_calls = (
|
self.failed_calls = (
|
||||||
InMemoryCache()
|
InMemoryCache()
|
||||||
) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
|
) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
|
||||||
|
@ -249,16 +289,13 @@ class Router:
|
||||||
timeout = kwargs.get("request_timeout", self.timeout)
|
timeout = kwargs.get("request_timeout", self.timeout)
|
||||||
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
|
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
|
||||||
kwargs.setdefault("metadata", {}).update({"model_group": model})
|
kwargs.setdefault("metadata", {}).update({"model_group": model})
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
response = self.function_with_fallbacks(**kwargs)
|
||||||
# Submit the function to the executor with a timeout
|
|
||||||
future = executor.submit(self.function_with_fallbacks, **kwargs)
|
|
||||||
response = future.result(timeout=timeout) # type: ignore
|
|
||||||
|
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
def _completion(self, model: str, messages: List[Dict[str, str]], **kwargs):
|
def _completion(self, model: str, messages: List[Dict[str, str]], **kwargs):
|
||||||
|
model_name = None
|
||||||
try:
|
try:
|
||||||
# pick the one that is available (lowest TPM/RPM)
|
# pick the one that is available (lowest TPM/RPM)
|
||||||
deployment = self.get_available_deployment(
|
deployment = self.get_available_deployment(
|
||||||
|
@ -271,6 +308,7 @@ class Router:
|
||||||
)
|
)
|
||||||
data = deployment["litellm_params"].copy()
|
data = deployment["litellm_params"].copy()
|
||||||
kwargs["model_info"] = deployment.get("model_info", {})
|
kwargs["model_info"] = deployment.get("model_info", {})
|
||||||
|
model_name = data["model"]
|
||||||
for k, v in self.default_litellm_params.items():
|
for k, v in self.default_litellm_params.items():
|
||||||
if (
|
if (
|
||||||
k not in kwargs
|
k not in kwargs
|
||||||
|
@ -292,7 +330,7 @@ class Router:
|
||||||
else:
|
else:
|
||||||
model_client = potential_model_client
|
model_client = potential_model_client
|
||||||
|
|
||||||
return litellm.completion(
|
response = litellm.completion(
|
||||||
**{
|
**{
|
||||||
**data,
|
**data,
|
||||||
"messages": messages,
|
"messages": messages,
|
||||||
|
@ -301,7 +339,14 @@ class Router:
|
||||||
**kwargs,
|
**kwargs,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"litellm.completion(model={model_name})\033[32m 200 OK\033[0m"
|
||||||
|
)
|
||||||
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"litellm.completion(model={model_name})\033[31m Exception {str(e)}\033[0m"
|
||||||
|
)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
async def acompletion(self, model: str, messages: List[Dict[str, str]], **kwargs):
|
async def acompletion(self, model: str, messages: List[Dict[str, str]], **kwargs):
|
||||||
|
@ -830,6 +875,9 @@ class Router:
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
kwargs["model"] = mg
|
kwargs["model"] = mg
|
||||||
|
kwargs.setdefault("metadata", {}).update(
|
||||||
|
{"model_group": mg}
|
||||||
|
) # update model_group used, if fallbacks are done
|
||||||
response = await self.async_function_with_retries(
|
response = await self.async_function_with_retries(
|
||||||
*args, **kwargs
|
*args, **kwargs
|
||||||
)
|
)
|
||||||
|
@ -858,8 +906,10 @@ class Router:
|
||||||
f"Falling back to model_group = {mg}"
|
f"Falling back to model_group = {mg}"
|
||||||
)
|
)
|
||||||
kwargs["model"] = mg
|
kwargs["model"] = mg
|
||||||
kwargs["metadata"]["model_group"] = mg
|
kwargs.setdefault("metadata", {}).update(
|
||||||
response = await self.async_function_with_retries(
|
{"model_group": mg}
|
||||||
|
) # update model_group used, if fallbacks are done
|
||||||
|
response = await self.async_function_with_fallbacks(
|
||||||
*args, **kwargs
|
*args, **kwargs
|
||||||
)
|
)
|
||||||
return response
|
return response
|
||||||
|
@ -1024,6 +1074,9 @@ class Router:
|
||||||
## LOGGING
|
## LOGGING
|
||||||
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
||||||
kwargs["model"] = mg
|
kwargs["model"] = mg
|
||||||
|
kwargs.setdefault("metadata", {}).update(
|
||||||
|
{"model_group": mg}
|
||||||
|
) # update model_group used, if fallbacks are done
|
||||||
response = self.function_with_fallbacks(*args, **kwargs)
|
response = self.function_with_fallbacks(*args, **kwargs)
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -1047,6 +1100,9 @@ class Router:
|
||||||
## LOGGING
|
## LOGGING
|
||||||
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
||||||
kwargs["model"] = mg
|
kwargs["model"] = mg
|
||||||
|
kwargs.setdefault("metadata", {}).update(
|
||||||
|
{"model_group": mg}
|
||||||
|
) # update model_group used, if fallbacks are done
|
||||||
response = self.function_with_fallbacks(*args, **kwargs)
|
response = self.function_with_fallbacks(*args, **kwargs)
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -1232,6 +1288,7 @@ class Router:
|
||||||
verbose_router_logger.debug(
|
verbose_router_logger.debug(
|
||||||
f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
|
f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
|
||||||
)
|
)
|
||||||
|
cooldown_time = self.cooldown_time or 1
|
||||||
if updated_fails > self.allowed_fails:
|
if updated_fails > self.allowed_fails:
|
||||||
# get the current cooldown list for that minute
|
# get the current cooldown list for that minute
|
||||||
cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
|
cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
|
||||||
|
@ -1245,13 +1302,19 @@ class Router:
|
||||||
else:
|
else:
|
||||||
cached_value = cached_value + [deployment]
|
cached_value = cached_value + [deployment]
|
||||||
# save updated value
|
# save updated value
|
||||||
self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1)
|
self.cache.set_cache(
|
||||||
|
value=cached_value, key=cooldown_key, ttl=cooldown_time
|
||||||
|
)
|
||||||
except:
|
except:
|
||||||
cached_value = [deployment]
|
cached_value = [deployment]
|
||||||
# save updated value
|
# save updated value
|
||||||
self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1)
|
self.cache.set_cache(
|
||||||
|
value=cached_value, key=cooldown_key, ttl=cooldown_time
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
self.failed_calls.set_cache(key=deployment, value=updated_fails, ttl=1)
|
self.failed_calls.set_cache(
|
||||||
|
key=deployment, value=updated_fails, ttl=cooldown_time
|
||||||
|
)
|
||||||
|
|
||||||
def _get_cooldown_deployments(self):
|
def _get_cooldown_deployments(self):
|
||||||
"""
|
"""
|
||||||
|
@ -1344,6 +1407,7 @@ class Router:
|
||||||
max_retries = litellm.get_secret(max_retries_env_name)
|
max_retries = litellm.get_secret(max_retries_env_name)
|
||||||
litellm_params["max_retries"] = max_retries
|
litellm_params["max_retries"] = max_retries
|
||||||
|
|
||||||
|
|
||||||
# proxy support
|
# proxy support
|
||||||
import os
|
import os
|
||||||
import httpx
|
import httpx
|
||||||
|
@ -1369,6 +1433,12 @@ class Router:
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
organization = litellm_params.get("organization", None)
|
||||||
|
if isinstance(organization, str) and organization.startswith("os.environ/"):
|
||||||
|
organization_env_name = organization.replace("os.environ/", "")
|
||||||
|
organization = litellm.get_secret(organization_env_name)
|
||||||
|
litellm_params["organization"] = organization
|
||||||
|
|
||||||
if "azure" in model_name:
|
if "azure" in model_name:
|
||||||
if api_base is None:
|
if api_base is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -1576,6 +1646,7 @@ class Router:
|
||||||
base_url=api_base,
|
base_url=api_base,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
|
organization=organization,
|
||||||
http_client=httpx.AsyncClient(
|
http_client=httpx.AsyncClient(
|
||||||
transport=AsyncCustomHTTPTransport(),
|
transport=AsyncCustomHTTPTransport(),
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
|
@ -1597,6 +1668,7 @@ class Router:
|
||||||
base_url=api_base,
|
base_url=api_base,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
|
organization=organization,
|
||||||
http_client=httpx.Client(
|
http_client=httpx.Client(
|
||||||
transport=CustomHTTPTransport(),
|
transport=CustomHTTPTransport(),
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
|
@ -1619,6 +1691,7 @@ class Router:
|
||||||
base_url=api_base,
|
base_url=api_base,
|
||||||
timeout=stream_timeout,
|
timeout=stream_timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
|
organization=organization,
|
||||||
http_client=httpx.AsyncClient(
|
http_client=httpx.AsyncClient(
|
||||||
transport=AsyncCustomHTTPTransport(),
|
transport=AsyncCustomHTTPTransport(),
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
|
@ -1641,6 +1714,7 @@ class Router:
|
||||||
base_url=api_base,
|
base_url=api_base,
|
||||||
timeout=stream_timeout,
|
timeout=stream_timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
|
organization=organization,
|
||||||
http_client=httpx.Client(
|
http_client=httpx.Client(
|
||||||
transport=CustomHTTPTransport(),
|
transport=CustomHTTPTransport(),
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
|
@ -1865,6 +1939,9 @@ class Router:
|
||||||
selected_index = random.choices(range(len(rpms)), weights=weights)[0]
|
selected_index = random.choices(range(len(rpms)), weights=weights)[0]
|
||||||
verbose_router_logger.debug(f"\n selected index, {selected_index}")
|
verbose_router_logger.debug(f"\n selected index, {selected_index}")
|
||||||
deployment = healthy_deployments[selected_index]
|
deployment = healthy_deployments[selected_index]
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"get_available_deployment for model: {model}, Selected deployment: {deployment or deployment[0]} for model: {model}"
|
||||||
|
)
|
||||||
return deployment or deployment[0]
|
return deployment or deployment[0]
|
||||||
############## Check if we can do a RPM/TPM based weighted pick #################
|
############## Check if we can do a RPM/TPM based weighted pick #################
|
||||||
tpm = healthy_deployments[0].get("litellm_params").get("tpm", None)
|
tpm = healthy_deployments[0].get("litellm_params").get("tpm", None)
|
||||||
|
@ -1879,6 +1956,9 @@ class Router:
|
||||||
selected_index = random.choices(range(len(tpms)), weights=weights)[0]
|
selected_index = random.choices(range(len(tpms)), weights=weights)[0]
|
||||||
verbose_router_logger.debug(f"\n selected index, {selected_index}")
|
verbose_router_logger.debug(f"\n selected index, {selected_index}")
|
||||||
deployment = healthy_deployments[selected_index]
|
deployment = healthy_deployments[selected_index]
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"get_available_deployment for model: {model}, Selected deployment: {deployment or deployment[0]} for model: {model}"
|
||||||
|
)
|
||||||
return deployment or deployment[0]
|
return deployment or deployment[0]
|
||||||
|
|
||||||
############## No RPM/TPM passed, we do a random pick #################
|
############## No RPM/TPM passed, we do a random pick #################
|
||||||
|
@ -1903,8 +1983,13 @@ class Router:
|
||||||
)
|
)
|
||||||
|
|
||||||
if deployment is None:
|
if deployment is None:
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"get_available_deployment for model: {model}, No deployment available"
|
||||||
|
)
|
||||||
raise ValueError("No models available.")
|
raise ValueError("No models available.")
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"get_available_deployment for model: {model}, Selected deployment: {deployment} for model: {model}"
|
||||||
|
)
|
||||||
return deployment
|
return deployment
|
||||||
|
|
||||||
def flush_cache(self):
|
def flush_cache(self):
|
||||||
|
|
|
@ -10,6 +10,7 @@ import traceback
|
||||||
from litellm import token_counter
|
from litellm import token_counter
|
||||||
from litellm.caching import DualCache
|
from litellm.caching import DualCache
|
||||||
from litellm.integrations.custom_logger import CustomLogger
|
from litellm.integrations.custom_logger import CustomLogger
|
||||||
|
from litellm._logging import verbose_router_logger
|
||||||
|
|
||||||
|
|
||||||
class LowestTPMLoggingHandler(CustomLogger):
|
class LowestTPMLoggingHandler(CustomLogger):
|
||||||
|
@ -130,6 +131,9 @@ class LowestTPMLoggingHandler(CustomLogger):
|
||||||
Returns a deployment with the lowest TPM/RPM usage.
|
Returns a deployment with the lowest TPM/RPM usage.
|
||||||
"""
|
"""
|
||||||
# get list of potential deployments
|
# get list of potential deployments
|
||||||
|
verbose_router_logger.debug(
|
||||||
|
f"get_available_deployments - Usage Based. model_group: {model_group}, healthy_deployments: {healthy_deployments}"
|
||||||
|
)
|
||||||
current_minute = datetime.now().strftime("%H-%M")
|
current_minute = datetime.now().strftime("%H-%M")
|
||||||
tpm_key = f"{model_group}:tpm:{current_minute}"
|
tpm_key = f"{model_group}:tpm:{current_minute}"
|
||||||
rpm_key = f"{model_group}:rpm:{current_minute}"
|
rpm_key = f"{model_group}:rpm:{current_minute}"
|
||||||
|
@ -137,14 +141,31 @@ class LowestTPMLoggingHandler(CustomLogger):
|
||||||
tpm_dict = self.router_cache.get_cache(key=tpm_key)
|
tpm_dict = self.router_cache.get_cache(key=tpm_key)
|
||||||
rpm_dict = self.router_cache.get_cache(key=rpm_key)
|
rpm_dict = self.router_cache.get_cache(key=rpm_key)
|
||||||
|
|
||||||
|
verbose_router_logger.debug(
|
||||||
|
f"tpm_key={tpm_key}, tpm_dict: {tpm_dict}, rpm_dict: {rpm_dict}"
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
input_tokens = token_counter(messages=messages, text=input)
|
||||||
|
except:
|
||||||
|
input_tokens = 0
|
||||||
# -----------------------
|
# -----------------------
|
||||||
# Find lowest used model
|
# Find lowest used model
|
||||||
# ----------------------
|
# ----------------------
|
||||||
lowest_tpm = float("inf")
|
lowest_tpm = float("inf")
|
||||||
deployment = None
|
deployment = None
|
||||||
if tpm_dict is None: # base case
|
if tpm_dict is None: # base case - none of the deployments have been used
|
||||||
item = random.choice(healthy_deployments)
|
# Return the 1st deployment where deployment["tpm"] >= input_tokens
|
||||||
return item
|
for deployment in healthy_deployments:
|
||||||
|
_deployment_tpm = (
|
||||||
|
deployment.get("tpm", None)
|
||||||
|
or deployment.get("litellm_params", {}).get("tpm", None)
|
||||||
|
or deployment.get("model_info", {}).get("tpm", None)
|
||||||
|
or float("inf")
|
||||||
|
)
|
||||||
|
|
||||||
|
if _deployment_tpm >= input_tokens:
|
||||||
|
return deployment
|
||||||
|
return None
|
||||||
|
|
||||||
all_deployments = tpm_dict
|
all_deployments = tpm_dict
|
||||||
for d in healthy_deployments:
|
for d in healthy_deployments:
|
||||||
|
@ -152,11 +173,6 @@ class LowestTPMLoggingHandler(CustomLogger):
|
||||||
if d["model_info"]["id"] not in all_deployments:
|
if d["model_info"]["id"] not in all_deployments:
|
||||||
all_deployments[d["model_info"]["id"]] = 0
|
all_deployments[d["model_info"]["id"]] = 0
|
||||||
|
|
||||||
try:
|
|
||||||
input_tokens = token_counter(messages=messages, text=input)
|
|
||||||
except:
|
|
||||||
input_tokens = 0
|
|
||||||
|
|
||||||
for item, item_tpm in all_deployments.items():
|
for item, item_tpm in all_deployments.items():
|
||||||
## get the item from model list
|
## get the item from model list
|
||||||
_deployment = None
|
_deployment = None
|
||||||
|
|
|
@ -1,57 +0,0 @@
|
||||||
Starting new HTTPS connection (1): api.anthropic.com:443
|
|
||||||
Starting new HTTPS connection (1): litellm-logging.onrender.com:443
|
|
||||||
https://litellm-logging.onrender.com:443 "POST /logging HTTP/1.1" 200 38
|
|
||||||
https://api.anthropic.com:443 "POST /v1/complete HTTP/1.1" 200 None
|
|
||||||
Starting new HTTPS connection (1): litellm-logging.onrender.com:443
|
|
||||||
Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'user', 'content': 'this is a streaming test for llama2 + langfuse'}], 'model': 'gpt-3.5-turbo', 'max_tokens': 20, 'stream': True, 'temperature': 0.2}}
|
|
||||||
connect_tcp.started host='api.openai.com' port=443 local_address=None timeout=600.0 socket_options=None
|
|
||||||
connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x1090f92d0>
|
|
||||||
start_tls.started ssl_context=<ssl.SSLContext object at 0x108ddf020> server_hostname='api.openai.com' timeout=600.0
|
|
||||||
start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x1090f9290>
|
|
||||||
send_request_headers.started request=<Request [b'POST']>
|
|
||||||
send_request_headers.complete
|
|
||||||
send_request_body.started request=<Request [b'POST']>
|
|
||||||
send_request_body.complete
|
|
||||||
receive_response_headers.started request=<Request [b'POST']>
|
|
||||||
https://litellm-logging.onrender.com:443 "POST /logging HTTP/1.1" 200 38
|
|
||||||
receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 23 Dec 2023 06:33:00 GMT'), (b'Content-Type', b'text/event-stream'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-allow-origin', b'*'), (b'Cache-Control', b'no-cache, must-revalidate'), (b'openai-model', b'gpt-3.5-turbo-0613'), (b'openai-organization', b'reliablekeystest'), (b'openai-processing-ms', b'62'), (b'openai-version', b'2020-10-01'), (b'strict-transport-security', b'max-age=15724800; includeSubDomains'), (b'x-ratelimit-limit-requests', b'9000'), (b'x-ratelimit-limit-tokens', b'1000000'), (b'x-ratelimit-limit-tokens_usage_based', b'1000000'), (b'x-ratelimit-remaining-requests', b'8998'), (b'x-ratelimit-remaining-tokens', b'999967'), (b'x-ratelimit-remaining-tokens_usage_based', b'999967'), (b'x-ratelimit-reset-requests', b'6ms'), (b'x-ratelimit-reset-tokens', b'1ms'), (b'x-ratelimit-reset-tokens_usage_based', b'1ms'), (b'x-request-id', b'dd1029a85edecb986fb662945c9f7b4f'), (b'CF-Cache-Status', b'DYNAMIC'), (b'Set-Cookie', b'__cf_bm=dnuSnc6BPNJd4lgWKpv3iE2P5zy4r5aCVekXVi7HG7U-1703313180-1-AbeMpAfvmJ6BShULb7tMaErR5ergUrt6ohiXj1e8zoo9AotZ0Jz0alUSUcp8FXyQX2VQ9P6gBUeoSR9aE98OasU=; path=/; expires=Sat, 23-Dec-23 07:03:00 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Set-Cookie', b'_cfuvid=dET0GKSNfbtSWNJuXndP8GY8M0ANzDK4Dl7mvIfhmM0-1703313180257-0-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Server', b'cloudflare'), (b'CF-RAY', b'839e920e4f47f4b0-BOM'), (b'alt-svc', b'h3=":443"; ma=86400')])
|
|
||||||
HTTP Request: POST https://api.openai.com/v1/chat/completions "200 OK"
|
|
||||||
receive_response_body.started request=<Request [b'POST']>
|
|
||||||
receive_response_body.complete
|
|
||||||
response_closed.started
|
|
||||||
response_closed.complete
|
|
||||||
Starting new HTTPS connection (1): litellm-logging.onrender.com:443
|
|
||||||
Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'user', 'content': "What's the weather like in San Francisco, Tokyo, and Paris?"}], 'model': 'gpt-3.5-turbo-1106', 'tool_choice': 'auto', 'tools': [{'type': 'function', 'function': {'name': 'get_current_weather', 'description': 'Get the current weather in a given location', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'}, 'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location']}}}]}}
|
|
||||||
connect_tcp.started host='api.openai.com' port=443 local_address=None timeout=600.0 socket_options=None
|
|
||||||
connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x10972d410>
|
|
||||||
start_tls.started ssl_context=<ssl.SSLContext object at 0x1090c5be0> server_hostname='api.openai.com' timeout=600.0
|
|
||||||
start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x1097547d0>
|
|
||||||
send_request_headers.started request=<Request [b'POST']>
|
|
||||||
send_request_headers.complete
|
|
||||||
send_request_body.started request=<Request [b'POST']>
|
|
||||||
send_request_body.complete
|
|
||||||
receive_response_headers.started request=<Request [b'POST']>
|
|
||||||
https://litellm-logging.onrender.com:443 "POST /logging HTTP/1.1" 200 38
|
|
||||||
receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 23 Dec 2023 06:33:03 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-allow-origin', b'*'), (b'Cache-Control', b'no-cache, must-revalidate'), (b'openai-model', b'gpt-3.5-turbo-1106'), (b'openai-organization', b'reliablekeystest'), (b'openai-processing-ms', b'2145'), (b'openai-version', b'2020-10-01'), (b'strict-transport-security', b'max-age=15724800; includeSubDomains'), (b'x-ratelimit-limit-requests', b'9000'), (b'x-ratelimit-limit-tokens', b'1000000'), (b'x-ratelimit-limit-tokens_usage_based', b'1000000'), (b'x-ratelimit-remaining-requests', b'8998'), (b'x-ratelimit-remaining-tokens', b'999968'), (b'x-ratelimit-remaining-tokens_usage_based', b'999968'), (b'x-ratelimit-reset-requests', b'6ms'), (b'x-ratelimit-reset-tokens', b'1ms'), (b'x-ratelimit-reset-tokens_usage_based', b'1ms'), (b'x-request-id', b'd0fd54d3a7696ee677f3690e9e0d6d04'), (b'CF-Cache-Status', b'DYNAMIC'), (b'Set-Cookie', b'__cf_bm=P_4fUmw4vvrbGKTlavf9VWuuzzro87gvhLE0DEGKA84-1703313183-1-ARgz+AQXAzH1uTTK8iyPE3QnT8TovAP61UvYsFD+d5DWM0lFi5U2+eSgPH+Pqt+Y1fNH1FWBUn9DmVceJKvyLcU=; path=/; expires=Sat, 23-Dec-23 07:03:03 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Set-Cookie', b'_cfuvid=g.nvBthte.6BJ7KHg5tihyGwupeGfMNMGnw72QUUBQc-1703313183034-0-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Server', b'cloudflare'), (b'CF-RAY', b'839e92128b7ff2e2-BOM'), (b'Content-Encoding', b'gzip'), (b'alt-svc', b'h3=":443"; ma=86400')])
|
|
||||||
receive_response_body.started request=<Request [b'POST']>
|
|
||||||
receive_response_body.complete
|
|
||||||
response_closed.started
|
|
||||||
response_closed.complete
|
|
||||||
HTTP Request: POST https://api.openai.com/v1/chat/completions "200 OK"
|
|
||||||
nction': {'name': 'get_current_weather', 'description': 'Get the current weather in a given location', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'}, 'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location']}}}]}}
|
|
||||||
connect_tcp.started host='api.openai.com' port=443 local_address=None timeout=600.0 socket_options=None
|
|
||||||
connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x10972d410>
|
|
||||||
start_tls.started ssl_context=<ssl.SSLContext object at 0x1090c5be0> server_hostname='api.openai.com' timeout=600.0
|
|
||||||
start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x1097547d0>
|
|
||||||
send_request_headers.started request=<Request [b'POST']>
|
|
||||||
send_request_headers.complete
|
|
||||||
send_request_body.started request=<Request [b'POST']>
|
|
||||||
send_request_body.complete
|
|
||||||
receive_response_headers.started request=<Request [b'POST']>
|
|
||||||
https://litellm-logging.onrender.com:443 "POST /logging HTTP/1.1" 200 38
|
|
||||||
receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 23 Dec 2023 06:33:03 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-allow-origin', b'*'), (b'Cache-Control', b'no-cache, must-revalidate'), (b'openai-model', b'gpt-3.5-turbo-1106'), (b'openai-organization', b'reliablekeystest'), (b'openai-processing-ms', b'2145'), (b'openai-version', b'2020-10-01'), (b'strict-transport-security', b'max-age=15724800; includeSubDomains'), (b'x-ratelimit-limit-requests', b'9000'), (b'x-ratelimit-limit-tokens', b'1000000'), (b'x-ratelimit-limit-tokens_usage_based', b'1000000'), (b'x-ratelimit-remaining-requests', b'8998'), (b'x-ratelimit-remaining-tokens', b'999968'), (b'x-ratelimit-remaining-tokens_usage_based', b'999968'), (b'x-ratelimit-reset-requests', b'6ms'), (b'x-ratelimit-reset-tokens', b'1ms'), (b'x-ratelimit-reset-tokens_usage_based', b'1ms'), (b'x-request-id', b'd0fd54d3a7696ee677f3690e9e0d6d04'), (b'CF-Cache-Status', b'DYNAMIC'), (b'Set-Cookie', b'__cf_bm=P_4fUmw4vvrbGKTlavf9VWuuzzro87gvhLE0DEGKA84-1703313183-1-ARgz+AQXAzH1uTTK8iyPE3QnT8TovAP61UvYsFD+d5DWM0lFi5U2+eSgPH+Pqt+Y1fNH1FWBUn9DmVceJKvyLcU=; path=/; expires=Sat, 23-Dec-23 07:03:03 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Set-Cookie', b'_cfuvid=g.nvBthte.6BJ7KHg5tihyGwupeGfMNMGnw72QUUBQc-1703313183034-0-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None'), (b'Server', b'cloudflare'), (b'CF-RAY', b'839e92128b7ff2e2-BOM'), (b'Content-Encoding', b'gzip'), (b'alt-svc', b'h3=":443"; ma=86400')])
|
|
||||||
receive_response_body.started request=<Request [b'POST']>
|
|
||||||
receive_response_body.complete
|
|
||||||
response_closed.started
|
|
||||||
response_closed.complete
|
|
||||||
HTTP Request: POST https://api.openai.com/v1/chat/completions "200 OK"
|
|