Merge branch 'main' into main

commit 7c38f992dc: 160 changed files with 7414 additions and 1644 deletions
.circleci/config.yml

@@ -45,6 +45,8 @@ jobs:
           pip install "asyncio==3.4.3"
           pip install "apscheduler==3.10.4"
           pip install "PyGithub==1.59.1"
+          pip install argon2-cffi
+          pip install python-multipart
       - save_cache:
           paths:
             - ./venv
@@ -88,6 +90,32 @@ jobs:
       - store_test_results:
           path: test-results
+
+  installing_litellm_on_python:
+    docker:
+      - image: circleci/python:3.8
+    working_directory: ~/project
+
+    steps:
+      - checkout
+      - run:
+          name: Install Dependencies
+          command: |
+            python -m pip install --upgrade pip
+            pip install python-dotenv
+            pip install pytest
+            pip install tiktoken
+            pip install aiohttp
+            pip install click
+            pip install jinja2
+            pip install tokenizers
+            pip install openai
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest -vv litellm/tests/test_python_38.py
+
   build_and_test:
     machine:
       image: ubuntu-2204:2023.10.1
@@ -276,6 +304,12 @@ workflows:
             only:
               - main
               - /litellm_.*/
+      - installing_litellm_on_python:
+          filters:
+            branches:
+              only:
+                - main
+                - /litellm_.*/
       - publish_to_pypi:
           requires:
             - local_testing
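The new `installing_litellm_on_python` job simply installs the listed packages and runs one pytest file. For reference, a minimal local sketch of the same smoke test (assuming the packages listed in the job are already installed):

```python
# Minimal local sketch of the new CI smoke test (not part of the commit).
# Mirrors `python -m pytest -vv litellm/tests/test_python_38.py` from the job above.
import pytest

raise SystemExit(pytest.main(["-vv", "litellm/tests/test_python_38.py"]))
```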
.github/workflows/ghcr_deploy.yml (24 lines changed, vendored)

@@ -146,9 +146,29 @@ jobs:
             } catch (error) {
               core.setFailed(error.message);
             }
+      - name: Fetch Release Notes
+        id: release-notes
+        uses: actions/github-script@v6
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          script: |
+            try {
+              const response = await github.rest.repos.getRelease({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                release_id: process.env.RELEASE_ID,
+              });
+              return response.data.body;
+            } catch (error) {
+              core.setFailed(error.message);
+            }
+        env:
+          RELEASE_ID: ${{ env.RELEASE_ID }}
       - name: Github Releases To Discord
         env:
           WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }}
+          REALEASE_TAG: ${{ env.RELEASE_TAG }}
+          RELEASE_NOTES: ${{ steps.release-notes.outputs.result }}
         run: |
           curl -H "Content-Type: application/json" -X POST -d '{
             "content": "||@everyone||",
@@ -156,8 +176,8 @@ jobs:
             "avatar_url": "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png",
             "embeds": [
               {
-                "title": "Changelog",
-                "description": "This is the changelog for the latest release.",
+                "title": "Changelog for ${RELEASE_TAG}",
+                "description": "${RELEASE_NOTES}",
                 "color": 2105893
               }
             ]
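The new `Fetch Release Notes` step uses `actions/github-script` to pull the release body that the Discord embed now references. For reference, a rough Python sketch of the same REST call (not part of the commit; assumes the `requests` package and the `GITHUB_REPOSITORY`, `GITHUB_TOKEN`, and `RELEASE_ID` environment variables the workflow provides):

```python
# Sketch: fetch a release body via the GitHub REST API, as the workflow step does.
import os
import requests

resp = requests.get(
    f"https://api.github.com/repos/{os.environ['GITHUB_REPOSITORY']}/releases/{os.environ['RELEASE_ID']}",
    headers={
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
        "Accept": "application/vnd.github+json",
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["body"])  # the release notes body reused in the Discord embed
```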
.gitignore (1 line changed, vendored)

@@ -44,3 +44,4 @@ deploy/charts/litellm/*.tgz
 deploy/charts/litellm/charts/*
 deploy/charts/*.tgz
 litellm/proxy/vertex_key.json
+**/.vim/
@@ -61,4 +61,7 @@ RUN chmod +x entrypoint.sh
 EXPOSE 4000/tcp

 ENTRYPOINT ["litellm"]
-CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug", "--run_gunicorn"]
+
+# Append "--detailed_debug" to the end of CMD to view detailed debug logs
+# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--detailed_debug"]
+CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn"]
@@ -65,4 +65,7 @@ EXPOSE 4000/tcp
 # # Set your entrypoint and command

 ENTRYPOINT ["litellm"]
+
+# Append "--detailed_debug" to the end of CMD to view detailed debug logs
+# CMD ["--port", "4000","--run_gunicorn", "--detailed_debug"]
 CMD ["--port", "4000", "--run_gunicorn"]
README.md (19 lines changed)

@@ -148,14 +148,14 @@ pip install 'litellm[proxy]'
 ```shell
 $ litellm --model huggingface/bigcode/starcoder

-#INFO: Proxy running on http://0.0.0.0:8000
+#INFO: Proxy running on http://0.0.0.0:4000
 ```

 ### Step 2: Make ChatCompletions Request to Proxy

 ```python
 import openai # openai v1.0.0+
-client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url
+client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url
 # request sent to model set on litellm proxy, `litellm --model`
 response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
 {
@@ -178,7 +178,7 @@ Set budgets and rate limits across multiple projects
 ### Request

 ```shell
-curl 'http://0.0.0.0:8000/key/generate' \
+curl 'http://0.0.0.0:4000/key/generate' \
 --header 'Authorization: Bearer sk-1234' \
 --header 'Content-Type: application/json' \
 --data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}'
@@ -259,6 +259,19 @@ Step 4: Submit a PR with your changes! 🚀
 - push your fork to your GitHub repo
 - submit a PR from there

+# Enterprise
+For companies that need better security, user management and professional support
+
+[Talk to founders](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
+
+This covers:
+- ✅ **Features under the [LiteLLM Commercial License](https://docs.litellm.ai/docs/proxy/enterprise):**
+- ✅ **Feature Prioritization**
+- ✅ **Custom Integrations**
+- ✅ **Professional Support - Dedicated discord + slack**
+- ✅ **Custom SLAs**
+- ✅ **Secure access with Single Sign-On**
+
 # Support / talk with founders

 - [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
@@ -0,0 +1,70 @@
+from fastapi import FastAPI
+import uvicorn
+from memory_profiler import profile, memory_usage
+import os
+import traceback
+import asyncio
+import pytest
+import litellm
+from litellm import Router
+from concurrent.futures import ThreadPoolExecutor
+from collections import defaultdict
+from dotenv import load_dotenv
+import uuid
+
+load_dotenv()
+
+model_list = [
+    {
+        "model_name": "gpt-3.5-turbo",
+        "litellm_params": {
+            "model": "azure/chatgpt-v-2",
+            "api_key": os.getenv("AZURE_API_KEY"),
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE"),
+        },
+        "tpm": 240000,
+        "rpm": 1800,
+    },
+    {
+        "model_name": "text-embedding-ada-002",
+        "litellm_params": {
+            "model": "azure/azure-embedding-model",
+            "api_key": os.getenv("AZURE_API_KEY"),
+            "api_base": os.getenv("AZURE_API_BASE"),
+        },
+        "tpm": 100000,
+        "rpm": 10000,
+    },
+]
+
+litellm.set_verbose = True
+litellm.cache = litellm.Cache(
+    type="s3", s3_bucket_name="litellm-my-test-bucket-2", s3_region_name="us-east-1"
+)
+router = Router(model_list=model_list, set_verbose=True)
+
+app = FastAPI()
+
+
+@app.get("/")
+async def read_root():
+    return {"message": "Welcome to the FastAPI endpoint!"}
+
+
+@profile
+@app.post("/router_acompletion")
+async def router_acompletion():
+    question = f"This is a test: {uuid.uuid4()}" * 100
+    resp = await router.aembedding(model="text-embedding-ada-002", input=question)
+    print("embedding-resp", resp)
+
+    response = await router.acompletion(
+        model="gpt-3.5-turbo", messages=[{"role": "user", "content": question}]
+    )
+    print("completion-resp", response)
+    return response
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
@@ -0,0 +1,92 @@
+#### What this tests ####
+
+from memory_profiler import profile, memory_usage
+import sys, os, time
+import traceback, asyncio
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import litellm
+from litellm import Router
+from concurrent.futures import ThreadPoolExecutor
+from collections import defaultdict
+from dotenv import load_dotenv
+import uuid
+
+load_dotenv()
+
+
+model_list = [
+    {
+        "model_name": "gpt-3.5-turbo",  # openai model name
+        "litellm_params": {  # params for litellm completion/embedding call
+            "model": "azure/chatgpt-v-2",
+            "api_key": os.getenv("AZURE_API_KEY"),
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE"),
+        },
+        "tpm": 240000,
+        "rpm": 1800,
+    },
+    {
+        "model_name": "text-embedding-ada-002",
+        "litellm_params": {
+            "model": "azure/azure-embedding-model",
+            "api_key": os.environ["AZURE_API_KEY"],
+            "api_base": os.environ["AZURE_API_BASE"],
+        },
+        "tpm": 100000,
+        "rpm": 10000,
+    },
+]
+litellm.set_verbose = True
+litellm.cache = litellm.Cache(
+    type="s3", s3_bucket_name="litellm-my-test-bucket-2", s3_region_name="us-east-1"
+)
+router = Router(
+    model_list=model_list,
+    set_verbose=True,
+)  # type: ignore
+
+
+@profile
+async def router_acompletion():
+    # embedding call
+    question = f"This is a test: {uuid.uuid4()}" * 100
+    resp = await router.aembedding(model="text-embedding-ada-002", input=question)
+    print("embedding-resp", resp)
+
+    response = await router.acompletion(
+        model="gpt-3.5-turbo", messages=[{"role": "user", "content": question}]
+    )
+    print("completion-resp", response)
+    return response
+
+
+async def main():
+    for i in range(1):
+        start = time.time()
+        n = 50  # Number of concurrent tasks
+        tasks = [router_acompletion() for _ in range(n)]
+
+        chat_completions = await asyncio.gather(*tasks)
+
+        successful_completions = [c for c in chat_completions if c is not None]
+
+        # Write errors to error_log.txt
+        with open("error_log.txt", "a") as error_log:
+            for completion in chat_completions:
+                if isinstance(completion, str):
+                    error_log.write(completion + "\n")
+
+        print(n, time.time() - start, len(successful_completions))
+        time.sleep(10)
+
+
+if __name__ == "__main__":
+    # Blank out contents of error_log.txt
+    open("error_log.txt", "w").close()
+
+    asyncio.run(main())
@@ -0,0 +1,92 @@
+#### What this tests ####
+
+from memory_profiler import profile, memory_usage
+import sys, os, time
+import traceback, asyncio
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import litellm
+from litellm import Router
+from concurrent.futures import ThreadPoolExecutor
+from collections import defaultdict
+from dotenv import load_dotenv
+import uuid
+
+load_dotenv()
+
+
+model_list = [
+    {
+        "model_name": "gpt-3.5-turbo",  # openai model name
+        "litellm_params": {  # params for litellm completion/embedding call
+            "model": "azure/chatgpt-v-2",
+            "api_key": os.getenv("AZURE_API_KEY"),
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE"),
+        },
+        "tpm": 240000,
+        "rpm": 1800,
+    },
+    {
+        "model_name": "text-embedding-ada-002",
+        "litellm_params": {
+            "model": "azure/azure-embedding-model",
+            "api_key": os.environ["AZURE_API_KEY"],
+            "api_base": os.environ["AZURE_API_BASE"],
+        },
+        "tpm": 100000,
+        "rpm": 10000,
+    },
+]
+litellm.set_verbose = True
+litellm.cache = litellm.Cache(
+    type="s3", s3_bucket_name="litellm-my-test-bucket-2", s3_region_name="us-east-1"
+)
+router = Router(
+    model_list=model_list,
+    set_verbose=True,
+)  # type: ignore
+
+
+@profile
+async def router_acompletion():
+    # embedding call
+    question = f"This is a test: {uuid.uuid4()}" * 100
+    resp = await router.aembedding(model="text-embedding-ada-002", input=question)
+    print("embedding-resp", resp)
+
+    response = await router.acompletion(
+        model="gpt-3.5-turbo", messages=[{"role": "user", "content": question}]
+    )
+    print("completion-resp", response)
+    return response
+
+
+async def main():
+    for i in range(1):
+        start = time.time()
+        n = 50  # Number of concurrent tasks
+        tasks = [router_acompletion() for _ in range(n)]
+
+        chat_completions = await asyncio.gather(*tasks)
+
+        successful_completions = [c for c in chat_completions if c is not None]
+
+        # Write errors to error_log.txt
+        with open("error_log.txt", "a") as error_log:
+            for completion in chat_completions:
+                if isinstance(completion, str):
+                    error_log.write(completion + "\n")
+
+        print(n, time.time() - start, len(successful_completions))
+        time.sleep(10)
+
+
+if __name__ == "__main__":
+    # Blank out contents of error_log.txt
+    open("error_log.txt", "w").close()
+
+    asyncio.run(main())
@@ -0,0 +1,28 @@
+import requests
+from concurrent.futures import ThreadPoolExecutor
+
+# Replace the URL with your actual endpoint
+url = "http://localhost:8000/router_acompletion"
+
+
+def make_request(session):
+    headers = {"Content-Type": "application/json"}
+    data = {}  # Replace with your JSON payload if needed
+
+    response = session.post(url, headers=headers, json=data)
+    print(f"Status code: {response.status_code}")
+
+
+# Number of concurrent requests
+num_requests = 20
+
+# Create a session to reuse the underlying TCP connection
+with requests.Session() as session:
+    # Use ThreadPoolExecutor for concurrent requests
+    with ThreadPoolExecutor(max_workers=num_requests) as executor:
+        # Use list comprehension to submit tasks
+        futures = [executor.submit(make_request, session) for _ in range(num_requests)]
+
+    # Wait for all futures to complete
+    for future in futures:
+        future.result()
@@ -1,6 +1,9 @@
 dependencies:
 - name: postgresql
   repository: oci://registry-1.docker.io/bitnamicharts
-  version: 13.3.1
-digest: sha256:f5c129150f0d38dd06752ab37f3c8e143d7c14d30379af058767bcd9f4ba83dd
-generated: "2024-01-19T11:32:56.694808861+11:00"
+  version: 14.3.1
+- name: redis
+  repository: oci://registry-1.docker.io/bitnamicharts
+  version: 18.19.1
+digest: sha256:8660fe6287f9941d08c0902f3f13731079b8cecd2a5da2fbc54e5b7aae4a6f62
+generated: "2024-03-10T02:28:52.275022+05:30"
@@ -31,3 +31,7 @@ dependencies:
   version: ">=13.3.0"
   repository: oci://registry-1.docker.io/bitnamicharts
   condition: db.deployStandalone
+- name: redis
+  version: ">=18.0.0"
+  repository: oci://registry-1.docker.io/bitnamicharts
+  condition: redis.enabled
@@ -28,7 +28,7 @@ If `db.useStackgresOperator` is used (not yet implemented):
 | `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` |
 | `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` |
 | `service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` |
-| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `8000` |
+| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `4000` |
 | `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A |
 | `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |

@@ -76,7 +76,7 @@ When browsing to the URL published per the settings in `ingress.*`, you will
 be prompted for **Admin Configuration**. The **Proxy Endpoint** is the internal
 (from the `litellm` pod's perspective) URL published by the `<RELEASE>-litellm`
 Kubernetes Service. If the deployment uses the default settings for this
-service, the **Proxy Endpoint** should be set to `http://<RELEASE>-litellm:8000`.
+service, the **Proxy Endpoint** should be set to `http://<RELEASE>-litellm:4000`.

 The **Proxy Key** is the value specified for `masterkey` or, if a `masterkey`
 was not provided to the helm command line, the `masterkey` is a randomly
@@ -60,3 +60,25 @@ Create the name of the service account to use
 {{- default "default" .Values.serviceAccount.name }}
 {{- end }}
 {{- end }}
+
+{{/*
+Get redis service name
+*/}}
+{{- define "litellm.redis.serviceName" -}}
+{{- if and (eq .Values.redis.architecture "standalone") .Values.redis.sentinel.enabled -}}
+{{- printf "%s-%s" .Release.Name (default "redis" .Values.redis.nameOverride | trunc 63 | trimSuffix "-") -}}
+{{- else -}}
+{{- printf "%s-%s-master" .Release.Name (default "redis" .Values.redis.nameOverride | trunc 63 | trimSuffix "-") -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Get redis service port
+*/}}
+{{- define "litellm.redis.port" -}}
+{{- if .Values.redis.sentinel.enabled -}}
+{{ .Values.redis.sentinel.service.ports.sentinel }}
+{{- else -}}
+{{ .Values.redis.master.service.ports.redis }}
+{{- end -}}
+{{- end -}}
@@ -142,6 +142,17 @@ spec:
               secretKeyRef:
                 name: {{ include "litellm.fullname" . }}-masterkey
                 key: masterkey
+          {{- if .Values.redis.enabled }}
+          - name: REDIS_HOST
+            value: {{ include "litellm.redis.serviceName" . }}
+          - name: REDIS_PORT
+            value: {{ include "litellm.redis.port" . | quote }}
+          - name: REDIS_PASSWORD
+            valueFrom:
+              secretKeyRef:
+                name: {{ include "redis.secretName" .Subcharts.redis }}
+                key: {{include "redis.secretPasswordKey" .Subcharts.redis }}
+          {{- end }}
           envFrom:
           {{- range .Values.environmentSecrets }}
             - secretRef:
@@ -55,7 +55,7 @@ environmentSecrets: []

 service:
   type: ClusterIP
-  port: 8000
+  port: 4000

 ingress:
   enabled: false
@@ -87,6 +87,8 @@ proxy_config:
       api_key: eXaMpLeOnLy
   general_settings:
     master_key: os.environ/PROXY_MASTER_KEY
+# litellm_settings:
+#   cache: true

 resources: {}
   # We usually recommend not to specify default resources and to leave this as a conscious
@@ -166,3 +168,10 @@ postgresql:
   # existingSecret: ""
   # secretKeys:
   #   userPasswordKey: password
+
+# requires cache: true in config file
+# either enable this or pass a secret for REDIS_HOST, REDIS_PORT, REDIS_PASSWORD or REDIS_URL
+# with cache: true to use existing redis instance
+redis:
+  enabled: false
+  architecture: standalone
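These chart values only matter when the proxy config also turns caching on (`cache: true`), at which point the injected `REDIS_HOST`/`REDIS_PORT`/`REDIS_PASSWORD` variables are used. A hedged sketch of the equivalent SDK-side wiring, mirroring the `litellm.Cache(type="s3", ...)` calls in the new test scripts above and assuming the redis cache type accepts host/port/password parameters:

```python
# Sketch only: wiring the chart-injected redis env vars into litellm's cache.
# Assumption: litellm.Cache(type="redis", ...) accepts host/port/password keyword arguments.
import os
import litellm

litellm.cache = litellm.Cache(
    type="redis",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
)
```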
@@ -8,13 +8,3 @@ services:
       - "4000:4000"
     environment:
       - AZURE_API_KEY=sk-123
-
-  clickhouse:
-    image: clickhouse/clickhouse-server
-    environment:
-      - CLICKHOUSE_DB=litellm-test
-      - CLICKHOUSE_USER=admin
-      - CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1
-      - CLICKHOUSE_PASSWORD=admin
-    ports:
-      - "8123:8123"
docs/my-website/docs/audio_transcription.md (new file, 85 lines)

@@ -0,0 +1,85 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Audio Transcription
+
+Use this to loadbalance across Azure + OpenAI.
+
+## Quick Start
+
+```python
+from litellm import transcription
+import os
+
+# set api keys
+os.environ["OPENAI_API_KEY"] = ""
+audio_file = open("/path/to/audio.mp3", "rb")
+
+response = transcription(model="whisper", file=audio_file)
+
+print(f"response: {response}")
+```
+
+## Proxy Usage
+
+### Add model to config
+
+<Tabs>
+<TabItem value="openai" label="OpenAI">
+
+```yaml
+model_list:
+- model_name: whisper
+  litellm_params:
+    model: whisper-1
+    api_key: os.environ/OPENAI_API_KEY
+  model_info:
+    mode: audio_transcription
+
+general_settings:
+  master_key: sk-1234
+```
+</TabItem>
+<TabItem value="openai+azure" label="OpenAI + Azure">
+
+```yaml
+model_list:
+- model_name: whisper
+  litellm_params:
+    model: whisper-1
+    api_key: os.environ/OPENAI_API_KEY
+  model_info:
+    mode: audio_transcription
+- model_name: whisper
+  litellm_params:
+    model: azure/azure-whisper
+    api_version: 2024-02-15-preview
+    api_base: os.environ/AZURE_EUROPE_API_BASE
+    api_key: os.environ/AZURE_EUROPE_API_KEY
+  model_info:
+    mode: audio_transcription
+
+general_settings:
+  master_key: sk-1234
+```
+
+</TabItem>
+</Tabs>
+
+### Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:8000
+```
+
+### Test
+
+```bash
+curl --location 'http://0.0.0.0:4000/v1/audio/transcriptions' \
+--header 'Authorization: Bearer sk-1234' \
+--form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \
+--form 'model="whisper"'
+```
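The new doc tests the route with curl; a rough Python equivalent against the same proxy is sketched below (not part of the commit; assumes the `openai` v1 client and the example `sk-1234` master key from the config above):

```python
# Sketch: calling the proxy's audio transcription route with the OpenAI v1 client.
from openai import OpenAI

client = OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

with open("gettysburg.wav", "rb") as audio_file:  # any local audio file
    transcript = client.audio.transcriptions.create(model="whisper", file=audio_file)

print(transcript)
```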
@@ -24,6 +24,17 @@ print(response)
 ```

 ### Translated OpenAI params
+
+Use this function to get an up-to-date list of supported openai params for any model + provider.
+
+```python
+from litellm import get_supported_openai_params
+
+response = get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock")
+
+print(response) # ["max_tokens", "tools", "tool_choice", "stream"]
+```
+
 This is a list of openai params we translate across providers.

 This list is constantly being updated.
@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Embedding Models

 ## Quick Start
@@ -7,8 +10,81 @@ import os
 os.environ['OPENAI_API_KEY'] = ""
 response = embedding(model='text-embedding-ada-002', input=["good morning from litellm"])
 ```
+## Proxy Usage
+
-### Input Params for `litellm.embedding()`
+**NOTE**
+For `vertex_ai`,
+```bash
+export GOOGLE_APPLICATION_CREDENTIALS="absolute/path/to/service_account.json"
+```
+
+### Add model to config
+
+```yaml
+model_list:
+- model_name: textembedding-gecko
+  litellm_params:
+    model: vertex_ai/textembedding-gecko
+
+general_settings:
+  master_key: sk-1234
+```
+
+### Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+### Test
+
+<Tabs>
+<TabItem value="curl" label="Curl">
+
+```bash
+curl --location 'http://0.0.0.0:4000/embeddings' \
+--header 'Authorization: Bearer sk-1234' \
+--header 'Content-Type: application/json' \
+--data '{"input": ["Academia.edu uses"], "model": "textembedding-gecko", "encoding_format": "base64"}'
+```
+
+</TabItem>
+<TabItem value="openai" label="OpenAI (python)">
+
+```python
+from openai import OpenAI
+client = OpenAI(
+  api_key="sk-1234",
+  base_url="http://0.0.0.0:4000"
+)
+
+client.embeddings.create(
+  model="textembedding-gecko",
+  input="The food was delicious and the waiter...",
+  encoding_format="float"
+)
+```
+</TabItem>
+<TabItem value="langchain" label="Langchain Embeddings">
+
+```python
+from langchain_openai import OpenAIEmbeddings
+
+embeddings = OpenAIEmbeddings(model="textembedding-gecko", openai_api_base="http://0.0.0.0:4000", openai_api_key="sk-1234")
+
+text = "This is a test document."
+
+query_result = embeddings.embed_query(text)
+
+print(f"VERTEX AI EMBEDDINGS")
+print(query_result[:5])
+```
+</TabItem>
+</Tabs>
+
+## Input Params for `litellm.embedding()`
 ### Required Fields

 - `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`
@@ -124,7 +200,7 @@ Use this for calling `/embedding` endpoints on OpenAI Compatible Servers, exampl
 from litellm import embedding
 response = embedding(
   model = "openai/<your-llm-name>",     # add `openai/` prefix to model so litellm knows to route to OpenAI
-  api_base="http://0.0.0.0:8000/" # set API Base of your Custom OpenAI Endpoint
+  api_base="http://0.0.0.0:4000/" # set API Base of your Custom OpenAI Endpoint
   input=["good morning from litellm"]
 )
 ```
@@ -235,6 +311,35 @@ print(response)
 | mistral-embed | `embedding(model="mistral/mistral-embed", input)` |

+
+## Vertex AI Embedding Models
+
+### Usage - Embedding
+```python
+import litellm
+from litellm import embedding
+litellm.vertex_project = "hardy-device-38811" # Your Project ID
+litellm.vertex_location = "us-central1"  # proj location
+
+
+os.environ['VOYAGE_API_KEY'] = ""
+response = embedding(
+    model="vertex_ai/textembedding-gecko",
+    input=["good morning from litellm"],
+)
+print(response)
+```
+
+## Supported Models
+All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a0249f630a6792d49dffc2c5d9b7/model_prices_and_context_window.json#L835) are supported
+
+| Model Name               | Function Call |
+|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| textembedding-gecko | `embedding(model="vertex_ai/textembedding-gecko", input)` |
+| textembedding-gecko-multilingual | `embedding(model="vertex_ai/textembedding-gecko-multilingual", input)` |
+| textembedding-gecko-multilingual@001 | `embedding(model="vertex_ai/textembedding-gecko-multilingual@001", input)` |
+| textembedding-gecko@001 | `embedding(model="vertex_ai/textembedding-gecko@001", input)` |
+| textembedding-gecko@003 | `embedding(model="vertex_ai/textembedding-gecko@003", input)` |
+
 ## Voyage AI Embedding Models

 ### Usage - Embedding
@@ -12,7 +12,14 @@ https://github.com/BerriAI/litellm
 - Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
 - Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)

-## Basic usage
+## How to use LiteLLM
+You can use litellm through either:
+1. [OpenAI proxy Server](#openai-proxy) - Server to call 100+ LLMs, load balance, cost tracking across projects
+2. [LiteLLM python SDK](#basic-usage) - Python Client to call 100+ LLMs, load balance, cost tracking
+
+## LiteLLM Python SDK
+
+### Basic usage

 <a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_Getting_Started.ipynb">
   <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@@ -146,9 +153,9 @@ response = completion(
 </Tabs>

-## Streaming
+### Streaming

 Set `stream=True` in the `completion` args.

 <Tabs>
 <TabItem value="openai" label="OpenAI">
@@ -280,7 +287,7 @@ response = completion(
 </Tabs>

-## Exception handling
+### Exception handling

 LiteLLM maps exceptions across all supported providers to the OpenAI exceptions. All our exceptions inherit from OpenAI's exception types, so any error-handling you have for that, should work out of the box with LiteLLM.
@@ -296,8 +303,7 @@ except OpenAIError as e:
 print(e)
 ```

-
-## Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks))
+### Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks))

 LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, Helicone, Promptlayer, Traceloop, Slack

 ```python
@@ -317,8 +323,7 @@ litellm.success_callback = ["lunary", "langfuse"] # log input/output to lunary,
 response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
 ```

-
-## Track Costs, Usage, Latency for streaming
+### Track Costs, Usage, Latency for streaming

 Use a callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback

 ```python
@@ -377,14 +382,14 @@ pip install 'litellm[proxy]'
 ```shell
 $ litellm --model huggingface/bigcode/starcoder

-#INFO: Proxy running on http://0.0.0.0:8000
+#INFO: Proxy running on http://0.0.0.0:4000
 ```

 #### Step 2: Make ChatCompletions Request to Proxy

 ```python
 import openai # openai v1.0.0+
-client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url
+client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url
 # request sent to model set on litellm proxy, `litellm --model`
 response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
 {
@@ -1,5 +1,84 @@
+import Image from '@theme/IdealImage';
+
 # 🔥 Load Test LiteLLM

+## Load Test LiteLLM Proxy - 1500+ req/s
+
+## 1500+ concurrent requests/s
+
+LiteLLM proxy has been load tested to handle 1500+ concurrent req/s
+
+```python
+import time, asyncio
+from openai import AsyncOpenAI, AsyncAzureOpenAI
+import uuid
+import traceback
+
+# base_url - litellm proxy endpoint
+# api_key - litellm proxy api-key, is created proxy with auth
+litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
+
+
+async def litellm_completion():
+    # Your existing code for litellm_completion goes here
+    try:
+        response = await litellm_client.chat.completions.create(
+            model="azure-gpt-3.5",
+            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+        )
+        print(response)
+        return response
+
+    except Exception as e:
+        # If there's an exception, log the error message
+        with open("error_log.txt", "a") as error_log:
+            error_log.write(f"Error during completion: {str(e)}\n")
+        pass
+
+
+async def main():
+    for i in range(1):
+        start = time.time()
+        n = 1500  # Number of concurrent tasks
+        tasks = [litellm_completion() for _ in range(n)]
+
+        chat_completions = await asyncio.gather(*tasks)
+
+        successful_completions = [c for c in chat_completions if c is not None]
+
+        # Write errors to error_log.txt
+        with open("error_log.txt", "a") as error_log:
+            for completion in chat_completions:
+                if isinstance(completion, str):
+                    error_log.write(completion + "\n")
+
+        print(n, time.time() - start, len(successful_completions))
+        time.sleep(10)
+
+
+if __name__ == "__main__":
+    # Blank out contents of error_log.txt
+    open("error_log.txt", "w").close()
+
+    asyncio.run(main())
+
+```
+
+### Throughput - 30% Increase
+LiteLLM proxy + Load Balancer gives **30% increase** in throughput compared to Raw OpenAI API
+<Image img={require('../img/throughput.png')} />
+
+### Latency Added - 0.00325 seconds
+LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw OpenAI API
+<Image img={require('../img/latency.png')} />
+
+
+### Testing LiteLLM Proxy with Locust
+- 1 LiteLLM container can handle ~140 requests/second with 0.4 failures
+
+<Image img={require('../img/locust.png')} />
+
+## Load Test LiteLLM SDK vs OpenAI
 Here is a script to load test LiteLLM vs OpenAI

 ```python
@@ -11,7 +90,7 @@ import time, asyncio, litellm
 #### LITELLM PROXY ####
 litellm_client = AsyncOpenAI(
     api_key="sk-1234",             # [CHANGE THIS]
-    base_url="http://0.0.0.0:8000"
+    base_url="http://0.0.0.0:4000"
 )

 #### AZURE OPENAI CLIENT ####
@@ -85,3 +164,4 @@ async def loadtest_fn():
 asyncio.run(loadtest_fn())

 ```
+
@@ -1,9 +1,12 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Anthropic
 LiteLLM supports

+- `claude-3` (`claude-3-opus-20240229`, `claude-3-sonnet-20240229`)
 - `claude-2`
 - `claude-2.1`
-- `claude-instant-1`
 - `claude-instant-1.2`

 ## API Keys
@@ -24,11 +27,217 @@ from litellm import completion
 os.environ["ANTHROPIC_API_KEY"] = "your-api-key"

 messages = [{"role": "user", "content": "Hey! how's it going?"}]
-response = completion(model="claude-instant-1", messages=messages)
+response = completion(model="claude-3-opus-20240229", messages=messages)
 print(response)
 ```

-## Usage - "Assistant Pre-fill"
+## Usage - Streaming
+Just set `stream=True` when calling completion.
+
+```python
+import os
+from litellm import completion
+
+# set env
+os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
+
+messages = [{"role": "user", "content": "Hey! how's it going?"}]
+response = completion(model="claude-3-opus-20240229", messages=messages, stream=True)
+for chunk in response:
+    print(chunk["choices"][0]["delta"]["content"])  # same as openai format
+```
+
+## OpenAI Proxy Usage
+
+Here's how to call Anthropic with the LiteLLM Proxy Server
+
+### 1. Save key in your environment
+
+```bash
+export ANTHROPIC_API_KEY="your-api-key"
+```
+
+### 2. Start the proxy
+
+```bash
+$ litellm --model claude-3-opus-20240229
+
+# Server running on http://0.0.0.0:4000
+```
+
+### 3. Test it
+
+<Tabs>
+<TabItem value="Curl" label="Curl Request">
+
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+    --header 'Content-Type: application/json' \
+    --data ' {
+    "model": "gpt-3.5-turbo",
+    "messages": [
+        {
+        "role": "user",
+        "content": "what llm are you"
+        }
+    ]
+    }
+'
+```
+</TabItem>
+<TabItem value="openai" label="OpenAI v1.0.0+">
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
+    {
+        "role": "user",
+        "content": "this is a test request, write a short poem"
+    }
+])
+
+print(response)
+
+```
+</TabItem>
+<TabItem value="langchain" label="Langchain">
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.schema import HumanMessage, SystemMessage
+
+chat = ChatOpenAI(
+    openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
+    model = "gpt-3.5-turbo",
+    temperature=0.1
+)
+
+messages = [
+    SystemMessage(
+        content="You are a helpful assistant that im using to make a test request to."
+    ),
+    HumanMessage(
+        content="test from litellm. tell me why it's amazing in 1 sentence"
+    ),
+]
+response = chat(messages)
+
+print(response)
+```
+</TabItem>
+</Tabs>
+
+## Supported Models
+
+| Model Name       | Function Call                              |
+|------------------|--------------------------------------------|
+| claude-3-opus  | `completion('claude-3-opus-20240229', messages)` | `os.environ['ANTHROPIC_API_KEY']`       |
+| claude-3-sonnet  | `completion('claude-3-sonnet-20240229', messages)` | `os.environ['ANTHROPIC_API_KEY']`       |
+| claude-2.1  | `completion('claude-2.1', messages)` | `os.environ['ANTHROPIC_API_KEY']`       |
+| claude-2  | `completion('claude-2', messages)` | `os.environ['ANTHROPIC_API_KEY']`       |
+| claude-instant-1.2  | `completion('claude-instant-1.2', messages)` | `os.environ['ANTHROPIC_API_KEY']`       |
+| claude-instant-1  | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']`       |
+
+## Advanced
+
+## Usage - Function Calling
+
+```python
+from litellm import completion
+
+# set env
+os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location"],
+            },
+        },
+    }
+]
+messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
+
+response = completion(
+    model="anthropic/claude-3-opus-20240229",
+    messages=messages,
+    tools=tools,
+    tool_choice="auto",
+)
+# Add any assertions, here to check response args
+print(response)
+assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
+assert isinstance(
+    response.choices[0].message.tool_calls[0].function.arguments, str
+)
+
+```
+
+
+## Usage - Vision
+
+```python
+from litellm import completion
+
+# set env
+os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
+
+def encode_image(image_path):
+    import base64
+
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+
+image_path = "../proxy/cached_logo.jpg"
+# Getting the base64 string
+base64_image = encode_image(image_path)
+resp = litellm.completion(
+    model="anthropic/claude-3-opus-20240229",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Whats in this image?"},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "data:image/jpeg;base64," + base64_image
+                    },
+                },
+            ],
+        }
+    ],
+)
+print(f"\nResponse: {resp}")
+```
+
+### Usage - "Assistant Pre-fill"

 You can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.
@@ -50,7 +259,7 @@ response = completion(model="claude-2.1", messages=messages)
 print(response)
 ```

-### Example prompt sent to Claude
+#### Example prompt sent to Claude

 ```
@@ -61,7 +270,7 @@ Human: How do you say 'Hello' in German? Return your answer as a JSON object, li
 Assistant: {
 ```

-## Usage - "System" messages
+### Usage - "System" messages
 If you're using Anthropic's Claude 2.1 with Bedrock, `system` role messages are properly formatted for you.

 ```python
@@ -78,7 +287,7 @@ messages = [
 response = completion(model="claude-2.1", messages=messages)
 ```

-### Example prompt sent to Claude
+#### Example prompt sent to Claude

 ```
 You are a snarky assistant.
@@ -88,28 +297,3 @@ Human: How do I boil water?
 Assistant:
 ```

-## Streaming
-Just set `stream=True` when calling completion.
-
-```python
-import os
-from litellm import completion
-
-# set env
-os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
-
-messages = [{"role": "user", "content": "Hey! how's it going?"}]
-response = completion(model="claude-instant-1", messages=messages, stream=True)
-for chunk in response:
-    print(chunk["choices"][0]["delta"]["content"])  # same as openai format
-```
-
-
-### Model Details
-
-| Model Name       | Function Call                              | Required OS Variables                |
-|------------------|--------------------------------------------|--------------------------------------|
-| claude-2.1  | `completion('claude-2.1', messages)` | `os.environ['ANTHROPIC_API_KEY']`       |
-| claude-2  | `completion('claude-2', messages)` | `os.environ['ANTHROPIC_API_KEY']`       |
-| claude-instant-1  | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']`       |
-| claude-instant-1.2  | `completion('claude-instant-1.2', messages)` | `os.environ['ANTHROPIC_API_KEY']`       |
@@ -3,9 +3,9 @@
 api_key, api_base, api_version etc can be passed directly to `litellm.completion` - see here or set as `litellm.api_key` params see here
 ```python
 import os
-os.environ["AZURE_API_KEY"] = ""
-os.environ["AZURE_API_BASE"] = ""
-os.environ["AZURE_API_VERSION"] = ""
+os.environ["AZURE_API_KEY"] = "" # "my-azure-api-key"
+os.environ["AZURE_API_BASE"] = "" # "https://example-endpoint.openai.azure.com"
+os.environ["AZURE_API_VERSION"] = "" # "2023-05-15"

 # optional
 os.environ["AZURE_AD_TOKEN"] = ""
@@ -168,6 +168,13 @@ response = completion(
 )
 ```

+## Azure Instruct Models
+
+| Model Name          | Function Call                                      |
+|---------------------|----------------------------------------------------|
+| gpt-3.5-turbo-instruct | `response = completion(model="azure/<your deployment name>", messages=messages)` |
+| gpt-3.5-turbo-instruct-0914 | `response = completion(model="azure/<your deployment name>", messages=messages)` |
+
 ## Advanced
 ### Azure API Load-Balancing
@ -1,3 +1,6 @@
|
||||||
|
import Tabs from '@theme/Tabs';
|
||||||
|
import TabItem from '@theme/TabItem';
|
||||||
|
|
||||||
# AWS Bedrock
|
# AWS Bedrock
|
||||||
Anthropic, Amazon Titan, A121 LLMs are Supported on Bedrock
|
Anthropic, Amazon Titan, A121 LLMs are Supported on Bedrock
|
||||||
|
|
||||||
|
@ -29,11 +32,193 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||||
os.environ["AWS_REGION_NAME"] = ""
|
os.environ["AWS_REGION_NAME"] = ""
|
||||||
|
|
||||||
response = completion(
|
response = completion(
|
||||||
model="bedrock/anthropic.claude-instant-v1",
|
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
|
||||||
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## OpenAI Proxy Usage
|
||||||
|
|
||||||
|
Here's how to call Bedrock models with the LiteLLM Proxy Server
|
||||||
|
|
||||||
|
### 1. Save key in your environment
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export AWS_ACCESS_KEY_ID=""
|
||||||
|
export AWS_SECRET_ACCESS_KEY=""
|
||||||
|
export AWS_REGION_NAME=""
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Start the proxy
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ litellm --model bedrock/anthropic.claude-3-sonnet-20240229-v1:0
|
||||||
|
|
||||||
|
# Server running on http://0.0.0.0:4000
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Test it
|
||||||
|
|
||||||
|
|
||||||
|
<Tabs>
|
||||||
|
<TabItem value="Curl" label="Curl Request">
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--data ' {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "what llm are you"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
'
|
||||||
|
```
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="openai" label="OpenAI v1.0.0+">
|
||||||
|
|
||||||
|
```python
|
||||||
|
import openai
|
||||||
|
client = openai.OpenAI(
|
||||||
|
api_key="anything",
|
||||||
|
base_url="http://0.0.0.0:4000"
|
||||||
|
)
|
||||||
|
|
||||||
|
# request sent to model set on litellm proxy, `litellm --model`
|
||||||
|
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "this is a test request, write a short poem"
|
||||||
|
}
|
||||||
|
])
|
||||||
|
|
||||||
|
print(response)
|
||||||
|
|
||||||
|
```
|
||||||
|
</TabItem>
|
||||||
|
<TabItem value="langchain" label="Langchain">
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain.chat_models import ChatOpenAI
|
||||||
|
from langchain.prompts.chat import (
|
||||||
|
ChatPromptTemplate,
|
||||||
|
HumanMessagePromptTemplate,
|
||||||
|
SystemMessagePromptTemplate,
|
||||||
|
)
|
||||||
|
from langchain.schema import HumanMessage, SystemMessage
|
||||||
|
|
||||||
|
chat = ChatOpenAI(
|
||||||
|
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
|
||||||
|
model = "gpt-3.5-turbo",
|
||||||
|
temperature=0.1
|
||||||
|
)
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
SystemMessage(
|
||||||
|
content="You are a helpful assistant that im using to make a test request to."
|
||||||
|
),
|
||||||
|
HumanMessage(
|
||||||
|
content="test from litellm. tell me why it's amazing in 1 sentence"
|
||||||
|
),
|
||||||
|
]
|
||||||
|
response = chat(messages)
|
||||||
|
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
</TabItem>
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
## Usage - Function Calling
|
||||||
|
|
||||||
|
```python
|
||||||
|
from litellm import completion
|
||||||
|
|
||||||
|
# set env
|
||||||
|
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||||
|
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||||
|
os.environ["AWS_REGION_NAME"] = ""
|
||||||
|
|
||||||
|
tools = [
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "get_current_weather",
|
||||||
|
"description": "Get the current weather in a given location",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"location": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The city and state, e.g. San Francisco, CA",
|
||||||
|
},
|
||||||
|
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||||
|
},
|
||||||
|
"required": ["location"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
|
||||||
|
|
||||||
|
response = completion(
|
||||||
|
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
|
||||||
|
messages=messages,
|
||||||
|
tools=tools,
|
||||||
|
tool_choice="auto",
|
||||||
|
)
|
||||||
|
# Add any assertions, here to check response args
|
||||||
|
print(response)
|
||||||
|
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
|
||||||
|
assert isinstance(
|
||||||
|
response.choices[0].message.tool_calls[0].function.arguments, str
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Usage - Vision
|
||||||
|
|
||||||
|
```python
|
||||||
|
from litellm import completion
|
||||||
|
|
||||||
|
# set env
|
||||||
|
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||||
|
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||||
|
os.environ["AWS_REGION_NAME"] = ""
|
||||||
|
|
||||||
|
|
||||||
|
def encode_image(image_path):
|
||||||
|
import base64
|
||||||
|
|
||||||
|
with open(image_path, "rb") as image_file:
|
||||||
|
return base64.b64encode(image_file.read()).decode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
image_path = "../proxy/cached_logo.jpg"
|
||||||
|
# Getting the base64 string
|
||||||
|
base64_image = encode_image(image_path)
|
||||||
|
resp = litellm.completion(
|
||||||
|
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "Whats in this image?"},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": "data:image/jpeg;base64," + base64_image
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
)
|
||||||
|
print(f"\nResponse: {resp}")
|
||||||
|
```
|
||||||
|
|
||||||
## Usage - "Assistant Pre-fill"
|
## Usage - "Assistant Pre-fill"
|
||||||
|
|
||||||
If you're using Anthropic's Claude with Bedrock, you can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.
|
If you're using Anthropic's Claude with Bedrock, you can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.
|
||||||
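As a quick sketch of what this looks like (the user/assistant messages below are just illustrative), the final `assistant` message becomes the prefix Claude continues from:

```python
# A minimal sketch, assuming AWS credentials are already set in the environment.
# The trailing "assistant" message pre-fills the start of Claude's reply.
from litellm import completion

messages = [
    {"role": "user", "content": "List three colors as a JSON array, nothing else."},
    {"role": "assistant", "content": "["},  # Claude continues from this prefix
]

response = completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=messages,
)
print(response.choices[0].message.content)
```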
|
@ -287,7 +472,8 @@ response = litellm.embedding(
|
||||||
Here's an example of using a bedrock model with LiteLLM
|
Here's an example of using a bedrock model with LiteLLM
|
||||||
|
|
||||||
| Model Name | Command |
|
| Model Name | Command |
|
||||||
|--------------------------|------------------------------------------------------------------|
|
|----------------------------|------------------------------------------------------------------|
|
||||||
|
| Anthropic Claude-V3 | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` |
|
||||||
| Anthropic Claude-V2.1 | `completion(model='bedrock/anthropic.claude-v2:1', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` |
|
| Anthropic Claude-V2.1 | `completion(model='bedrock/anthropic.claude-v2:1', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` |
|
||||||
| Anthropic Claude-V2 | `completion(model='bedrock/anthropic.claude-v2', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` |
|
| Anthropic Claude-V2 | `completion(model='bedrock/anthropic.claude-v2', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` |
|
||||||
| Anthropic Claude-Instant V1 | `completion(model='bedrock/anthropic.claude-instant-v1', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` |
|
| Anthropic Claude-Instant V1 | `completion(model='bedrock/anthropic.claude-instant-v1', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` |
|
||||||
|
@ -298,6 +484,8 @@ Here's an example of using a bedrock model with LiteLLM
|
||||||
| AI21 J2-Ultra | `completion(model='bedrock/ai21.j2-ultra-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
|
| AI21 J2-Ultra | `completion(model='bedrock/ai21.j2-ultra-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
|
||||||
| Meta Llama 2 Chat 13b | `completion(model='bedrock/meta.llama2-13b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
|
| Meta Llama 2 Chat 13b | `completion(model='bedrock/meta.llama2-13b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
|
||||||
| Meta Llama 2 Chat 70b | `completion(model='bedrock/meta.llama2-70b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
|
| Meta Llama 2 Chat 70b | `completion(model='bedrock/meta.llama2-70b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
|
||||||
|
| Mistral 7B Instruct | `completion(model='bedrock/mistral.mistral-7b-instruct-v0:2', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
|
||||||
|
| Mixtral 8x7B Instruct | `completion(model='bedrock/mistral.mixtral-8x7b-instruct-v0:1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
|
||||||
|
|
||||||
## Bedrock Embedding
|
## Bedrock Embedding
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,12 @@ LiteLLM supports all models from [Ollama](https://github.com/jmorganca/ollama)
|
||||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
:::info
|
||||||
|
|
||||||
|
We recommend using [ollama_chat](#using-ollama-apichat) for better responses.
|
||||||
|
|
||||||
|
:::
|
||||||
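For reference, a minimal sketch of what the `ollama_chat` prefix looks like; the model name `llama2` and the `api_base` below are placeholders for whatever you run locally.

```python
# A minimal sketch, assuming a local Ollama server with the "llama2" model pulled.
from litellm import completion

response = completion(
    model="ollama_chat/llama2",         # routes to Ollama's /api/chat endpoint
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    api_base="http://localhost:11434",  # default Ollama endpoint
)
print(response)
```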
|
|
||||||
## Pre-requisites
|
## Pre-requisites
|
||||||
Ensure you have your ollama server running
|
Ensure you have your ollama server running
|
||||||
|
|
||||||
|
@ -177,7 +183,7 @@ On the docker container run the `test.py` file using `python3 test.py`
|
||||||
```python
|
```python
|
||||||
import openai
|
import openai
|
||||||
|
|
||||||
api_base = f"http://0.0.0.0:8000" # base url for server
|
api_base = f"http://0.0.0.0:4000" # base url for server
|
||||||
|
|
||||||
openai.api_base = api_base
|
openai.api_base = api_base
|
||||||
openai.api_key = "temp-key"
|
openai.api_key = "temp-key"
|
||||||
|
|
|
@ -93,6 +93,7 @@ response = completion(
|
||||||
| Model Name | Function Call |
|
| Model Name | Function Call |
|
||||||
|---------------------|----------------------------------------------------|
|
|---------------------|----------------------------------------------------|
|
||||||
| gpt-3.5-turbo-instruct | `response = completion(model="gpt-3.5-turbo-instruct", messages=messages)` |
|
| gpt-3.5-turbo-instruct | `response = completion(model="gpt-3.5-turbo-instruct", messages=messages)` |
|
||||||
|
| gpt-3.5-turbo-instruct-0914 | `response = completion(model="gpt-3.5-turbo-instruct-091", messages=messages)` |
|
||||||
| text-davinci-003 | `response = completion(model="text-davinci-003", messages=messages)` |
|
| text-davinci-003 | `response = completion(model="text-davinci-003", messages=messages)` |
|
||||||
| ada-001 | `response = completion(model="ada-001", messages=messages)` |
|
| ada-001 | `response = completion(model="ada-001", messages=messages)` |
|
||||||
| curie-001 | `response = completion(model="curie-001", messages=messages)` |
|
| curie-001 | `response = completion(model="curie-001", messages=messages)` |
|
||||||
|
|
|
@ -15,7 +15,7 @@ import os
|
||||||
response = litellm.completion(
|
response = litellm.completion(
|
||||||
model="openai/mistral, # add `openai/` prefix to model so litellm knows to route to OpenAI
|
model="openai/mistral, # add `openai/` prefix to model so litellm knows to route to OpenAI
|
||||||
api_key="sk-1234", # api key to your openai compatible endpoint
|
api_key="sk-1234", # api key to your openai compatible endpoint
|
||||||
api_base="http://0.0.0.0:8000", # set API Base of your Custom OpenAI Endpoint
|
api_base="http://0.0.0.0:4000", # set API Base of your Custom OpenAI Endpoint
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
|
@ -35,7 +35,7 @@ import os
|
||||||
response = litellm.embedding(
|
response = litellm.embedding(
|
||||||
model="openai/GPT-J", # add `openai/` prefix to model so litellm knows to route to OpenAI
|
model="openai/GPT-J", # add `openai/` prefix to model so litellm knows to route to OpenAI
|
||||||
api_key="sk-1234", # api key to your openai compatible endpoint
|
api_key="sk-1234", # api key to your openai compatible endpoint
|
||||||
api_base="http://0.0.0.0:8000", # set API Base of your Custom OpenAI Endpoint
|
api_base="http://0.0.0.0:4000", # set API Base of your Custom OpenAI Endpoint
|
||||||
input=["good morning from litellm"]
|
input=["good morning from litellm"]
|
||||||
)
|
)
|
||||||
print(response)
|
print(response)
|
||||||
|
|
|
@ -33,12 +33,16 @@ general_settings:
|
||||||
alerting: ["slack"]
|
alerting: ["slack"]
|
||||||
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
|
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
|
||||||
|
|
||||||
environment_variables:
|
```
|
||||||
|
|
||||||
|
Set `SLACK_WEBHOOK_URL` in your proxy env
|
||||||
|
|
||||||
|
```shell
|
||||||
SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"
|
SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"
|
||||||
```
|
```
|
||||||
|
|
||||||
### Step 3: Start proxy
|
### Step 3: Start proxy
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ litellm /path/to/config.yaml
|
$ litellm --config /path/to/config.yaml
|
||||||
```
|
```
|
61
docs/my-website/docs/proxy/budget_alerts.md
Normal file
61
docs/my-website/docs/proxy/budget_alerts.md
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
import Image from '@theme/IdealImage';
|
||||||
|
|
||||||
|
# 🚨 Budget Alerting
|
||||||
|
|
||||||
|
**Alerts when a project will exceed its planned limit**
|
||||||
|
|
||||||
|
<Image img={require('../../img/budget_alerts.png')} />
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### 1. Setup Slack Alerting on your Proxy Config.yaml
|
||||||
|
|
||||||
|
**Add Slack Webhook to your env**
|
||||||
|
Get a Slack webhook URL from https://api.slack.com/messaging/webhooks
|
||||||
|
|
||||||
|
|
||||||
|
Set `SLACK_WEBHOOK_URL` in your proxy env
|
||||||
|
|
||||||
|
```shell
|
||||||
|
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/<>/<>/<>"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Update proxy config.yaml with slack alerting**
|
||||||
|
|
||||||
|
Add `general_settings:alerting`
|
||||||
|
```yaml
|
||||||
|
model_list:
|
||||||
|
model_name: "azure-model"
|
||||||
|
litellm_params:
|
||||||
|
model: "azure/gpt-35-turbo"
|
||||||
|
|
||||||
|
general_settings:
|
||||||
|
alerting: ["slack"]
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Start proxy
|
||||||
|
```bash
|
||||||
|
$ litellm --config /path/to/config.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### 2. Create API Key on Proxy Admin UI
|
||||||
|
The Admin UI is found at `your-litellm-proxy-endpoint/ui`, e.g. `http://localhost:4000/ui/`
|
||||||
|
|
||||||
|
- Set a key name
|
||||||
|
- Set a Soft Budget on when to get alerted
|
||||||
|
|
||||||
|
<Image img={require('../../img/create_key.png')} />
|
||||||
|
|
||||||
|
|
||||||
|
### 3. Test Slack Alerting on Admin UI
|
||||||
|
After creating a key on the Admin UI, click on "Test Slack Alert" to send a test alert to your Slack channel
|
||||||
|
<Image img={require('../../img/test_alert.png')} />
|
||||||
|
|
||||||
|
### 4. Check Slack
|
||||||
|
|
||||||
|
When the test alert works, you should see this in your alerts Slack channel
|
||||||
|
|
||||||
|
<Image img={require('../../img/budget_alerts.png')} />
|
|
@ -145,7 +145,7 @@ $ litellm --config /path/to/config.yaml
|
||||||
|
|
||||||
Send the same request twice:
|
Send the same request twice:
|
||||||
```shell
|
```shell
|
||||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -153,7 +153,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \
|
||||||
"temperature": 0.7
|
"temperature": 0.7
|
||||||
}'
|
}'
|
||||||
|
|
||||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -166,14 +166,14 @@ curl http://0.0.0.0:8000/v1/chat/completions \
|
||||||
|
|
||||||
Send the same request twice:
|
Send the same request twice:
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/embeddings' \
|
curl --location 'http://0.0.0.0:4000/embeddings' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "text-embedding-ada-002",
|
"model": "text-embedding-ada-002",
|
||||||
"input": ["write a litellm poem"]
|
"input": ["write a litellm poem"]
|
||||||
}'
|
}'
|
||||||
|
|
||||||
curl --location 'http://0.0.0.0:8000/embeddings' \
|
curl --location 'http://0.0.0.0:4000/embeddings' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "text-embedding-ada-002",
|
"model": "text-embedding-ada-002",
|
||||||
|
@ -227,7 +227,7 @@ from openai import OpenAI
|
||||||
client = OpenAI(
|
client = OpenAI(
|
||||||
# This is the default and can be omitted
|
# This is the default and can be omitted
|
||||||
api_key=os.environ.get("OPENAI_API_KEY"),
|
api_key=os.environ.get("OPENAI_API_KEY"),
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
chat_completion = client.chat.completions.create(
|
chat_completion = client.chat.completions.create(
|
||||||
|
@ -255,7 +255,7 @@ from openai import OpenAI
|
||||||
client = OpenAI(
|
client = OpenAI(
|
||||||
# This is the default and can be omitted
|
# This is the default and can be omitted
|
||||||
api_key=os.environ.get("OPENAI_API_KEY"),
|
api_key=os.environ.get("OPENAI_API_KEY"),
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
chat_completion = client.chat.completions.create(
|
chat_completion = client.chat.completions.create(
|
||||||
|
@ -281,7 +281,7 @@ from openai import OpenAI
|
||||||
client = OpenAI(
|
client = OpenAI(
|
||||||
# This is the default and can be omitted
|
# This is the default and can be omitted
|
||||||
api_key=os.environ.get("OPENAI_API_KEY"),
|
api_key=os.environ.get("OPENAI_API_KEY"),
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
chat_completion = client.chat.completions.create(
|
chat_completion = client.chat.completions.create(
|
||||||
|
|
|
@ -63,7 +63,7 @@ litellm_settings:
|
||||||
$ litellm /path/to/config.yaml
|
$ litellm /path/to/config.yaml
|
||||||
```
|
```
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
"messages": [
|
"messages": [
|
||||||
|
@ -162,7 +162,7 @@ litellm_settings:
|
||||||
$ litellm /path/to/config.yaml
|
$ litellm /path/to/config.yaml
|
||||||
```
|
```
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
"messages": [
|
"messages": [
|
||||||
|
|
|
@ -15,7 +15,7 @@ Cli arguments, --host, --port, --num_workers
|
||||||
```
|
```
|
||||||
|
|
||||||
## --port
|
## --port
|
||||||
- **Default:** `8000`
|
- **Default:** `4000`
|
||||||
- The port to bind the server to.
|
- The port to bind the server to.
|
||||||
- **Usage:**
|
- **Usage:**
|
||||||
```shell
|
```shell
|
||||||
|
|
|
@ -13,7 +13,7 @@ Set model list, `api_base`, `api_key`, `temperature` & proxy server settings (`m
|
||||||
| `general_settings` | Server settings, example setting `master_key: sk-my_special_key` |
|
| `general_settings` | Server settings, example setting `master_key: sk-my_special_key` |
|
||||||
| `environment_variables` | Environment Variables example, `REDIS_HOST`, `REDIS_PORT` |
|
| `environment_variables` | Environment Variables example, `REDIS_HOST`, `REDIS_PORT` |
|
||||||
|
|
||||||
**Complete List:** Check the Swagger UI docs on `<your-proxy-url>/#/config.yaml` (e.g. http://0.0.0.0:8000/#/config.yaml), for everything you can pass in the config.yaml.
|
**Complete List:** Check the Swagger UI docs on `<your-proxy-url>/#/config.yaml` (e.g. http://0.0.0.0:4000/#/config.yaml) for everything you can pass in the config.yaml.
|
||||||
|
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
@ -49,13 +49,13 @@ model_list:
|
||||||
rpm: 6
|
rpm: 6
|
||||||
- model_name: anthropic-claude
|
- model_name: anthropic-claude
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model="bedrock/anthropic.claude-instant-v1"
|
model: bedrock/anthropic.claude-instant-v1
|
||||||
### [OPTIONAL] SET AWS REGION ###
|
### [OPTIONAL] SET AWS REGION ###
|
||||||
aws_region_name="us-east-1"
|
aws_region_name: us-east-1
|
||||||
- model_name: vllm-models
|
- model_name: vllm-models
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
|
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
|
||||||
api_base: http://0.0.0.0:8000
|
api_base: http://0.0.0.0:4000
|
||||||
rpm: 1440
|
rpm: 1440
|
||||||
model_info:
|
model_info:
|
||||||
version: 2
|
version: 2
|
||||||
|
@ -91,7 +91,7 @@ Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml.
|
||||||
If multiple with `model_name=gpt-3.5-turbo` does [Load Balancing](https://docs.litellm.ai/docs/proxy/load_balancing)
|
If multiple with `model_name=gpt-3.5-turbo` does [Load Balancing](https://docs.litellm.ai/docs/proxy/load_balancing)
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -111,7 +111,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||||
Sends this request to model where `model_name=bedrock-claude-v1` on config.yaml
|
Sends this request to model where `model_name=bedrock-claude-v1` on config.yaml
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "bedrock-claude-v1",
|
"model": "bedrock-claude-v1",
|
||||||
|
@ -131,7 +131,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||||
import openai
|
import openai
|
||||||
client = openai.OpenAI(
|
client = openai.OpenAI(
|
||||||
api_key="anything",
|
api_key="anything",
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml.
|
# Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml.
|
||||||
|
@ -179,7 +179,7 @@ messages = [
|
||||||
|
|
||||||
# Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml.
|
# Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml.
|
||||||
chat = ChatOpenAI(
|
chat = ChatOpenAI(
|
||||||
openai_api_base="http://0.0.0.0:8000", # set openai base to the proxy
|
openai_api_base="http://0.0.0.0:4000", # set openai base to the proxy
|
||||||
model = "gpt-3.5-turbo",
|
model = "gpt-3.5-turbo",
|
||||||
temperature=0.1
|
temperature=0.1
|
||||||
)
|
)
|
||||||
|
@ -189,7 +189,7 @@ print(response)
|
||||||
|
|
||||||
# Sends request to model where `model_name=bedrock-claude-v1` on config.yaml.
|
# Sends request to model where `model_name=bedrock-claude-v1` on config.yaml.
|
||||||
claude_chat = ChatOpenAI(
|
claude_chat = ChatOpenAI(
|
||||||
openai_api_base="http://0.0.0.0:8000", # set openai base to the proxy
|
openai_api_base="http://0.0.0.0:4000", # set openai base to the proxy
|
||||||
model = "bedrock-claude-v1",
|
model = "bedrock-claude-v1",
|
||||||
temperature=0.1
|
temperature=0.1
|
||||||
)
|
)
|
||||||
|
@ -202,7 +202,7 @@ print(response)
|
||||||
</Tabs>
|
</Tabs>
|
||||||
|
|
||||||
|
|
||||||
## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
|
## Save Model-specific params (API Base, Keys, Temperature, Max Tokens, Organization, Headers etc.)
|
||||||
You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
|
You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
|
||||||
|
|
||||||
[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
|
[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
|
||||||
|
@ -244,6 +244,68 @@ $ litellm --config /path/to/config.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Load Balancing
|
||||||
|
|
||||||
|
Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).
|
||||||
|
|
||||||
|
For optimal performance:
|
||||||
|
- Set `tpm/rpm` per model deployment. Weighted picks are then based on the established tpm/rpm.
|
||||||
|
- Select your optimal routing strategy in `router_settings:routing_strategy`.
|
||||||
|
|
||||||
|
LiteLLM supports
|
||||||
|
```python
|
||||||
|
["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"`
|
||||||
|
```
|
||||||
|
|
||||||
|
When `tpm/rpm` is set and `routing_strategy==simple-shuffle`, litellm will use a weighted pick based on the set tpm/rpm. **In our load tests, setting tpm/rpm for all deployments + `routing_strategy==simple-shuffle` maximized throughput**
|
||||||
|
- When using multiple LiteLLM Servers / Kubernetes, set redis settings `router_settings:redis_host` etc.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
model_list:
|
||||||
|
- model_name: zephyr-beta
|
||||||
|
litellm_params:
|
||||||
|
model: huggingface/HuggingFaceH4/zephyr-7b-beta
|
||||||
|
api_base: http://0.0.0.0:8001
|
||||||
|
rpm: 60 # Optional[int]: When rpm/tpm set - litellm uses weighted pick for load balancing. rpm = Rate limit for this deployment: in requests per minute (rpm).
|
||||||
|
tpm: 1000 # Optional[int]: tpm = Tokens Per Minute
|
||||||
|
- model_name: zephyr-beta
|
||||||
|
litellm_params:
|
||||||
|
model: huggingface/HuggingFaceH4/zephyr-7b-beta
|
||||||
|
api_base: http://0.0.0.0:8002
|
||||||
|
rpm: 600
|
||||||
|
- model_name: zephyr-beta
|
||||||
|
litellm_params:
|
||||||
|
model: huggingface/HuggingFaceH4/zephyr-7b-beta
|
||||||
|
api_base: http://0.0.0.0:8003
|
||||||
|
rpm: 60000
|
||||||
|
- model_name: gpt-3.5-turbo
|
||||||
|
litellm_params:
|
||||||
|
model: gpt-3.5-turbo
|
||||||
|
api_key: <my-openai-key>
|
||||||
|
rpm: 200
|
||||||
|
- model_name: gpt-3.5-turbo-16k
|
||||||
|
litellm_params:
|
||||||
|
model: gpt-3.5-turbo-16k
|
||||||
|
api_key: <my-openai-key>
|
||||||
|
rpm: 100
|
||||||
|
|
||||||
|
litellm_settings:
|
||||||
|
num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
|
||||||
|
request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
|
||||||
|
fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries
|
||||||
|
context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
|
||||||
|
allowed_fails: 3 # cooldown model if it fails > 1 call in a minute.
|
||||||
|
|
||||||
|
router_settings: # router_settings are optional
|
||||||
|
routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
|
||||||
|
model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo`
|
||||||
|
num_retries: 2
|
||||||
|
timeout: 30 # 30 seconds
|
||||||
|
redis_host: <your redis host> # set this when using multiple litellm proxy deployments, load balancing state stored in redis
|
||||||
|
redis_password: <your redis password>
|
||||||
|
redis_port: 1992
|
||||||
|
```
|
||||||
|
|
||||||
## Set Azure `base_model` for cost tracking
|
## Set Azure `base_model` for cost tracking
|
||||||
|
|
||||||
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
|
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
|
||||||
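In the proxy `config.yaml` this is addressed by setting `base_model` under the deployment's `model_info`. As a rough sketch of the same idea with the Python `Router` (the deployment name and env var lookups below are placeholders):

```python
# A sketch, not the proxy's exact config: tag the Azure deployment with its
# underlying base model so spend is priced as gpt-4-1106-preview, not gpt-4.
import os
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "azure-gpt-4-preview",
            "litellm_params": {
                "model": "azure/my-gpt-4-deployment",  # placeholder deployment name
                "api_key": os.environ["AZURE_API_KEY"],
                "api_base": os.environ["AZURE_API_BASE"],
                "api_version": "2023-07-01-preview",
            },
            "model_info": {"base_model": "azure/gpt-4-1106-preview"},
        }
    ]
)
```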
|
@ -498,7 +560,7 @@ litellm --config config.yaml
|
||||||
Sends Request to `bedrock-cohere`
|
Sends Request to `bedrock-cohere`
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "bedrock-cohere",
|
"model": "bedrock-cohere",
|
||||||
|
@ -512,30 +574,6 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
## Router Settings
|
|
||||||
|
|
||||||
Use this to configure things like routing strategy.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
router_settings:
|
|
||||||
routing_strategy: "least-busy"
|
|
||||||
|
|
||||||
model_list: # will route requests to the least busy ollama model
|
|
||||||
- model_name: ollama-models
|
|
||||||
litellm_params:
|
|
||||||
model: "ollama/mistral"
|
|
||||||
api_base: "http://127.0.0.1:8001"
|
|
||||||
- model_name: ollama-models
|
|
||||||
litellm_params:
|
|
||||||
model: "ollama/codellama"
|
|
||||||
api_base: "http://127.0.0.1:8002"
|
|
||||||
- model_name: ollama-models
|
|
||||||
litellm_params:
|
|
||||||
model: "ollama/llama2"
|
|
||||||
api_base: "http://127.0.0.1:8003"
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
## Configure DB Pool Limits + Connection Timeouts
|
## Configure DB Pool Limits + Connection Timeouts
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
|
|
|
@ -28,7 +28,7 @@ docker run ghcr.io/berriai/litellm:main-latest
|
||||||
|
|
||||||
<TabItem value="cli" label="With CLI Args">
|
<TabItem value="cli" label="With CLI Args">
|
||||||
|
|
||||||
### Run with LiteLLM CLI args
|
#### Run with LiteLLM CLI args
|
||||||
|
|
||||||
See all supported CLI args [here](https://docs.litellm.ai/docs/proxy/cli):
|
See all supported CLI args [here](https://docs.litellm.ai/docs/proxy/cli):
|
||||||
|
|
||||||
|
@ -68,8 +68,87 @@ CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug", "--run_gun
|
||||||
|
|
||||||
</TabItem>
|
</TabItem>
|
||||||
|
|
||||||
|
<TabItem value="kubernetes" label="Kubernetes">
|
||||||
|
|
||||||
|
Deploying a config-file-based litellm instance just requires a simple deployment that loads
|
||||||
|
the config.yaml file via a ConfigMap. It is also good practice to declare API keys as env vars
|
||||||
|
and to attach those env vars, with the API key values, as an Opaque secret.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: litellm-config-file
|
||||||
|
data:
|
||||||
|
config.yaml: |
|
||||||
|
model_list:
|
||||||
|
- model_name: gpt-3.5-turbo
|
||||||
|
litellm_params:
|
||||||
|
model: azure/gpt-turbo-small-ca
|
||||||
|
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
|
||||||
|
api_key: os.environ/CA_AZURE_OPENAI_API_KEY
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
type: Opaque
|
||||||
|
metadata:
|
||||||
|
name: litellm-secrets
|
||||||
|
data:
|
||||||
|
CA_AZURE_OPENAI_API_KEY: bWVvd19pbV9hX2NhdA== # your api key in base64
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: litellm-deployment
|
||||||
|
labels:
|
||||||
|
app: litellm
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: litellm
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: litellm
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: litellm
|
||||||
|
image: ghcr.io/berriai/litellm:main-latest # it is recommended to fix a version generally
|
||||||
|
ports:
|
||||||
|
- containerPort: 4000
|
||||||
|
volumeMounts:
|
||||||
|
- name: config-volume
|
||||||
|
mountPath: /app/proxy_server_config.yaml
|
||||||
|
subPath: config.yaml
|
||||||
|
envFrom:
|
||||||
|
- secretRef:
|
||||||
|
name: litellm-secrets
|
||||||
|
volumes:
|
||||||
|
- name: config-volume
|
||||||
|
configMap:
|
||||||
|
name: litellm-config-file
|
||||||
|
```
|
||||||
|
|
||||||
|
:::info
|
||||||
|
To avoid issues with predictability, difficulties in rollback, and inconsistent environments, use versioning or SHA digests (for example, `litellm:main-v1.30.3` or `litellm@sha256:12345abcdef...`) instead of `litellm:main-latest`.
|
||||||
|
:::
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
|
|
||||||
</Tabs>
|
</Tabs>
|
||||||
|
|
||||||
|
**That's it ! That's the quick start to deploy litellm**
|
||||||
|
|
||||||
|
## Options to deploy LiteLLM
|
||||||
|
|
||||||
|
| Docs | When to Use |
|
||||||
|
| --- | --- |
|
||||||
|
| [Quick Start](#quick-start) | call 100+ LLMs + Load Balancing |
|
||||||
|
| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend |
|
||||||
|
| [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
|
||||||
|
| [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |
|
||||||
|
|
||||||
|
|
||||||
## Deploy with Database
|
## Deploy with Database
|
||||||
|
|
||||||
We maintain a [separate Dockerfile](https://github.com/BerriAI/litellm/pkgs/container/litellm-database) for reducing build time when running LiteLLM proxy with a connected Postgres Database
|
We maintain a [separate Dockerfile](https://github.com/BerriAI/litellm/pkgs/container/litellm-database) for reducing build time when running LiteLLM proxy with a connected Postgres Database
|
||||||
|
@ -93,7 +172,7 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
|
||||||
</TabItem>
|
</TabItem>
|
||||||
<TabItem value="kubernetes-deploy" label="Kubernetes">
|
<TabItem value="kubernetes-deploy" label="Kubernetes">
|
||||||
|
|
||||||
### Step 1. Create deployment.yaml
|
#### Step 1. Create deployment.yaml
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
apiVersion: apps/v1
|
apiVersion: apps/v1
|
||||||
|
@ -122,7 +201,7 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
|
||||||
kubectl apply -f /path/to/deployment.yaml
|
kubectl apply -f /path/to/deployment.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
### Step 2. Create service.yaml
|
#### Step 2. Create service.yaml
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
|
@ -143,7 +222,7 @@ spec:
|
||||||
kubectl apply -f /path/to/service.yaml
|
kubectl apply -f /path/to/service.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
### Step 3. Start server
|
#### Step 3. Start server
|
||||||
|
|
||||||
```
|
```
|
||||||
kubectl port-forward service/litellm-service 4000:4000
|
kubectl port-forward service/litellm-service 4000:4000
|
||||||
|
@ -154,13 +233,13 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
|
||||||
</TabItem>
|
</TabItem>
|
||||||
<TabItem value="helm-deploy" label="Helm">
|
<TabItem value="helm-deploy" label="Helm">
|
||||||
|
|
||||||
### Step 1. Clone the repository
|
#### Step 1. Clone the repository
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/BerriAI/litellm.git
|
git clone https://github.com/BerriAI/litellm.git
|
||||||
```
|
```
|
||||||
|
|
||||||
### Step 2. Deploy with Helm
|
#### Step 2. Deploy with Helm
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
helm install \
|
helm install \
|
||||||
|
@ -169,20 +248,91 @@ helm install \
|
||||||
deploy/charts/litellm
|
deploy/charts/litellm
|
||||||
```
|
```
|
||||||
|
|
||||||
### Step 3. Expose the service to localhost
|
#### Step 3. Expose the service to localhost
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
kubectl \
|
kubectl \
|
||||||
port-forward \
|
port-forward \
|
||||||
service/mydeploy-litellm \
|
service/mydeploy-litellm \
|
||||||
8000:8000
|
4000:4000
|
||||||
```
|
```
|
||||||
|
|
||||||
Your OpenAI proxy server is now running on `http://127.0.0.1:8000`.
|
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
|
||||||
|
|
||||||
</TabItem>
|
</TabItem>
|
||||||
</Tabs>
|
</Tabs>
|
||||||
|
|
||||||
|
## LiteLLM container + Redis
|
||||||
|
Use Redis when you need litellm to load balance across multiple litellm containers
|
||||||
|
|
||||||
|
The only change required is setting Redis on your `config.yaml`
|
||||||
|
LiteLLM Proxy supports sharing rpm/tpm across multiple litellm instances; pass `redis_host`, `redis_password` and `redis_port` to enable this. (LiteLLM will use Redis to track rpm/tpm usage.)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
model_list:
|
||||||
|
- model_name: gpt-3.5-turbo
|
||||||
|
litellm_params:
|
||||||
|
model: azure/<your-deployment-name>
|
||||||
|
api_base: <your-azure-endpoint>
|
||||||
|
api_key: <your-azure-api-key>
|
||||||
|
rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
|
||||||
|
- model_name: gpt-3.5-turbo
|
||||||
|
litellm_params:
|
||||||
|
model: azure/gpt-turbo-small-ca
|
||||||
|
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
|
||||||
|
api_key: <your-azure-api-key>
|
||||||
|
rpm: 6
|
||||||
|
router_settings:
|
||||||
|
redis_host: <your redis host>
|
||||||
|
redis_password: <your redis password>
|
||||||
|
redis_port: 1992
|
||||||
|
```
|
||||||
|
|
||||||
|
Start docker container with config
|
||||||
|
|
||||||
|
```shell
|
||||||
|
docker run ghcr.io/berriai/litellm:main-latest --config your_config.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
## LiteLLM Database container + PostgresDB + Redis
|
||||||
|
|
||||||
|
The only change required is setting Redis on your `config.yaml`
|
||||||
|
LiteLLM Proxy supports sharing rpm/tpm across multiple litellm instances; pass `redis_host`, `redis_password` and `redis_port` to enable this. (LiteLLM will use Redis to track rpm/tpm usage.)
|
||||||
|
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
model_list:
|
||||||
|
- model_name: gpt-3.5-turbo
|
||||||
|
litellm_params:
|
||||||
|
model: azure/<your-deployment-name>
|
||||||
|
api_base: <your-azure-endpoint>
|
||||||
|
api_key: <your-azure-api-key>
|
||||||
|
rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
|
||||||
|
- model_name: gpt-3.5-turbo
|
||||||
|
litellm_params:
|
||||||
|
model: azure/gpt-turbo-small-ca
|
||||||
|
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
|
||||||
|
api_key: <your-azure-api-key>
|
||||||
|
rpm: 6
|
||||||
|
router_settings:
|
||||||
|
redis_host: <your redis host>
|
||||||
|
redis_password: <your redis password>
|
||||||
|
redis_port: 1992
|
||||||
|
```
|
||||||
|
|
||||||
|
Start the `litellm-database` docker container with config
|
||||||
|
|
||||||
|
```shell
|
||||||
|
docker run --name litellm-proxy \
|
||||||
|
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
|
||||||
|
-p 4000:4000 \
|
||||||
|
ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices for Deploying to Production
|
||||||
|
### 1. Switch off debug logs in production
|
||||||
|
Don't use [`--detailed-debug`, `--debug`](https://docs.litellm.ai/docs/proxy/debugging#detailed-debug) or `litellm.set_verbose=True`. We found that debug logs can add 5-10% latency per LLM API call.
|
||||||
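If you also use the Python SDK directly, the equivalent is simply leaving verbose logging off (it is off by default; this sketch only makes the setting explicit):

```python
# A minimal sketch: keep verbose debug logging disabled in production.
import litellm

litellm.set_verbose = False  # avoids the extra per-call logging overhead
```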
|
|
||||||
## Advanced Deployment Settings
|
## Advanced Deployment Settings
|
||||||
|
|
||||||
### Customization of the server root path
|
### Customization of the server root path
|
||||||
|
@ -214,8 +364,49 @@ Provide an ssl certificate when starting litellm proxy server
|
||||||
|
|
||||||
## Platform-specific Guide
|
## Platform-specific Guide
|
||||||
|
|
||||||
|
|
||||||
<Tabs>
|
<Tabs>
|
||||||
|
|
||||||
|
<TabItem value="aws-stack" label="AWS Cloud Formation Stack">
|
||||||
|
|
||||||
|
### AWS Cloud Formation Stack
|
||||||
|
LiteLLM AWS Cloudformation Stack - **Get the best LiteLLM AutoScaling Policy and Provision the DB for LiteLLM Proxy**
|
||||||
|
|
||||||
|
This will provision:
|
||||||
|
- LiteLLMServer - EC2 Instance
|
||||||
|
- LiteLLMServerAutoScalingGroup
|
||||||
|
- LiteLLMServerScalingPolicy (autoscaling policy)
|
||||||
|
- LiteLLMDB - RDS::DBInstance
|
||||||
|
|
||||||
|
#### Using AWS Cloud Formation Stack
|
||||||
|
**LiteLLM Cloudformation stack is located [here - litellm.yaml](https://github.com/BerriAI/litellm/blob/main/enterprise/cloudformation_stack/litellm.yaml)**
|
||||||
|
|
||||||
|
#### 1. Create the CloudFormation Stack:
|
||||||
|
In the AWS Management Console, navigate to the CloudFormation service, and click on "Create Stack."
|
||||||
|
|
||||||
|
On the "Create Stack" page, select "Upload a template file" and choose the litellm.yaml file
|
||||||
|
|
||||||
|
Now monitor the stack to confirm it was created successfully.
|
||||||
|
|
||||||
|
#### 2. Get the Database URL:
|
||||||
|
Once the stack is created, get the DatabaseURL of the Database resource and copy this value
|
||||||
|
|
||||||
|
#### 3. Connect to the EC2 Instance and deploy litellm on the EC2 container
|
||||||
|
From the EC2 console, connect to the instance created by the stack (e.g., using SSH).
|
||||||
|
|
||||||
|
Run the following command, replacing <database_url> with the value you copied in step 2
|
||||||
|
|
||||||
|
```shell
|
||||||
|
docker run --name litellm-proxy \
|
||||||
|
-e DATABASE_URL=<database_url> \
|
||||||
|
-p 4000:4000 \
|
||||||
|
ghcr.io/berriai/litellm-database:main-latest
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 4. Access the Application:
|
||||||
|
|
||||||
|
Once the container is running, you can access the application by going to `http://<ec2-public-ip>:4000` in your browser.
|
||||||
|
|
||||||
|
</TabItem>
|
||||||
<TabItem value="google-cloud-run" label="Google Cloud Run">
|
<TabItem value="google-cloud-run" label="Google Cloud Run">
|
||||||
|
|
||||||
### Deploy on Google Cloud Run
|
### Deploy on Google Cloud Run
|
||||||
|
@ -269,9 +460,7 @@ curl https://litellm-7yjrj3ha2q-uc.a.run.app/v1/chat/completions \
|
||||||
|
|
||||||
**Step 1**
|
**Step 1**
|
||||||
|
|
||||||
- (Recommended) Use the example file `docker-compose.example.yml` given in the project root. e.g. https://github.com/BerriAI/litellm/blob/main/docker-compose.example.yml
|
- (Recommended) Use the example file `docker-compose.yml` given in the project root. e.g. https://github.com/BerriAI/litellm/blob/main/docker-compose.yml
|
||||||
|
|
||||||
- Rename the file `docker-compose.example.yml` to `docker-compose.yml`.
|
|
||||||
|
|
||||||
Here's an example `docker-compose.yml` file
|
Here's an example `docker-compose.yml` file
|
||||||
```yaml
|
```yaml
|
||||||
|
@ -284,11 +473,11 @@ services:
|
||||||
target: runtime
|
target: runtime
|
||||||
image: ghcr.io/berriai/litellm:main-latest
|
image: ghcr.io/berriai/litellm:main-latest
|
||||||
ports:
|
ports:
|
||||||
- "8000:8000" # Map the container port to the host, change the host port if necessary
|
- "4000:4000" # Map the container port to the host, change the host port if necessary
|
||||||
volumes:
|
volumes:
|
||||||
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
|
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
|
||||||
# You can change the port or number of workers as per your requirements or pass any new supported CLI augument. Make sure the port passed here matches with the container port defined above in `ports` value
|
# You can change the port or number of workers as per your requirements or pass any new supported CLI augument. Make sure the port passed here matches with the container port defined above in `ports` value
|
||||||
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
|
command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ]
|
||||||
|
|
||||||
# ...rest of your docker-compose config if any
|
# ...rest of your docker-compose config if any
|
||||||
```
|
```
|
||||||
|
@ -306,18 +495,4 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
|
||||||
> Use `-d` flag to run the container in detached mode (background) e.g. `docker compose up -d`
|
> Use `-d` flag to run the container in detached mode (background) e.g. `docker compose up -d`
|
||||||
|
|
||||||
|
|
||||||
Your LiteLLM container should be running now on the defined port e.g. `8000`.
|
Your LiteLLM container should be running now on the defined port e.g. `4000`.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## LiteLLM Proxy Performance
|
|
||||||
|
|
||||||
LiteLLM proxy has been load tested to handle 1500 req/s.
|
|
||||||
|
|
||||||
### Throughput - 30% Increase
|
|
||||||
LiteLLM proxy + Load Balancer gives **30% increase** in throughput compared to Raw OpenAI API
|
|
||||||
<Image img={require('../../img/throughput.png')} />
|
|
||||||
|
|
||||||
### Latency Added - 0.00325 seconds
|
|
||||||
LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw OpenAI API
|
|
||||||
<Image img={require('../../img/latency.png')} />
|
|
||||||
|
|
|
@ -38,7 +38,7 @@ $ litellm --config /path/to/config.yaml
|
||||||
3. Test the embedding call
|
3. Test the embedding call
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/v1/embeddings' \
|
curl --location 'http://0.0.0.0:4000/v1/embeddings' \
|
||||||
--header 'Authorization: Bearer sk-1234' \
|
--header 'Authorization: Bearer sk-1234' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data '{
|
--data '{
|
||||||
|
|
|
@ -12,14 +12,16 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
|
||||||
:::
|
:::
|
||||||
|
|
||||||
Features:
|
Features:
|
||||||
- [ ] Content Moderation with LlamaGuard
|
- ✅ Content Moderation with LlamaGuard
|
||||||
- [ ] Content Moderation with Google Text Moderations
|
- ✅ Content Moderation with Google Text Moderations
|
||||||
- [ ] Content Moderation with LLM Guard
|
- ✅ Content Moderation with LLM Guard
|
||||||
- [ ] Reject calls from Blocked User list
|
- ✅ Reject calls from Blocked User list
|
||||||
- [ ] Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
|
- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
|
||||||
- [ ] Tracking Spend for Custom Tags
|
- ✅ Don't log/store specific requests (eg confidential LLM requests)
|
||||||
|
- ✅ Tracking Spend for Custom Tags
|
||||||
|
|
||||||
## Content Moderation with LlamaGuard
|
## Content Moderation
|
||||||
|
### Content Moderation with LlamaGuard
|
||||||
|
|
||||||
Currently works with Sagemaker's LlamaGuard endpoint.
|
Currently works with Sagemaker's LlamaGuard endpoint.
|
||||||
|
|
||||||
|
@ -39,7 +41,7 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||||
os.environ["AWS_REGION_NAME"] = ""
|
os.environ["AWS_REGION_NAME"] = ""
|
||||||
```
|
```
|
||||||
|
|
||||||
### Customize LlamaGuard prompt
|
#### Customize LlamaGuard prompt
|
||||||
|
|
||||||
To modify the unsafe categories llama guard evaluates against, just create your own version of [this category list](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/llamaguard_prompt.txt)
|
To modify the unsafe categories llama guard evaluates against, just create your own version of [this category list](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/llamaguard_prompt.txt)
|
||||||
|
|
||||||
|
@ -51,12 +53,12 @@ callbacks: ["llamaguard_moderations"]
|
||||||
llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt
|
llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
## Content Moderation with LLM Guard
|
### Content Moderation with LLM Guard
|
||||||
|
|
||||||
Set the LLM Guard API Base in your environment
|
Set the LLM Guard API Base in your environment
|
||||||
|
|
||||||
```env
|
```env
|
||||||
LLM_GUARD_API_BASE = "http://0.0.0.0:8000"
|
LLM_GUARD_API_BASE = "http://0.0.0.0:4000"
|
||||||
```
|
```
|
||||||
|
|
||||||
Add `llmguard_moderations` as a callback
|
Add `llmguard_moderations` as a callback
|
||||||
|
@ -78,7 +80,7 @@ Expected results:
|
||||||
LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
|
LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
|
||||||
```
|
```
|
||||||
|
|
||||||
## Content Moderation with Google Text Moderation
|
### Content Moderation with Google Text Moderation
|
||||||
|
|
||||||
Requires your GOOGLE_APPLICATION_CREDENTIALS to be set in your .env (same as VertexAI).
|
Requires your GOOGLE_APPLICATION_CREDENTIALS to be set in your .env (same as VertexAI).
|
||||||
|
|
||||||
|
@ -89,7 +91,7 @@ litellm_settings:
|
||||||
callbacks: ["google_text_moderation"]
|
callbacks: ["google_text_moderation"]
|
||||||
```
|
```
|
||||||
|
|
||||||
### Set custom confidence thresholds
|
#### Set custom confidence thresholds
|
||||||
|
|
||||||
Google Moderations checks the text against several categories. [Source](https://cloud.google.com/natural-language/docs/moderating-text#safety_attribute_confidence_scores)
|
Google Moderations checks the text against several categories. [Source](https://cloud.google.com/natural-language/docs/moderating-text#safety_attribute_confidence_scores)
|
||||||
|
|
||||||
|
@ -133,6 +135,33 @@ Here are the category specific values:
|
||||||
| "legal" | legal_threshold: 0.1 |
|
| "legal" | legal_threshold: 0.1 |
|
||||||
|
|
||||||
|
|
||||||
|
## Incognito Requests - Don't log anything
|
||||||
|
|
||||||
|
When `no-log=True`, the request will **not be logged on any callbacks** and there will be **no server logs on litellm**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import openai
|
||||||
|
client = openai.OpenAI(
|
||||||
|
api_key="anything", # proxy api-key
|
||||||
|
base_url="http://0.0.0.0:4000" # litellm proxy
|
||||||
|
)
|
||||||
|
|
||||||
|
response = client.chat.completions.create(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "this is a test request, write a short poem"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
extra_body={
|
||||||
|
"no-log": True
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
## Enable Blocked User Lists
|
## Enable Blocked User Lists
|
||||||
If any call is made to the proxy with this user id, it'll be rejected - use this if you want to let users opt out of AI features
|
If any call is made to the proxy with this user id, it'll be rejected - use this if you want to let users opt out of AI features
|
||||||
|
@ -146,7 +175,7 @@ litellm_settings:
|
||||||
### How to test
|
### How to test
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -173,7 +202,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||||
**Block all calls for a user id**
|
**Block all calls for a user id**
|
||||||
|
|
||||||
```
|
```
|
||||||
curl -X POST "http://0.0.0.0:8000/user/block" \
|
curl -X POST "http://0.0.0.0:4000/user/block" \
|
||||||
-H "Authorization: Bearer sk-1234" \
|
-H "Authorization: Bearer sk-1234" \
|
||||||
-D '{
|
-D '{
|
||||||
"user_ids": [<user_id>, ...]
|
"user_ids": [<user_id>, ...]
|
||||||
|
@ -183,7 +212,7 @@ curl -X POST "http://0.0.0.0:8000/user/block" \
|
||||||
**Unblock calls for a user id**
|
**Unblock calls for a user id**
|
||||||
|
|
||||||
```
|
```
|
||||||
curl -X POST "http://0.0.0.0:8000/user/unblock" \
|
curl -X POST "http://0.0.0.0:4000/user/unblock" \
|
||||||
-H "Authorization: Bearer sk-1234" \
|
-H "Authorization: Bearer sk-1234" \
|
||||||
-D '{
|
-D '{
|
||||||
"user_ids": [<user_id>, ...]
|
"user_ids": [<user_id>, ...]
|
||||||
|
@ -201,7 +230,7 @@ litellm_settings:
|
||||||
### Test this
|
### Test this
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -234,7 +263,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass
|
||||||
import openai
|
import openai
|
||||||
client = openai.OpenAI(
|
client = openai.OpenAI(
|
||||||
api_key="anything",
|
api_key="anything",
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
# request sent to model set on litellm proxy, `litellm --model`
|
# request sent to model set on litellm proxy, `litellm --model`
|
||||||
|
@ -262,7 +291,7 @@ print(response)
|
||||||
Pass `metadata` as part of the request body
|
Pass `metadata` as part of the request body
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data '{
|
--data '{
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -288,7 +317,7 @@ from langchain.prompts.chat import (
|
||||||
from langchain.schema import HumanMessage, SystemMessage
|
from langchain.schema import HumanMessage, SystemMessage
|
||||||
|
|
||||||
chat = ChatOpenAI(
|
chat = ChatOpenAI(
|
||||||
openai_api_base="http://0.0.0.0:8000",
|
openai_api_base="http://0.0.0.0:4000",
|
||||||
model = "gpt-3.5-turbo",
|
model = "gpt-3.5-turbo",
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
extra_body={
|
extra_body={
|
||||||
|
@@ -12,10 +12,10 @@ The proxy exposes:
 #### Request
 Make a GET Request to `/health` on the proxy
 ```shell
-curl --location 'http://0.0.0.0:8000/health' -H "Authorization: Bearer sk-1234"
+curl --location 'http://0.0.0.0:4000/health' -H "Authorization: Bearer sk-1234"
 ```
 
-You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:8000/health` for you
+You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:4000/health` for you
 ```
 litellm --health
 ```
@@ -60,7 +60,7 @@ $ litellm /path/to/config.yaml
 
 3. Query health endpoint:
 ```
-curl --location 'http://0.0.0.0:8000/health'
+curl --location 'http://0.0.0.0:4000/health'
 ```
 
 ### Embedding Models
@@ -119,7 +119,7 @@ Unprotected endpoint for checking if proxy is ready to accept requests
 Example Request:
 
 ```bash
-curl --location 'http://0.0.0.0:8000/health/readiness'
+curl --location 'http://0.0.0.0:4000/health/readiness'
 ```
 
 Example Response:
@@ -153,7 +153,7 @@ Example Request:
 
 ```
 curl -X 'GET' \
-  'http://0.0.0.0:8000/health/liveliness' \
+  'http://0.0.0.0:4000/health/liveliness' \
   -H 'accept: application/json'
 ```
 
@@ -45,7 +45,7 @@ $ litellm --config /path/to/config.yaml
 ### Step 3: Use proxy - Call a model group [Load Balancing]
 Curl Command
 ```shell
-curl --location 'http://0.0.0.0:8000/chat/completions' \
+curl --location 'http://0.0.0.0:4000/chat/completions' \
 --header 'Content-Type: application/json' \
 --data ' {
     "model": "gpt-3.5-turbo",
@@ -65,7 +65,7 @@ If you want to call a specific model defined in the `config.yaml`, you can call
 In this example it will call `azure/gpt-turbo-small-ca`. Defined in the config on Step 1
 
 ```bash
-curl --location 'http://0.0.0.0:8000/chat/completions' \
+curl --location 'http://0.0.0.0:4000/chat/completions' \
 --header 'Content-Type: application/json' \
 --data ' {
     "model": "azure/gpt-turbo-small-ca",
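The same two calls work through any OpenAI-compatible client. A minimal Python sketch, assuming the proxy at `http://0.0.0.0:4000` exposes the `gpt-3.5-turbo` model group and the `azure/gpt-turbo-small-ca` deployment from the config referenced above; the message text is illustrative:

```python
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000")

# Call the load-balanced model group; the proxy picks a deployment for you
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "this is a test request, write a short poem"}],
)
print(response)

# Or pin the request to one specific deployment from the config
response = client.chat.completions.create(
    model="azure/gpt-turbo-small-ca",
    messages=[{"role": "user", "content": "this is a test request, write a short poem"}],
)
print(response)
```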
@@ -150,7 +150,7 @@ litellm --config proxy_config.yaml
 ```
 
 ```shell
-curl --location 'http://0.0.0.0:8000/chat/completions' \
+curl --location 'http://0.0.0.0:4000/chat/completions' \
 --header 'Authorization: Bearer sk-1234' \
 --data ' {
     "model": "gpt-3.5-turbo",
@@ -174,7 +174,7 @@ On Success
 Usage: {'completion_tokens': 10, 'prompt_tokens': 11, 'total_tokens': 21},
 Cost: 3.65e-05,
 Response: {'id': 'chatcmpl-8S8avKJ1aVBg941y5xzGMSKrYCMvN', 'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'content': 'Good morning! How can I assist you today?', 'role': 'assistant'}}], 'created': 1701716913, 'model': 'gpt-3.5-turbo-0613', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': {'completion_tokens': 10, 'prompt_tokens': 11, 'total_tokens': 21}}
-Proxy Metadata: {'user_api_key': None, 'headers': Headers({'host': '0.0.0.0:8000', 'user-agent': 'curl/7.88.1', 'accept': '*/*', 'authorization': 'Bearer sk-1234', 'content-length': '199', 'content-type': 'application/x-www-form-urlencoded'}), 'model_group': 'gpt-3.5-turbo', 'deployment': 'gpt-3.5-turbo-ModelID-gpt-3.5-turbo'}
+Proxy Metadata: {'user_api_key': None, 'headers': Headers({'host': '0.0.0.0:4000', 'user-agent': 'curl/7.88.1', 'accept': '*/*', 'authorization': 'Bearer sk-1234', 'content-length': '199', 'content-type': 'application/x-www-form-urlencoded'}), 'model_group': 'gpt-3.5-turbo', 'deployment': 'gpt-3.5-turbo-ModelID-gpt-3.5-turbo'}
 ```
 
 #### Logging Proxy Request Object, Header, Url
@@ -374,7 +374,7 @@ async def log_event(request: Request):
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="127.0.0.1", port=8000)
+    uvicorn.run(app, host="127.0.0.1", port=4000)
 
 
 ```
@@ -383,7 +383,7 @@ if __name__ == "__main__":
 #### Step 2. Set your `GENERIC_LOGGER_ENDPOINT` to the endpoint + route we should send callback logs to
 
 ```shell
-os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:8000/log-event"
+os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:4000/log-event"
 ```
 
 #### Step 3. Create a `config.yaml` file and set `litellm_settings`: `success_callback` = ["generic"]
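For context, the generic logger posts callback payloads to the route configured above. A minimal sketch of what such a receiver might look like, assuming FastAPI; only the `/log-event` route, the `log_event` handler name, and the `uvicorn.run(...)` call appear in the snippet above, the rest is illustrative:

```python
from fastapi import FastAPI, Request
import uvicorn

app = FastAPI()

@app.post("/log-event")
async def log_event(request: Request):
    # the proxy sends the callback payload as JSON in the request body
    data = await request.json()
    print("received litellm log event:", data)
    return {"status": "ok"}

if __name__ == "__main__":
    # matches the endpoint set in GENERIC_LOGGER_ENDPOINT above
    uvicorn.run(app, host="127.0.0.1", port=4000)
```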
@ -445,7 +445,7 @@ Expected output on Langfuse
|
||||||
Pass `metadata` as part of the request body
|
Pass `metadata` as part of the request body
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data '{
|
--data '{
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -472,7 +472,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass
|
||||||
import openai
|
import openai
|
||||||
client = openai.OpenAI(
|
client = openai.OpenAI(
|
||||||
api_key="anything",
|
api_key="anything",
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
# request sent to model set on litellm proxy, `litellm --model`
|
# request sent to model set on litellm proxy, `litellm --model`
|
||||||
|
@ -509,7 +509,7 @@ from langchain.prompts.chat import (
|
||||||
from langchain.schema import HumanMessage, SystemMessage
|
from langchain.schema import HumanMessage, SystemMessage
|
||||||
|
|
||||||
chat = ChatOpenAI(
|
chat = ChatOpenAI(
|
||||||
openai_api_base="http://0.0.0.0:8000",
|
openai_api_base="http://0.0.0.0:4000",
|
||||||
model = "gpt-3.5-turbo",
|
model = "gpt-3.5-turbo",
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
extra_body={
|
extra_body={
|
||||||
|
@ -663,7 +663,7 @@ litellm --config config.yaml --debug
|
||||||
|
|
||||||
Test Request
|
Test Request
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "Azure OpenAI GPT-4 East",
|
"model": "Azure OpenAI GPT-4 East",
|
||||||
|
@ -698,7 +698,7 @@ litellm_settings:
|
||||||
Now, when you [generate keys](./virtual_keys.md) for this team-id
|
Now, when you [generate keys](./virtual_keys.md) for this team-id
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -X POST 'http://0.0.0.0:8000/key/generate' \
|
curl -X POST 'http://0.0.0.0:4000/key/generate' \
|
||||||
-H 'Authorization: Bearer sk-1234' \
|
-H 'Authorization: Bearer sk-1234' \
|
||||||
-H 'Content-Type: application/json' \
|
-H 'Content-Type: application/json' \
|
||||||
-D '{"team_id": "ishaans-secret-project"}'
|
-D '{"team_id": "ishaans-secret-project"}'
|
||||||
|
@ -742,7 +742,7 @@ litellm --config config.yaml --debug
|
||||||
|
|
||||||
Test Request
|
Test Request
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "Azure OpenAI GPT-4 East",
|
"model": "Azure OpenAI GPT-4 East",
|
||||||
|
@ -903,7 +903,7 @@ litellm --config config.yaml --debug
|
||||||
|
|
||||||
Test Request
|
Test Request
|
||||||
```
|
```
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -947,7 +947,7 @@ litellm --config config.yaml --debug
|
||||||
|
|
||||||
Test Request
|
Test Request
|
||||||
```
|
```
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
|
|
@@ -24,7 +24,7 @@ Retrieve detailed information about each model listed in the `/models` endpoint,
 <TabItem value="curl">
 
 ```bash
-curl -X GET "http://0.0.0.0:8000/model/info" \
+curl -X GET "http://0.0.0.0:4000/model/info" \
   -H "accept: application/json" \
 ```
 </TabItem>
@@ -42,7 +42,7 @@ Add a new model to the list in the `config.yaml` by providing the model paramete
 <TabItem value="curl">
 
 ```bash
-curl -X POST "http://0.0.0.0:8000/model/new" \
+curl -X POST "http://0.0.0.0:4000/model/new" \
   -H "accept: application/json" \
   -H "Content-Type: application/json" \
   -d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }'
 
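The same two model-management calls can be made from Python. A minimal sketch using `requests`; the proxy address mirrors the curl examples, and whether an `Authorization` header is also required depends on your auth setup:

```python
import requests

PROXY_BASE = "http://0.0.0.0:4000"
HEADERS = {"accept": "application/json"}  # add an Authorization header if your proxy requires it

# Retrieve detailed info for every model listed in /models
print(requests.get(f"{PROXY_BASE}/model/info", headers=HEADERS).json())

# Add a new model at runtime (same payload as the curl example above)
new_model = {
    "model_name": "azure-gpt-turbo",
    "litellm_params": {
        "model": "azure/gpt-3.5-turbo",
        "api_key": "os.environ/AZURE_API_KEY",
        "api_base": "my-azure-api-base",
    },
}
resp = requests.post(f"{PROXY_BASE}/model/new",
                     headers={**HEADERS, "Content-Type": "application/json"},
                     json=new_model)
print(resp.status_code, resp.text)
```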
@@ -96,7 +96,7 @@ Turn off PII masking for a given key.
 Do this by setting `permissions: {"pii": false}`, when generating a key.
 
 ```shell
-curl --location 'http://0.0.0.0:8000/key/generate' \
+curl --location 'http://0.0.0.0:4000/key/generate' \
 --header 'Authorization: Bearer sk-1234' \
 --header 'Content-Type: application/json' \
 --data '{
@@ -119,7 +119,7 @@ The proxy support 2 request-level PII controls:
 Set `allow_pii_controls` to true for a given key. This will allow the user to set request-level PII controls.
 
 ```bash
-curl --location 'http://0.0.0.0:8000/key/generate' \
+curl --location 'http://0.0.0.0:4000/key/generate' \
 --header 'Authorization: Bearer my-master-key' \
 --header 'Content-Type: application/json' \
 --data '{
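Both key-level PII settings are set at key generation time. A minimal `requests` sketch; the request bodies are truncated in the hunks above, so the exact payload shape (`permissions` / `allow_pii_controls` as top-level fields) is an assumption based on the surrounding prose:

```python
import requests

PROXY_BASE = "http://0.0.0.0:4000"

# Key with PII masking turned off (assumed payload shape, per the prose above)
no_pii = requests.post(
    f"{PROXY_BASE}/key/generate",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={"permissions": {"pii": False}},
)
print(no_pii.json())

# Key allowed to send request-level PII controls (assumed payload shape)
pii_controls = requests.post(
    f"{PROXY_BASE}/key/generate",
    headers={"Authorization": "Bearer my-master-key", "Content-Type": "application/json"},
    json={"allow_pii_controls": True},
)
print(pii_controls.json())
```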
@ -136,7 +136,7 @@ from openai import OpenAI
|
||||||
client = OpenAI(
|
client = OpenAI(
|
||||||
# This is the default and can be omitted
|
# This is the default and can be omitted
|
||||||
api_key=os.environ.get("OPENAI_API_KEY"),
|
api_key=os.environ.get("OPENAI_API_KEY"),
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
chat_completion = client.chat.completions.create(
|
chat_completion = client.chat.completions.create(
|
||||||
|
|
|
@ -21,7 +21,7 @@ Run the following command to start the litellm proxy
|
||||||
```shell
|
```shell
|
||||||
$ litellm --model huggingface/bigcode/starcoder
|
$ litellm --model huggingface/bigcode/starcoder
|
||||||
|
|
||||||
#INFO: Proxy running on http://0.0.0.0:8000
|
#INFO: Proxy running on http://0.0.0.0:4000
|
||||||
```
|
```
|
||||||
|
|
||||||
### Test
|
### Test
|
||||||
|
@ -250,7 +250,7 @@ litellm --config your_config.yaml
|
||||||
<TabItem value="Curl" label="Curl Request">
|
<TabItem value="Curl" label="Curl Request">
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -270,7 +270,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||||
import openai
|
import openai
|
||||||
client = openai.OpenAI(
|
client = openai.OpenAI(
|
||||||
api_key="anything",
|
api_key="anything",
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
# request sent to model set on litellm proxy, `litellm --model`
|
# request sent to model set on litellm proxy, `litellm --model`
|
||||||
|
@ -297,7 +297,7 @@ from langchain.prompts.chat import (
|
||||||
from langchain.schema import HumanMessage, SystemMessage
|
from langchain.schema import HumanMessage, SystemMessage
|
||||||
|
|
||||||
chat = ChatOpenAI(
|
chat = ChatOpenAI(
|
||||||
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
|
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
|
||||||
model = "gpt-3.5-turbo",
|
model = "gpt-3.5-turbo",
|
||||||
temperature=0.1
|
temperature=0.1
|
||||||
)
|
)
|
||||||
|
@ -321,7 +321,7 @@ print(response)
|
||||||
```python
|
```python
|
||||||
from langchain.embeddings import OpenAIEmbeddings
|
from langchain.embeddings import OpenAIEmbeddings
|
||||||
|
|
||||||
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
|
||||||
|
|
||||||
|
|
||||||
text = "This is a test document."
|
text = "This is a test document."
|
||||||
|
@ -331,7 +331,7 @@ query_result = embeddings.embed_query(text)
|
||||||
print(f"SAGEMAKER EMBEDDINGS")
|
print(f"SAGEMAKER EMBEDDINGS")
|
||||||
print(query_result[:5])
|
print(query_result[:5])
|
||||||
|
|
||||||
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
|
||||||
|
|
||||||
text = "This is a test document."
|
text = "This is a test document."
|
||||||
|
|
||||||
|
@ -340,7 +340,7 @@ query_result = embeddings.embed_query(text)
|
||||||
print(f"BEDROCK EMBEDDINGS")
|
print(f"BEDROCK EMBEDDINGS")
|
||||||
print(query_result[:5])
|
print(query_result[:5])
|
||||||
|
|
||||||
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
|
||||||
|
|
||||||
text = "This is a test document."
|
text = "This is a test document."
|
||||||
|
|
||||||
|
@@ -407,11 +407,11 @@ services:
   litellm:
     image: ghcr.io/berriai/litellm:main
     ports:
-      - "8000:8000" # Map the container port to the host, change the host port if necessary
+      - "4000:4000" # Map the container port to the host, change the host port if necessary
     volumes:
       - ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
     # You can change the port or number of workers as per your requirements or pass any new supported CLI augument. Make sure the port passed here matches with the container port defined above in `ports` value
-    command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
+    command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ]
 
 # ...rest of your docker-compose config if any
 ```
@@ -429,7 +429,7 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
 > Use `-d` flag to run the container in detached mode (background) e.g. `docker compose up -d`
 
 
-Your LiteLLM container should be running now on the defined port e.g. `8000`.
+Your LiteLLM container should be running now on the defined port e.g. `4000`.
 
 
 ## Using with OpenAI compatible projects
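Once `docker compose up` is running, you can wait for the mapped port to come up before sending traffic. A minimal Python sketch, assuming the `4000:4000` host port mapping from the compose file above:

```python
import time
import requests

# Poll the unprotected readiness probe until the containerized proxy is up
for _ in range(30):
    try:
        r = requests.get("http://localhost:4000/health/readiness", timeout=2)
        if r.ok:
            print("proxy is ready:", r.json())
            break
    except requests.ConnectionError:
        pass  # container still starting
    time.sleep(2)
else:
    raise RuntimeError("litellm container did not become ready in time")
```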
@ -442,7 +442,7 @@ Set `base_url` to the LiteLLM Proxy server
|
||||||
import openai
|
import openai
|
||||||
client = openai.OpenAI(
|
client = openai.OpenAI(
|
||||||
api_key="anything",
|
api_key="anything",
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
# request sent to model set on litellm proxy, `litellm --model`
|
# request sent to model set on litellm proxy, `litellm --model`
|
||||||
|
@ -463,7 +463,7 @@ print(response)
|
||||||
```shell
|
```shell
|
||||||
litellm --model gpt-3.5-turbo
|
litellm --model gpt-3.5-turbo
|
||||||
|
|
||||||
#INFO: Proxy running on http://0.0.0.0:8000
|
#INFO: Proxy running on http://0.0.0.0:4000
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 1. Clone the repo
|
#### 1. Clone the repo
|
||||||
|
@ -474,9 +474,9 @@ git clone https://github.com/danny-avila/LibreChat.git
|
||||||
|
|
||||||
|
|
||||||
#### 2. Modify Librechat's `docker-compose.yml`
|
#### 2. Modify Librechat's `docker-compose.yml`
|
||||||
LiteLLM Proxy is running on port `8000`, set `8000` as the proxy below
|
LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below
|
||||||
```yaml
|
```yaml
|
||||||
OPENAI_REVERSE_PROXY=http://host.docker.internal:8000/v1/chat/completions
|
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 3. Save fake OpenAI key in Librechat's `.env`
|
#### 3. Save fake OpenAI key in Librechat's `.env`
|
||||||
|
@ -502,7 +502,7 @@ In the [config.py](https://continue.dev/docs/reference/Models/openai) set this a
|
||||||
api_key="IGNORED",
|
api_key="IGNORED",
|
||||||
model="fake-model-name",
|
model="fake-model-name",
|
||||||
context_length=2048, # customize if needed for your model
|
context_length=2048, # customize if needed for your model
|
||||||
api_base="http://localhost:8000" # your proxy server url
|
api_base="http://localhost:4000" # your proxy server url
|
||||||
),
|
),
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -514,7 +514,7 @@ Credits [@vividfog](https://github.com/jmorganca/ollama/issues/305#issuecomment-
|
||||||
```shell
|
```shell
|
||||||
$ pip install aider
|
$ pip install aider
|
||||||
|
|
||||||
$ aider --openai-api-base http://0.0.0.0:8000 --openai-api-key fake-key
|
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
|
||||||
```
|
```
|
||||||
</TabItem>
|
</TabItem>
|
||||||
<TabItem value="autogen" label="AutoGen">
|
<TabItem value="autogen" label="AutoGen">
|
||||||
|
@ -528,7 +528,7 @@ from autogen import AssistantAgent, UserProxyAgent, oai
|
||||||
config_list=[
|
config_list=[
|
||||||
{
|
{
|
||||||
"model": "my-fake-model",
|
"model": "my-fake-model",
|
||||||
"api_base": "http://localhost:8000", #litellm compatible endpoint
|
"api_base": "http://localhost:4000", #litellm compatible endpoint
|
||||||
"api_type": "open_ai",
|
"api_type": "open_ai",
|
||||||
"api_key": "NULL", # just a placeholder
|
"api_key": "NULL", # just a placeholder
|
||||||
}
|
}
|
||||||
|
@ -566,7 +566,7 @@ import guidance
|
||||||
|
|
||||||
# set api_base to your proxy
|
# set api_base to your proxy
|
||||||
# set api_key to anything
|
# set api_key to anything
|
||||||
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:8000", api_key="anything")
|
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
|
||||||
|
|
||||||
experts = guidance('''
|
experts = guidance('''
|
||||||
{{#system~}}
|
{{#system~}}
|
||||||
|
|
|
@ -45,7 +45,7 @@ litellm_settings:
|
||||||
**Set dynamically**
|
**Set dynamically**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "zephyr-beta",
|
"model": "zephyr-beta",
|
||||||
|
@ -101,7 +101,7 @@ LiteLLM Proxy supports setting a `timeout` per request
|
||||||
<TabItem value="Curl" label="Curl Request">
|
<TabItem value="Curl" label="Curl Request">
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{
|
--data-raw '{
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -121,7 +121,7 @@ import openai
|
||||||
|
|
||||||
client = openai.OpenAI(
|
client = openai.OpenAI(
|
||||||
api_key="anything",
|
api_key="anything",
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
|
|
|
@ -30,7 +30,7 @@ $ litellm /path/to/config.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--header 'Authorization: Bearer sk-1234' \
|
--header 'Authorization: Bearer sk-1234' \
|
||||||
--data '{
|
--data '{
|
||||||
|
|
|
@ -65,7 +65,7 @@ litellm --config proxy_config.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Authorization: Bearer sk-1234' \
|
--header 'Authorization: Bearer sk-1234' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
|
71 docs/my-website/docs/proxy/team_based_routing.md Normal file
@@ -0,0 +1,71 @@
+# 👥 Team-based Routing
+
+Route calls to different model groups based on the team-id
+
+## Config with model group
+
+Create a config.yaml with 2 model groups + connected postgres db
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo-eu # 👈 Model Group 1
+    litellm_params:
+      model: azure/chatgpt-v-2
+      api_base: os.environ/AZURE_API_BASE_EU
+      api_key: os.environ/AZURE_API_KEY_EU
+      api_version: "2023-07-01-preview"
+  - model_name: gpt-3.5-turbo-worldwide # 👈 Model Group 2
+    litellm_params:
+      model: azure/chatgpt-v-2
+      api_base: os.environ/AZURE_API_BASE
+      api_key: os.environ/AZURE_API_KEY
+      api_version: "2023-07-01-preview"
+
+general_settings:
+  master_key: sk-1234
+  database_url: "postgresql://..." # 👈 Connect proxy to DB
+```
+
+Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+## Create Team with Model Alias
+
+```bash
+curl --location 'http://0.0.0.0:4000/team/new' \
+--header 'Authorization: Bearer sk-1234' \ # 👈 Master Key
+--header 'Content-Type: application/json' \
+--data '{
+    "team_alias": "my-new-team_4",
+    "model_aliases": {"gpt-3.5-turbo": "gpt-3.5-turbo-eu"}
+}'
+
+# Returns team_id: my-team-id
+```
+
+## Create Team Key
+
+```bash
+curl --location 'http://localhost:4000/key/generate' \
+--header 'Authorization: Bearer sk-1234' \
+--header 'Content-Type: application/json' \
+--data '{
+    "team_id": "my-team-id", # 👈 YOUR TEAM ID
+}'
+```
+
+## Call Model with alias
+
+```bash
+curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Bearer sk-A1L0C3Px2LJl53sF_kTF9A' \
+--data '{
+    "model": "gpt-3.5-turbo", # 👈 MODEL
+    "messages": [{"role": "system", "content": "You'\''re an expert at writing poems"}, {"role": "user", "content": "Write me a poem"}, {"role": "user", "content": "What'\''s your name?"}],
+    "user": "usha"
+}'
+```
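The three curl steps in the new doc above can be chained from Python. A minimal sketch using `requests` plus the OpenAI client; the response field names (`team_id`, `key`) follow the responses shown above and are otherwise an assumption:

```python
import requests
import openai

PROXY_BASE = "http://0.0.0.0:4000"
MASTER = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}

# 1. Create the team with a model alias (gpt-3.5-turbo -> gpt-3.5-turbo-eu)
team = requests.post(f"{PROXY_BASE}/team/new", headers=MASTER, json={
    "team_alias": "my-new-team_4",
    "model_aliases": {"gpt-3.5-turbo": "gpt-3.5-turbo-eu"},
}).json()
team_id = team["team_id"]

# 2. Generate a key scoped to that team
key = requests.post(f"{PROXY_BASE}/key/generate", headers=MASTER,
                    json={"team_id": team_id}).json()["key"]

# 3. Call the alias with the team key; the proxy routes it to gpt-3.5-turbo-eu
client = openai.OpenAI(api_key=key, base_url=PROXY_BASE)
print(client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Write me a poem"}],
))
```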
@@ -28,12 +28,12 @@ Follow [setup](./virtual_keys.md#setup)
 ```bash
 litellm --config /path/to/config.yaml
 
-#INFO: Proxy running on http://0.0.0.0:8000
+#INFO: Proxy running on http://0.0.0.0:4000
 ```
 
 ### 2. Go to UI
 ```bash
-http://0.0.0.0:8000/ui # <proxy_base_url>/ui
+http://0.0.0.0:4000/ui # <proxy_base_url>/ui
 ```
 
 
@ -26,7 +26,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass
|
||||||
import openai
|
import openai
|
||||||
client = openai.OpenAI(
|
client = openai.OpenAI(
|
||||||
api_key="anything",
|
api_key="anything",
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
# request sent to model set on litellm proxy, `litellm --model`
|
# request sent to model set on litellm proxy, `litellm --model`
|
||||||
|
@ -92,7 +92,7 @@ print(response)
|
||||||
Pass `metadata` as part of the request body
|
Pass `metadata` as part of the request body
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data '{
|
--data '{
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -123,7 +123,7 @@ from langchain.prompts.chat import (
|
||||||
from langchain.schema import HumanMessage, SystemMessage
|
from langchain.schema import HumanMessage, SystemMessage
|
||||||
|
|
||||||
chat = ChatOpenAI(
|
chat = ChatOpenAI(
|
||||||
openai_api_base="http://0.0.0.0:8000",
|
openai_api_base="http://0.0.0.0:4000",
|
||||||
model = "gpt-3.5-turbo",
|
model = "gpt-3.5-turbo",
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
extra_body={
|
extra_body={
|
||||||
|
@ -195,7 +195,7 @@ from openai import OpenAI
|
||||||
|
|
||||||
# set base_url to your proxy server
|
# set base_url to your proxy server
|
||||||
# set api_key to send to proxy server
|
# set api_key to send to proxy server
|
||||||
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:8000")
|
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
|
||||||
|
|
||||||
response = client.embeddings.create(
|
response = client.embeddings.create(
|
||||||
input=["hello from litellm"],
|
input=["hello from litellm"],
|
||||||
|
@ -209,7 +209,7 @@ print(response)
|
||||||
<TabItem value="Curl" label="Curl Request">
|
<TabItem value="Curl" label="Curl Request">
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/embeddings' \
|
curl --location 'http://0.0.0.0:4000/embeddings' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data ' {
|
--data ' {
|
||||||
"model": "text-embedding-ada-002",
|
"model": "text-embedding-ada-002",
|
||||||
|
@ -223,7 +223,7 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
|
||||||
```python
|
```python
|
||||||
from langchain.embeddings import OpenAIEmbeddings
|
from langchain.embeddings import OpenAIEmbeddings
|
||||||
|
|
||||||
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
|
||||||
|
|
||||||
|
|
||||||
text = "This is a test document."
|
text = "This is a test document."
|
||||||
|
@ -233,7 +233,7 @@ query_result = embeddings.embed_query(text)
|
||||||
print(f"SAGEMAKER EMBEDDINGS")
|
print(f"SAGEMAKER EMBEDDINGS")
|
||||||
print(query_result[:5])
|
print(query_result[:5])
|
||||||
|
|
||||||
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
|
||||||
|
|
||||||
text = "This is a test document."
|
text = "This is a test document."
|
||||||
|
|
||||||
|
@ -242,7 +242,7 @@ query_result = embeddings.embed_query(text)
|
||||||
print(f"BEDROCK EMBEDDINGS")
|
print(f"BEDROCK EMBEDDINGS")
|
||||||
print(query_result[:5])
|
print(query_result[:5])
|
||||||
|
|
||||||
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
|
||||||
|
|
||||||
text = "This is a test document."
|
text = "This is a test document."
|
||||||
|
|
||||||
|
@ -296,7 +296,7 @@ from openai import OpenAI
|
||||||
|
|
||||||
# set base_url to your proxy server
|
# set base_url to your proxy server
|
||||||
# set api_key to send to proxy server
|
# set api_key to send to proxy server
|
||||||
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:8000")
|
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
|
||||||
|
|
||||||
response = client.moderations.create(
|
response = client.moderations.create(
|
||||||
input="hello from litellm",
|
input="hello from litellm",
|
||||||
|
@ -310,7 +310,7 @@ print(response)
|
||||||
<TabItem value="Curl" label="Curl Request">
|
<TabItem value="Curl" label="Curl Request">
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/moderations' \
|
curl --location 'http://0.0.0.0:4000/moderations' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--header 'Authorization: Bearer sk-1234' \
|
--header 'Authorization: Bearer sk-1234' \
|
||||||
--data '{"input": "Sample text goes here", "model": "text-moderation-stable"}'
|
--data '{"input": "Sample text goes here", "model": "text-moderation-stable"}'
|
||||||
|
@ -421,7 +421,7 @@ user_config = {
|
||||||
import openai
|
import openai
|
||||||
client = openai.OpenAI(
|
client = openai.OpenAI(
|
||||||
api_key="sk-1234",
|
api_key="sk-1234",
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
# send request to `user-azure-instance`
|
# send request to `user-azure-instance`
|
||||||
|
@ -489,7 +489,7 @@ const { OpenAI } = require('openai');
|
||||||
|
|
||||||
const openai = new OpenAI({
|
const openai = new OpenAI({
|
||||||
apiKey: "sk-1234",
|
apiKey: "sk-1234",
|
||||||
baseURL: "http://0.0.0.0:8000"
|
baseURL: "http://0.0.0.0:4000"
|
||||||
});
|
});
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
|
@ -516,7 +516,7 @@ Here's how to do it:
|
||||||
import openai
|
import openai
|
||||||
client = openai.OpenAI(
|
client = openai.OpenAI(
|
||||||
api_key="sk-1234",
|
api_key="sk-1234",
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
# request sent to model set on litellm proxy, `litellm --model`
|
# request sent to model set on litellm proxy, `litellm --model`
|
||||||
|
@ -541,7 +541,7 @@ Pass in the litellm_params (E.g. api_key, api_base, etc.) via the `extra_body` p
|
||||||
import openai
|
import openai
|
||||||
client = openai.OpenAI(
|
client = openai.OpenAI(
|
||||||
api_key="sk-1234",
|
api_key="sk-1234",
|
||||||
base_url="http://0.0.0.0:8000"
|
base_url="http://0.0.0.0:4000"
|
||||||
)
|
)
|
||||||
|
|
||||||
# request sent to model set on litellm proxy, `litellm --model`
|
# request sent to model set on litellm proxy, `litellm --model`
|
||||||
|
@ -571,7 +571,7 @@ const { OpenAI } = require('openai');
|
||||||
|
|
||||||
const openai = new OpenAI({
|
const openai = new OpenAI({
|
||||||
apiKey: "sk-1234",
|
apiKey: "sk-1234",
|
||||||
baseURL: "http://0.0.0.0:8000"
|
baseURL: "http://0.0.0.0:4000"
|
||||||
});
|
});
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
|
|
|
@@ -44,7 +44,7 @@ litellm /path/to/config.yaml
 **Step 3. Send test call**
 
 ```bash
-curl --location 'http://0.0.0.0:8000/chat/completions' \
+curl --location 'http://0.0.0.0:4000/chat/completions' \
 --header 'Autherization: Bearer sk-1234' \
 --header 'Content-Type: application/json' \
 --data '{
@ -72,7 +72,7 @@ By default the `max_budget` is set to `null` and is not checked for keys
|
||||||
|
|
||||||
#### **Add budgets to users**
|
#### **Add budgets to users**
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://localhost:8000/user/new' \
|
curl --location 'http://localhost:4000/user/new' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||||
|
@ -96,7 +96,7 @@ curl --location 'http://localhost:8000/user/new' \
|
||||||
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||||
|
|
||||||
```
|
```
|
||||||
curl 'http://0.0.0.0:8000/user/new' \
|
curl 'http://0.0.0.0:4000/user/new' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{
|
--data-raw '{
|
||||||
|
@ -113,7 +113,7 @@ Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai)
|
||||||
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
|
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
curl --location 'http://0.0.0.0:4000/key/generate' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
|
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
|
||||||
|
@ -127,7 +127,7 @@ You can:
|
||||||
|
|
||||||
#### **Add budgets to users**
|
#### **Add budgets to users**
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://localhost:8000/team/new' \
|
curl --location 'http://localhost:4000/team/new' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{
|
--data-raw '{
|
||||||
|
@ -238,7 +238,7 @@ By default the `max_budget` is set to `null` and is not checked for keys
|
||||||
#### **Add budgets to keys**
|
#### **Add budgets to keys**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl 'http://0.0.0.0:8000/key/generate' \
|
curl 'http://0.0.0.0:4000/key/generate' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{
|
--data-raw '{
|
||||||
|
@ -250,7 +250,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
|
||||||
Example Request to `/chat/completions` when key has crossed budget
|
Example Request to `/chat/completions` when key has crossed budget
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--header 'Authorization: Bearer <generated-key>' \
|
--header 'Authorization: Bearer <generated-key>' \
|
||||||
--data ' {
|
--data ' {
|
||||||
|
@ -278,7 +278,7 @@ Expected Response from `/chat/completions` when key has crossed budget
|
||||||
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||||
|
|
||||||
```
|
```
|
||||||
curl 'http://0.0.0.0:8000/key/generate' \
|
curl 'http://0.0.0.0:4000/key/generate' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{
|
--data-raw '{
|
||||||
|
@ -310,7 +310,7 @@ By default the `model_max_budget` is set to `{}` and is not checked for keys
|
||||||
#### **Add model specific budgets to keys**
|
#### **Add model specific budgets to keys**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl 'http://0.0.0.0:8000/key/generate' \
|
curl 'http://0.0.0.0:4000/key/generate' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{
|
--data-raw '{
|
||||||
|
@ -335,7 +335,7 @@ Use `/user/new`, to persist rate limits across multiple keys.
|
||||||
|
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/user/new' \
|
curl --location 'http://0.0.0.0:4000/user/new' \
|
||||||
--header 'Authorization: Bearer sk-1234' \
|
--header 'Authorization: Bearer sk-1234' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
|
--data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
|
||||||
|
@ -359,7 +359,7 @@ curl --location 'http://0.0.0.0:8000/user/new' \
|
||||||
Use `/key/generate`, if you want them for just that key.
|
Use `/key/generate`, if you want them for just that key.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
curl --location 'http://0.0.0.0:4000/key/generate' \
|
||||||
--header 'Authorization: Bearer sk-1234' \
|
--header 'Authorization: Bearer sk-1234' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
|
--data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
|
||||||
|
@ -401,7 +401,7 @@ model_list:
|
||||||
**Step 2. Create key with access group**
|
**Step 2. Create key with access group**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl --location 'http://localhost:8000/user/new' \
|
curl --location 'http://localhost:4000/user/new' \
|
||||||
-H 'Authorization: Bearer <your-master-key>' \
|
-H 'Authorization: Bearer <your-master-key>' \
|
||||||
-H 'Content-Type: application/json' \
|
-H 'Content-Type: application/json' \
|
||||||
-d '{"models": ["beta-models"], # 👈 Model Access Group
|
-d '{"models": ["beta-models"], # 👈 Model Access Group
|
||||||
|
@ -414,7 +414,7 @@ curl --location 'http://localhost:8000/user/new' \
|
||||||
Just include user_id in the `/key/generate` request.
|
Just include user_id in the `/key/generate` request.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
curl --location 'http://0.0.0.0:4000/key/generate' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data '{"models": ["azure-models"], "user_id": "krrish@berri.ai"}'
|
--data '{"models": ["azure-models"], "user_id": "krrish@berri.ai"}'
|
||||||
|
|
|
@ -59,7 +59,7 @@ litellm --config /path/to/config.yaml
|
||||||
**Step 3: Generate temporary keys**
|
**Step 3: Generate temporary keys**
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl 'http://0.0.0.0:8000/key/generate' \
|
curl 'http://0.0.0.0:4000/key/generate' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
|
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
|
||||||
|
@ -70,7 +70,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
|
||||||
|
|
||||||
### Request
|
### Request
|
||||||
```shell
|
```shell
|
||||||
curl 'http://0.0.0.0:8000/key/generate' \
|
curl 'http://0.0.0.0:4000/key/generate' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{
|
--data-raw '{
|
||||||
|
@@ -79,6 +79,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
     "metadata": {"user": "ishaan@berri.ai"},
     "team_id": "core-infra",
     "max_budget": 10,
+    "soft_budget": 5,
 }'
 ```
 
@@ -93,6 +94,7 @@ Request Params:
 - `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
 - `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
 - `max_budget`: *Optional[float]* - Specify max budget for a given key.
+- `soft_budget`: *Optional[float]* - Specify soft limit budget for a given key. Get Alerts when key hits its soft budget
 - `model_max_budget`: *Optional[dict[str, float]]* - Specify max budget for each model, `model_max_budget={"gpt4": 0.5, "gpt-5": 0.01}`
 - `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
 - `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
@@ -103,7 +105,7 @@ Request Params:
 ```python
 {
     "key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
-    "expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
+    "expires": "2023-11-19T01:38:25.834000+00:00" # datetime object
     "key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
     ...
 }
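Putting the new `soft_budget` param together with the rest of the request, here is a minimal `requests` sketch; the master key and field values mirror the examples above, and the printed fields come from the expected response shown there:

```python
import requests

PROXY_BASE = "http://0.0.0.0:4000"

resp = requests.post(
    f"{PROXY_BASE}/key/generate",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={
        "metadata": {"user": "ishaan@berri.ai"},
        "team_id": "core-infra",
        "max_budget": 10,   # hard budget in USD
        "soft_budget": 5,   # alert threshold added in this change
    },
)
data = resp.json()
print(data["key"], data["expires"])  # fields documented in the expected response above
```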
@ -145,7 +147,7 @@ model_list:
|
||||||
**Step 2: Generate a user key - enabling them access to specific models, custom model aliases, etc.**
|
**Step 2: Generate a user key - enabling them access to specific models, custom model aliases, etc.**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -X POST "https://0.0.0.0:8000/key/generate" \
|
curl -X POST "https://0.0.0.0:4000/key/generate" \
|
||||||
-H "Authorization: Bearer <your-master-key>" \
|
-H "Authorization: Bearer <your-master-key>" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{
|
-d '{
|
||||||
|
@ -180,7 +182,7 @@ model_list:
|
||||||
**Step 2. Create key with access group**
|
**Step 2. Create key with access group**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl --location 'http://localhost:8000/key/generate' \
|
curl --location 'http://localhost:4000/key/generate' \
|
||||||
-H 'Authorization: Bearer <your-master-key>' \
|
-H 'Authorization: Bearer <your-master-key>' \
|
||||||
-H 'Content-Type: application/json' \
|
-H 'Content-Type: application/json' \
|
||||||
-d '{"models": ["beta-models"], # 👈 Model Access Group
|
-d '{"models": ["beta-models"], # 👈 Model Access Group
|
||||||
|
@ -192,7 +194,7 @@ curl --location 'http://localhost:8000/key/generate' \
|
||||||
|
|
||||||
### Request
|
### Request
|
||||||
```shell
|
```shell
|
||||||
curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
|
curl -X GET "http://0.0.0.0:4000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
|
||||||
-H "Authorization: Bearer sk-1234"
|
-H "Authorization: Bearer sk-1234"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -226,7 +228,7 @@ Request Params:
|
||||||
|
|
||||||
### Request
|
### Request
|
||||||
```shell
|
```shell
|
||||||
curl 'http://0.0.0.0:8000/key/update' \
|
curl 'http://0.0.0.0:4000/key/update' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{
|
--data-raw '{
|
||||||
|
@ -264,7 +266,7 @@ Request Params:
|
||||||
|
|
||||||
### Request
|
### Request
|
||||||
```shell
|
```shell
|
||||||
curl 'http://0.0.0.0:8000/key/delete' \
|
curl 'http://0.0.0.0:4000/key/delete' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{
|
--data-raw '{
|
||||||
|
@ -498,7 +500,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
Set `max_budget` in (USD $) param in the `key/generate` request. By default the `max_budget` is set to `null` and is not checked for keys
|
Set `max_budget` in (USD $) param in the `key/generate` request. By default the `max_budget` is set to `null` and is not checked for keys
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl 'http://0.0.0.0:8000/key/generate' \
|
curl 'http://0.0.0.0:4000/key/generate' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{
|
--data-raw '{
|
||||||
|
@ -515,7 +517,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
|
||||||
Example Request to `/chat/completions` when key has crossed budget
|
Example Request to `/chat/completions` when key has crossed budget
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
|
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
|
||||||
--data ' {
|
--data ' {
|
||||||
|
@ -543,10 +545,10 @@ Expected Response from `/chat/completions` when key has crossed budget
|
||||||
|
|
||||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
||||||
|
|
||||||
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
|
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:4000/`). Here's an example request.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl --location 'http://localhost:8000/user/new' \
|
curl --location 'http://localhost:4000/user/new' \
|
||||||
--header 'Authorization: Bearer <your-master-key>' \
|
--header 'Authorization: Bearer <your-master-key>' \
|
||||||
--header 'Content-Type: application/json' \
|
--header 'Content-Type: application/json' \
|
||||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||||
|
@ -569,7 +571,7 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
|
||||||
You can get spend for a key by using the `/key/info` endpoint.
|
You can get spend for a key by using the `/key/info` endpoint.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl 'http://0.0.0.0:8000/key/info?key=<user-key>' \
|
+curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
      -X GET \
      -H 'Authorization: Bearer <your-master-key>'
 ```

@@ -769,7 +771,7 @@ general_settings:
 #### Step 3. Generate Key
 
 ```bash
-curl --location 'http://0.0.0.0:8000/key/generate' \
+curl --location 'http://0.0.0.0:4000/key/generate' \
 --header 'Authorization: Bearer sk-1234' \
 --header 'Content-Type: application/json' \
 --data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'

@@ -22,7 +22,7 @@ $ pip install 'litellm[proxy]'
 ```shell
 $ litellm --model huggingface/bigcode/starcoder
 
-#INFO: Proxy running on http://0.0.0.0:8000
+#INFO: Proxy running on http://0.0.0.0:4000
 ```
 
 ### Test

@@ -39,7 +39,7 @@ This will now automatically route any requests for gpt-3.5-turbo to bigcode star
 <TabItem value="Curl" label="Curl Request">
 
 ```shell
-curl --location 'http://0.0.0.0:8000/chat/completions' \
+curl --location 'http://0.0.0.0:4000/chat/completions' \
 --header 'Content-Type: application/json' \
 --data ' {
     "model": "gpt-3.5-turbo",

@@ -59,7 +59,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
 import openai
 client = openai.OpenAI(
     api_key="anything",
-    base_url="http://0.0.0.0:8000"
+    base_url="http://0.0.0.0:4000"
 )
 
 # request sent to model set on litellm proxy, `litellm --model`

@@ -246,7 +246,7 @@ Set `base_url` to the LiteLLM Proxy server
 import openai
 client = openai.OpenAI(
     api_key="anything",
-    base_url="http://0.0.0.0:8000"
+    base_url="http://0.0.0.0:4000"
 )
 
 # request sent to model set on litellm proxy, `litellm --model`

@@ -267,7 +267,7 @@ print(response)
 ```shell
 litellm --model gpt-3.5-turbo
 
-#INFO: Proxy running on http://0.0.0.0:8000
+#INFO: Proxy running on http://0.0.0.0:4000
 ```
 
 #### 1. Clone the repo

@@ -278,9 +278,9 @@ git clone https://github.com/danny-avila/LibreChat.git
 
 
 #### 2. Modify Librechat's `docker-compose.yml`
-LiteLLM Proxy is running on port `8000`, set `8000` as the proxy below
+LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below
 ```yaml
-OPENAI_REVERSE_PROXY=http://host.docker.internal:8000/v1/chat/completions
+OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
 ```
 
 #### 3. Save fake OpenAI key in Librechat's `.env`

@@ -306,7 +306,7 @@ In the [config.py](https://continue.dev/docs/reference/Models/openai) set this a
       api_key="IGNORED",
       model="fake-model-name",
       context_length=2048, # customize if needed for your model
-      api_base="http://localhost:8000" # your proxy server url
+      api_base="http://localhost:4000" # your proxy server url
     ),
 ```

@@ -318,7 +318,7 @@ Credits [@vividfog](https://github.com/jmorganca/ollama/issues/305#issuecomment-
 ```shell
 $ pip install aider
 
-$ aider --openai-api-base http://0.0.0.0:8000 --openai-api-key fake-key
+$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
 ```
 </TabItem>
 <TabItem value="autogen" label="AutoGen">

@@ -332,7 +332,7 @@ from autogen import AssistantAgent, UserProxyAgent, oai
 config_list=[
     {
         "model": "my-fake-model",
-        "api_base": "http://localhost:8000",  #litellm compatible endpoint
+        "api_base": "http://localhost:4000",  #litellm compatible endpoint
         "api_type": "open_ai",
         "api_key": "NULL", # just a placeholder
     }

@@ -370,7 +370,7 @@ import guidance
 
 # set api_base to your proxy
 # set api_key to anything
-gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:8000", api_key="anything")
+gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
 
 experts = guidance('''
 {{#system~}}

@@ -479,7 +479,7 @@ $ litellm --config /path/to/config.yaml
 #### Step 3: Use proxy
 Curl Command
 ```shell
-curl --location 'http://0.0.0.0:8000/chat/completions' \
+curl --location 'http://0.0.0.0:4000/chat/completions' \
 --header 'Content-Type: application/json' \
 --data ' {
       "model": "zephyr-alpha",

@@ -529,7 +529,7 @@ $ litellm --config /path/to/config.yaml
 #### Step 3: Use proxy
 Curl Command
 ```shell
-curl --location 'http://0.0.0.0:8000/chat/completions' \
+curl --location 'http://0.0.0.0:4000/chat/completions' \
 --header 'Content-Type: application/json' \
 --data ' {
       "model": "gpt-3.5-turbo",

@@ -586,7 +586,7 @@ litellm_settings:
 **Set dynamically**
 
 ```bash
-curl --location 'http://0.0.0.0:8000/chat/completions' \
+curl --location 'http://0.0.0.0:4000/chat/completions' \
 --header 'Content-Type: application/json' \
 --data ' {
       "model": "zephyr-beta",

@@ -615,7 +615,7 @@ model_list:
   - model_name: custom_embedding_model
     litellm_params:
       model: openai/custom_embedding  # the `openai/` prefix tells litellm it's openai compatible
-      api_base: http://0.0.0.0:8000/
+      api_base: http://0.0.0.0:4000/
   - model_name: custom_embedding_model
     litellm_params:
       model: openai/custom_embedding  # the `openai/` prefix tells litellm it's openai compatible

@@ -665,7 +665,7 @@ litellm --config /path/to/config.yaml
 **Step 3: Generate temporary keys**
 
 ```shell
-curl 'http://0.0.0.0:8000/key/generate' \
+curl 'http://0.0.0.0:4000/key/generate' \
 --h 'Authorization: Bearer sk-1234' \
 --d '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m"}'
 ```

@@ -719,7 +719,7 @@ model_list:
 **Step 2: Generate a user key - enabling them access to specific models, custom model aliases, etc.**
 
 ```bash
-curl -X POST "https://0.0.0.0:8000/key/generate" \
+curl -X POST "https://0.0.0.0:4000/key/generate" \
 -H "Authorization: Bearer sk-1234" \
 -H "Content-Type: application/json" \
 -d '{

@@ -737,7 +737,7 @@ curl -X POST "https://0.0.0.0:8000/key/generate" \
 You can get spend for a key by using the `/key/info` endpoint.
 
 ```bash
-curl 'http://0.0.0.0:8000/key/info?key=<user-key>' \
+curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
      -X GET \
      -H 'Authorization: Bearer <your-master-key>'
 ```
|
||||||
#### Using Caching
|
#### Using Caching
|
||||||
Send the same request twice:
|
Send the same request twice:
|
||||||
```shell
|
```shell
|
||||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -876,7 +876,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \
|
||||||
"temperature": 0.7
|
"temperature": 0.7
|
||||||
}'
|
}'
|
||||||
|
|
||||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -889,7 +889,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \
|
||||||
Caching can be switched on/off per `/chat/completions` request
|
Caching can be switched on/off per `/chat/completions` request
|
||||||
- Caching **on** for completion - pass `caching=True`:
|
- Caching **on** for completion - pass `caching=True`:
|
||||||
```shell
|
```shell
|
||||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
@ -900,7 +900,7 @@ Caching can be switched on/off per `/chat/completions` request
|
||||||
```
|
```
|
||||||
- Caching **off** for completion - pass `caching=False`:
|
- Caching **off** for completion - pass `caching=False`:
|
||||||
```shell
|
```shell
|
||||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
|
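The same per-request toggle can also be sent from the OpenAI Python client; this is a sketch that assumes the proxy reads the `caching` flag shown in the curl examples when it is forwarded via `extra_body`.

```python
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000")

# Disable caching for this single request; extra_body forwards non-standard
# fields (here, "caching") to the LiteLLM proxy unchanged.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "what time is it"}],
    extra_body={"caching": False},
)
print(response.choices[0].message.content)
```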
@@ -963,10 +963,10 @@ https://api.openai.com/v1/chat/completions \
 Use this to health check all LLMs defined in your config.yaml
 #### Request
 ```shell
-curl --location 'http://0.0.0.0:8000/health'
+curl --location 'http://0.0.0.0:4000/health'
 ```
 
-You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:8000/health` for you
+You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:4000/health` for you
 ```
 litellm --health
 ```

@@ -1087,7 +1087,7 @@ litellm -config config.yaml
 
 #### Run a test request to Proxy
 ```shell
-curl --location 'http://0.0.0.0:8000/chat/completions' \
+curl --location 'http://0.0.0.0:4000/chat/completions' \
 --header 'Authorization: Bearer sk-1244' \
 --data ' {
     "model": "gpt-3.5-turbo",

@@ -1213,7 +1213,7 @@ LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw Open
 ```
 
 #### --port
-- **Default:** `8000`
+- **Default:** `4000`
 - The port to bind the server to.
 - **Usage:**
   ```shell
New documentation images (binary files, not shown):
- docs/my-website/img/budget_alerts.png (89 KiB)
- docs/my-website/img/create_key.png (140 KiB)
- docs/my-website/img/locust.png (109 KiB)
- docs/my-website/img/test_alert.png (203 KiB)
@@ -39,17 +39,18 @@ const sidebars = {
         "proxy/user_keys",
         "proxy/virtual_keys",
         "proxy/users",
+        "proxy/team_based_routing",
         "proxy/ui",
-        "proxy/metrics",
-        "proxy/model_management",
-        "proxy/health",
-        "proxy/debugging",
-        "proxy/pii_masking",
+        "proxy/budget_alerts",
         {
           type: "category",
           label: "🔥 Load Balancing",
           items: ["proxy/load_balancing", "proxy/reliability"],
         },
+        "proxy/model_management",
+        "proxy/health",
+        "proxy/debugging",
+        "proxy/pii_masking",
         "proxy/caching",
         {
           type: "category",

@@ -90,12 +91,13 @@ const sidebars = {
     },
     {
       type: "category",
-      label: "Embedding(), Moderation(), Image Generation()",
+      label: "Embedding(), Moderation(), Image Generation(), Audio Transcriptions()",
       items: [
         "embedding/supported_embedding",
         "embedding/async_embedding",
         "embedding/moderation",
         "image_generation",
+        "audio_transcription"
       ],
     },
     {
enterprise/cloudformation_stack/litellm.yaml (new file, 44 lines)

@@ -0,0 +1,44 @@
+Resources:
+  LiteLLMServer:
+    Type: AWS::EC2::Instance
+    Properties:
+      AvailabilityZone: us-east-1a
+      ImageId: ami-0f403e3180720dd7e
+      InstanceType: t2.micro
+
+  LiteLLMServerAutoScalingGroup:
+    Type: AWS::AutoScaling::AutoScalingGroup
+    Properties:
+      AvailabilityZones:
+        - us-east-1a
+      LaunchConfigurationName: !Ref LiteLLMServerLaunchConfig
+      MinSize: 1
+      MaxSize: 3
+      DesiredCapacity: 1
+      HealthCheckGracePeriod: 300
+
+  LiteLLMServerLaunchConfig:
+    Type: AWS::AutoScaling::LaunchConfiguration
+    Properties:
+      ImageId: ami-0f403e3180720dd7e # Replace with your desired AMI ID
+      InstanceType: t2.micro
+
+  LiteLLMServerScalingPolicy:
+    Type: AWS::AutoScaling::ScalingPolicy
+    Properties:
+      AutoScalingGroupName: !Ref LiteLLMServerAutoScalingGroup
+      PolicyType: TargetTrackingScaling
+      TargetTrackingConfiguration:
+        PredefinedMetricSpecification:
+          PredefinedMetricType: ASGAverageCPUUtilization
+        TargetValue: 60.0
+
+  LiteLLMDB:
+    Type: AWS::RDS::DBInstance
+    Properties:
+      AllocatedStorage: 20
+      Engine: postgres
+      MasterUsername: litellmAdmin
+      MasterUserPassword: litellmPassword
+      DBInstanceClass: db.t3.micro
+      AvailabilityZone: us-east-1a
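One way to stand up the template above is with boto3; this is only a sketch, and the stack name, region, and waiter usage are illustrative choices rather than part of the template.

```python
import boto3

# Create the LiteLLM CloudFormation stack defined in litellm.yaml.
cfn = boto3.client("cloudformation", region_name="us-east-1")
with open("enterprise/cloudformation_stack/litellm.yaml") as f:
    template_body = f.read()

cfn.create_stack(StackName="litellm-stack", TemplateBody=template_body)
# Block until the EC2 instance, auto scaling group, and RDS instance exist.
cfn.get_waiter("stack_create_complete").wait(StackName="litellm-stack")
```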
@@ -79,6 +79,9 @@ max_budget: float = 0.0 # set the max budget across all providers
 budget_duration: Optional[str] = (
     None  # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
 )
+default_soft_budget: float = (
+    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+)
 _openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"]
 _openai_completion_params = [
     "functions",

@@ -567,9 +570,11 @@ from .utils import (
     _calculate_retry_after,
     _should_retry,
     get_secret,
+    get_supported_openai_params,
 )
 from .llms.huggingface_restapi import HuggingfaceConfig
 from .llms.anthropic import AnthropicConfig
+from .llms.anthropic_text import AnthropicTextConfig
 from .llms.replicate import ReplicateConfig
 from .llms.cohere import CohereConfig
 from .llms.ai21 import AI21Config

@@ -583,14 +588,17 @@ from .llms.petals import PetalsConfig
 from .llms.vertex_ai import VertexAIConfig
 from .llms.sagemaker import SagemakerConfig
 from .llms.ollama import OllamaConfig
+from .llms.ollama_chat import OllamaChatConfig
 from .llms.maritalk import MaritTalkConfig
 from .llms.bedrock import (
     AmazonTitanConfig,
     AmazonAI21Config,
     AmazonAnthropicConfig,
+    AmazonAnthropicClaude3Config,
     AmazonCohereConfig,
     AmazonLlamaConfig,
     AmazonStabilityConfig,
+    AmazonMistralConfig,
 )
 from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
 from .llms.azure import AzureOpenAIConfig, AzureOpenAIError
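A one-line sketch of overriding the new module-level soft-budget default introduced above; it assumes the proxy consults `litellm.default_soft_budget` when generating keys, and 100.0 is an arbitrary value.

```python
import litellm

# Newly generated proxy keys default to a soft budget of 50.0; raise it globally.
litellm.default_soft_budget = 100.0
```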
@@ -31,6 +31,18 @@ def _turn_on_debug():
     verbose_proxy_logger.setLevel(level=logging.DEBUG)  # set proxy logs to debug
 
 
+def _disable_debugging():
+    verbose_logger.disabled = True
+    verbose_router_logger.disabled = True
+    verbose_proxy_logger.disabled = True
+
+
+def _enable_debugging():
+    verbose_logger.disabled = False
+    verbose_router_logger.disabled = False
+    verbose_proxy_logger.disabled = False
+
+
 def print_verbose(print_statement):
     try:
         if set_verbose:
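A small usage sketch for the new helpers; note they are module-private, so importing them directly is an assumption about how callers are expected to reach them.

```python
from litellm._logging import _disable_debugging, _enable_debugging

_disable_debugging()  # mutes the litellm, router, and proxy loggers
# ... run traffic you do not want logged ...
_enable_debugging()   # restore logging
```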
@@ -10,7 +10,7 @@
 import litellm
 import time, logging, asyncio
 import json, traceback, ast, hashlib
-from typing import Optional, Literal, List, Union, Any
+from typing import Optional, Literal, List, Union, Any, BinaryIO
 from openai._models import BaseModel as OpenAIObject
 from litellm._logging import verbose_logger

@@ -48,6 +48,7 @@ class InMemoryCache(BaseCache):
         self.ttl_dict = {}
 
     def set_cache(self, key, value, **kwargs):
+        print_verbose("InMemoryCache: set_cache")
         self.cache_dict[key] = value
         if "ttl" in kwargs:
             self.ttl_dict[key] = time.time() + kwargs["ttl"]

@@ -572,6 +573,7 @@ class S3Cache(BaseCache):
         self.bucket_name = s3_bucket_name
         self.key_prefix = s3_path.rstrip("/") + "/" if s3_path else ""
         # Create an S3 client with custom endpoint URL
+
         self.s3_client = boto3.client(
             "s3",
             region_name=s3_region_name,

@@ -763,8 +765,24 @@ class Cache:
         password: Optional[str] = None,
         similarity_threshold: Optional[float] = None,
         supported_call_types: Optional[
-            List[Literal["completion", "acompletion", "embedding", "aembedding"]]
-        ] = ["completion", "acompletion", "embedding", "aembedding"],
+            List[
+                Literal["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"]
+            ]
+        ] = ["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"],
         # s3 Bucket, boto3 configuration
         s3_bucket_name: Optional[str] = None,
         s3_region_name: Optional[str] = None,

@@ -776,6 +794,7 @@ class Cache:
         s3_aws_secret_access_key: Optional[str] = None,
         s3_aws_session_token: Optional[str] = None,
         s3_config: Optional[Any] = None,
+        s3_path: Optional[str] = None,
         redis_semantic_cache_use_async=False,
         redis_semantic_cache_embedding_model="text-embedding-ada-002",
         **kwargs,

@@ -825,6 +844,7 @@ class Cache:
                 s3_aws_secret_access_key=s3_aws_secret_access_key,
                 s3_aws_session_token=s3_aws_session_token,
                 s3_config=s3_config,
+                s3_path=s3_path,
                 **kwargs,
             )
         if "cache" not in litellm.input_callback:

@@ -877,9 +897,14 @@ class Cache:
             "input",
             "encoding_format",
         ]  # embedding kwargs = model, input, user, encoding_format. Model, user are checked in completion_kwargs
+        transcription_only_kwargs = [
+            "file",
+            "language",
+        ]
         # combined_kwargs - NEEDS to be ordered across get_cache_key(). Do not use a set()
-        combined_kwargs = completion_kwargs + embedding_only_kwargs
+        combined_kwargs = (
+            completion_kwargs + embedding_only_kwargs + transcription_only_kwargs
+        )
         for param in combined_kwargs:
             # ignore litellm params here
             if param in kwargs:

@@ -911,6 +936,17 @@ class Cache:
                     param_value = (
                         caching_group or model_group or kwargs[param]
                     )  # use caching_group, if set then model_group if it exists, else use kwargs["model"]
+                elif param == "file":
+                    metadata_file_name = kwargs.get("metadata", {}).get(
+                        "file_name", None
+                    )
+                    litellm_params_file_name = kwargs.get("litellm_params", {}).get(
+                        "file_name", None
+                    )
+                    if metadata_file_name is not None:
+                        param_value = metadata_file_name
+                    elif litellm_params_file_name is not None:
+                        param_value = litellm_params_file_name
                 else:
                     if kwargs[param] is None:
                         continue  # ignore None params

@@ -1140,8 +1176,24 @@ def enable_cache(
     port: Optional[str] = None,
     password: Optional[str] = None,
     supported_call_types: Optional[
-        List[Literal["completion", "acompletion", "embedding", "aembedding"]]
-    ] = ["completion", "acompletion", "embedding", "aembedding"],
+        List[
+            Literal["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"]
+        ]
+    ] = ["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"],
     **kwargs,
 ):
     """

@@ -1189,8 +1241,24 @@ def update_cache(
     port: Optional[str] = None,
     password: Optional[str] = None,
     supported_call_types: Optional[
-        List[Literal["completion", "acompletion", "embedding", "aembedding"]]
-    ] = ["completion", "acompletion", "embedding", "aembedding"],
+        List[
+            Literal["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"]
+        ]
+    ] = ["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"],
     **kwargs,
 ):
     """
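With the expanded `supported_call_types` above, transcription calls can now be cached alongside completions; a minimal sketch, assuming the default in-memory cache is acceptable.

```python
import litellm
from litellm.caching import Cache

# Cache chat completions and audio transcriptions (sync and async variants).
litellm.cache = Cache(
    supported_call_types=[
        "completion",
        "acompletion",
        "transcription",
        "atranscription",
    ]
)
```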
@@ -124,7 +124,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
                 start_time,
                 end_time,
             )
-            print_verbose(f"Custom Logger - final response object: {response_obj}")
         except:
             # traceback.print_exc()
             print_verbose(f"Custom Logger Error - {traceback.format_exc()}")

@@ -142,7 +141,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
                 start_time,
                 end_time,
             )
-            print_verbose(f"Custom Logger - final response object: {response_obj}")
         except:
             # traceback.print_exc()
             print_verbose(f"Custom Logger Error - {traceback.format_exc()}")
@@ -265,8 +265,14 @@ class LangFuseLogger:
 
             cost = kwargs.get("response_cost", None)
             print_verbose(f"trace: {cost}")
-            if supports_tags:
+
+            # Clean Metadata before logging - never log raw metadata
+            # the raw metadata can contain circular references which leads to infinite recursion
+            # we clean out all extra litellm metadata params before logging
+            clean_metadata = {}
+            if isinstance(metadata, dict):
                 for key, value in metadata.items():
+                    # generate langfuse tags
                     if key in [
                         "user_api_key",
                         "user_api_key_user_id",

@@ -274,6 +280,19 @@ class LangFuseLogger:
                         "semantic-similarity",
                     ]:
                         tags.append(f"{key}:{value}")
+
+                    # clean litellm metadata before logging
+                    if key in [
+                        "headers",
+                        "endpoint",
+                        "caching_groups",
+                        "previous_models",
+                    ]:
+                        continue
+                    else:
+                        clean_metadata[key] = value
+
+            if supports_tags:
                 if "cache_hit" in kwargs:
                     if kwargs["cache_hit"] is None:
                         kwargs["cache_hit"] = False

@@ -301,7 +320,7 @@ class LangFuseLogger:
                 "input": input,
                 "output": output,
                 "usage": usage,
-                "metadata": metadata,
+                "metadata": clean_metadata,
                 "level": level,
             }
@@ -104,6 +104,23 @@ class S3Logger:
             usage = response_obj["usage"]
             id = response_obj.get("id", str(uuid.uuid4()))
 
+            # Clean Metadata before logging - never log raw metadata
+            # the raw metadata can contain circular references which leads to infinite recursion
+            # we clean out all extra litellm metadata params before logging
+            clean_metadata = {}
+            if isinstance(metadata, dict):
+                for key, value in metadata.items():
+                    # clean litellm metadata before logging
+                    if key in [
+                        "headers",
+                        "endpoint",
+                        "caching_groups",
+                        "previous_models",
+                    ]:
+                        continue
+                    else:
+                        clean_metadata[key] = value
+
             # Build the initial payload
             payload = {
                 "id": id,

@@ -117,7 +134,7 @@ class S3Logger:
                 "messages": messages,
                 "response": response_obj,
                 "usage": usage,
-                "metadata": metadata,
+                "metadata": clean_metadata,
             }
 
             # Ensure everything in the payload is converted to str
@@ -77,9 +77,9 @@ class AlephAlphaConfig:
     - `control_log_additive` (boolean; default value: true): Method of applying control to attention scores.
     """
 
-    maximum_tokens: Optional[
-        int
-    ] = litellm.max_tokens  # aleph alpha requires max tokens
+    maximum_tokens: Optional[int] = (
+        litellm.max_tokens
+    )  # aleph alpha requires max tokens
     minimum_tokens: Optional[int] = None
     echo: Optional[bool] = None
     temperature: Optional[int] = None

@@ -285,7 +285,10 @@ def completion(
     ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
     prompt_tokens = len(encoding.encode(prompt))
     completion_tokens = len(
-        encoding.encode(model_response["choices"][0]["message"]["content"])
+        encoding.encode(
+            model_response["choices"][0]["message"]["content"],
+            disallowed_special=(),
+        )
     )
 
     model_response["created"] = int(time.time())
@@ -1,12 +1,18 @@
 import os, types
 import json
 from enum import Enum
-import requests
-import time
+import requests, copy
+import time, uuid
 from typing import Callable, Optional
-from litellm.utils import ModelResponse, Usage
+from litellm.utils import ModelResponse, Usage, map_finish_reason
 import litellm
-from .prompt_templates.factory import prompt_factory, custom_prompt
+from .prompt_templates.factory import (
+    prompt_factory,
+    custom_prompt,
+    construct_tool_use_system_prompt,
+    extract_between_tags,
+    parse_xml_params,
+)
 import httpx
 
 

@@ -20,7 +26,7 @@ class AnthropicError(Exception):
         self.status_code = status_code
         self.message = message
         self.request = httpx.Request(
-            method="POST", url="https://api.anthropic.com/v1/complete"
+            method="POST", url="https://api.anthropic.com/v1/messages"
         )
         self.response = httpx.Response(status_code=status_code, request=self.request)
         super().__init__(

@@ -35,23 +41,23 @@ class AnthropicConfig:
     to pass metadata to anthropic, it's {"user_id": "any-relevant-information"}
     """
 
-    max_tokens_to_sample: Optional[
-        int
-    ] = litellm.max_tokens  # anthropic requires a default
+    max_tokens: Optional[int] = litellm.max_tokens  # anthropic requires a default
     stop_sequences: Optional[list] = None
     temperature: Optional[int] = None
     top_p: Optional[int] = None
     top_k: Optional[int] = None
     metadata: Optional[dict] = None
+    system: Optional[str] = None
 
     def __init__(
         self,
-        max_tokens_to_sample: Optional[int] = 256,  # anthropic requires a default
+        max_tokens: Optional[int] = 256,  # anthropic requires a default
         stop_sequences: Optional[list] = None,
         temperature: Optional[int] = None,
         top_p: Optional[int] = None,
         top_k: Optional[int] = None,
         metadata: Optional[dict] = None,
+        system: Optional[str] = None,
     ) -> None:
         locals_ = locals()
         for key, value in locals_.items():

@@ -110,6 +116,8 @@ def completion(
     headers={},
 ):
     headers = validate_environment(api_key, headers)
+    _is_function_call = False
+    messages = copy.deepcopy(messages)
     if model in custom_prompt_dict:
         # check if the model has a registered custom prompt
         model_prompt_details = custom_prompt_dict[model]

@@ -120,7 +128,17 @@ def completion(
             messages=messages,
         )
     else:
-        prompt = prompt_factory(
+        # Separate system prompt from rest of message
+        system_prompt_idx: Optional[int] = None
+        for idx, message in enumerate(messages):
+            if message["role"] == "system":
+                optional_params["system"] = message["content"]
+                system_prompt_idx = idx
+                break
+        if system_prompt_idx is not None:
+            messages.pop(system_prompt_idx)
+        # Format rest of message according to anthropic guidelines
+        messages = prompt_factory(
             model=model, messages=messages, custom_llm_provider="anthropic"
         )
 

@@ -132,15 +150,26 @@ def completion(
         ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
             optional_params[k] = v
 
+    ## Handle Tool Calling
+    if "tools" in optional_params:
+        _is_function_call = True
+        tool_calling_system_prompt = construct_tool_use_system_prompt(
+            tools=optional_params["tools"]
+        )
+        optional_params["system"] = (
+            optional_params.get("system", "\n") + tool_calling_system_prompt
+        )  # add the anthropic tool calling prompt to the system prompt
+        optional_params.pop("tools")
+
     data = {
         "model": model,
-        "prompt": prompt,
+        "messages": messages,
         **optional_params,
     }
 
     ## LOGGING
     logging_obj.pre_call(
-        input=prompt,
+        input=messages,
         api_key=api_key,
         additional_args={
             "complete_input_dict": data,

@@ -173,7 +202,7 @@ def completion(
 
         ## LOGGING
         logging_obj.post_call(
-            input=prompt,
+            input=messages,
             api_key=api_key,
             original_response=response.text,
             additional_args={"complete_input_dict": data},

@@ -191,20 +220,45 @@ def completion(
                 message=str(completion_response["error"]),
                 status_code=response.status_code,
             )
+        elif len(completion_response["content"]) == 0:
+            raise AnthropicError(
+                message="No content in response",
+                status_code=response.status_code,
+            )
         else:
-            if len(completion_response["completion"]) > 0:
-                model_response["choices"][0]["message"][
-                    "content"
-                ] = completion_response["completion"]
-            model_response.choices[0].finish_reason = completion_response["stop_reason"]
+            text_content = completion_response["content"][0].get("text", None)
+            ## TOOL CALLING - OUTPUT PARSE
+            if text_content is not None and "invoke" in text_content:
+                function_name = extract_between_tags("tool_name", text_content)[0]
+                function_arguments_str = extract_between_tags("invoke", text_content)[
+                    0
+                ].strip()
+                function_arguments_str = f"<invoke>{function_arguments_str}</invoke>"
+                function_arguments = parse_xml_params(function_arguments_str)
+                _message = litellm.Message(
+                    tool_calls=[
+                        {
+                            "id": f"call_{uuid.uuid4()}",
+                            "type": "function",
+                            "function": {
+                                "name": function_name,
+                                "arguments": json.dumps(function_arguments),
+                            },
+                        }
+                    ],
+                    content=None,
+                )
+                model_response.choices[0].message = _message  # type: ignore
+            else:
+                model_response.choices[0].message.content = text_content  # type: ignore
+            model_response.choices[0].finish_reason = map_finish_reason(
+                completion_response["stop_reason"]
+            )
 
         ## CALCULATING USAGE
-        prompt_tokens = len(
-            encoding.encode(prompt)
-        )  ##[TODO] use the anthropic tokenizer here
-        completion_tokens = len(
-            encoding.encode(model_response["choices"][0]["message"].get("content", ""))
-        )  ##[TODO] use the anthropic tokenizer here
+        prompt_tokens = completion_response["usage"]["input_tokens"]
+        completion_tokens = completion_response["usage"]["output_tokens"]
+        total_tokens = prompt_tokens + completion_tokens
 
         model_response["created"] = int(time.time())
         model_response["model"] = model
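For reference, a short sketch of exercising the updated handler through `litellm.completion`; the Claude 3 model id and the `anthropic/` prefix are assumptions used for illustration.

```python
import litellm

# The system message is lifted into Anthropic's top-level `system` field and the
# remaining turns are sent to the /v1/messages endpoint by the handler above.
response = litellm.completion(
    model="anthropic/claude-3-opus-20240229",  # example model id
    messages=[
        {"role": "system", "content": "You are a terse assistant."},
        {"role": "user", "content": "Say hello."},
    ],
    max_tokens=256,
)
print(response.choices[0].message.content)
```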
222
litellm/llms/anthropic_text.py
Normal file
222
litellm/llms/anthropic_text.py
Normal file
|
@ -0,0 +1,222 @@
|
||||||
|
import os, types
|
||||||
|
import json
|
||||||
|
from enum import Enum
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
from typing import Callable, Optional
|
||||||
|
from litellm.utils import ModelResponse, Usage
|
||||||
|
import litellm
|
||||||
|
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
|
||||||
|
class AnthropicConstants(Enum):
|
||||||
|
HUMAN_PROMPT = "\n\nHuman: "
|
||||||
|
AI_PROMPT = "\n\nAssistant: "
|
||||||
|
|
||||||
|
|
||||||
|
class AnthropicError(Exception):
|
||||||
|
def __init__(self, status_code, message):
|
||||||
|
self.status_code = status_code
|
||||||
|
self.message = message
|
||||||
|
self.request = httpx.Request(
|
||||||
|
method="POST", url="https://api.anthropic.com/v1/complete"
|
||||||
|
)
|
||||||
|
self.response = httpx.Response(status_code=status_code, request=self.request)
|
||||||
|
super().__init__(
|
||||||
|
self.message
|
||||||
|
) # Call the base class constructor with the parameters it needs
|
||||||
|
|
||||||
|
|
||||||
|
class AnthropicTextConfig:
|
||||||
|
"""
|
||||||
|
Reference: https://docs.anthropic.com/claude/reference/complete_post
|
||||||
|
|
||||||
|
to pass metadata to anthropic, it's {"user_id": "any-relevant-information"}
|
||||||
|
"""
|
||||||
|
|
||||||
|
max_tokens_to_sample: Optional[int] = (
|
||||||
|
litellm.max_tokens
|
||||||
|
) # anthropic requires a default
|
||||||
|
stop_sequences: Optional[list] = None
|
||||||
|
temperature: Optional[int] = None
|
||||||
|
top_p: Optional[int] = None
|
||||||
|
top_k: Optional[int] = None
|
||||||
|
metadata: Optional[dict] = None
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
max_tokens_to_sample: Optional[int] = 256, # anthropic requires a default
|
||||||
|
stop_sequences: Optional[list] = None,
|
||||||
|
temperature: Optional[int] = None,
|
||||||
|
top_p: Optional[int] = None,
|
||||||
|
top_k: Optional[int] = None,
|
||||||
|
metadata: Optional[dict] = None,
|
||||||
|
) -> None:
|
||||||
|
locals_ = locals()
|
||||||
|
for key, value in locals_.items():
|
||||||
|
if key != "self" and value is not None:
|
||||||
|
setattr(self.__class__, key, value)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_config(cls):
|
||||||
|
return {
|
||||||
|
k: v
|
||||||
|
for k, v in cls.__dict__.items()
|
||||||
|
if not k.startswith("__")
|
||||||
|
and not isinstance(
|
||||||
|
v,
|
||||||
|
(
|
||||||
|
types.FunctionType,
|
||||||
|
types.BuiltinFunctionType,
|
||||||
|
classmethod,
|
||||||
|
staticmethod,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
and v is not None
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# makes headers for API call
|
||||||
|
def validate_environment(api_key, user_headers):
|
||||||
|
if api_key is None:
|
||||||
|
raise ValueError(
|
||||||
|
"Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params"
|
||||||
|
)
|
||||||
|
headers = {
|
||||||
|
"accept": "application/json",
|
||||||
|
"anthropic-version": "2023-06-01",
|
||||||
|
"content-type": "application/json",
|
||||||
|
"x-api-key": api_key,
|
||||||
|
}
|
||||||
|
if user_headers is not None and isinstance(user_headers, dict):
|
||||||
|
headers = {**headers, **user_headers}
|
||||||
|
return headers
|
||||||
|
|
||||||
|
|
||||||
|
def completion(
|
||||||
|
model: str,
|
||||||
|
messages: list,
|
||||||
|
api_base: str,
|
||||||
|
custom_prompt_dict: dict,
|
||||||
|
model_response: ModelResponse,
|
||||||
|
print_verbose: Callable,
|
||||||
|
encoding,
|
||||||
|
api_key,
|
||||||
|
logging_obj,
|
||||||
|
optional_params=None,
|
||||||
|
litellm_params=None,
|
||||||
|
logger_fn=None,
|
||||||
|
headers={},
|
||||||
|
):
|
||||||
|
headers = validate_environment(api_key, headers)
|
||||||
|
if model in custom_prompt_dict:
|
||||||
|
# check if the model has a registered custom prompt
|
||||||
|
model_prompt_details = custom_prompt_dict[model]
|
||||||
|
prompt = custom_prompt(
|
||||||
|
role_dict=model_prompt_details["roles"],
|
||||||
|
initial_prompt_value=model_prompt_details["initial_prompt_value"],
|
||||||
|
final_prompt_value=model_prompt_details["final_prompt_value"],
|
||||||
|
messages=messages,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
prompt = prompt_factory(
|
||||||
|
model=model, messages=messages, custom_llm_provider="anthropic"
|
||||||
|
)
|
||||||
|
|
||||||
|
## Load Config
|
||||||
|
config = litellm.AnthropicTextConfig.get_config()
|
||||||
|
for k, v in config.items():
|
||||||
|
if (
|
||||||
|
k not in optional_params
|
||||||
|
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||||
|
optional_params[k] = v
|
||||||
|
|
||||||
|
data = {
|
||||||
|
"model": model,
|
||||||
|
"prompt": prompt,
|
||||||
|
**optional_params,
|
||||||
|
}
|
||||||
|
|
||||||
|
## LOGGING
|
||||||
|
logging_obj.pre_call(
|
||||||
|
input=prompt,
|
||||||
|
api_key=api_key,
|
||||||
|
additional_args={
|
||||||
|
"complete_input_dict": data,
|
||||||
|
"api_base": api_base,
|
||||||
|
"headers": headers,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
## COMPLETION CALL
|
||||||
|
if "stream" in optional_params and optional_params["stream"] == True:
|
||||||
|
response = requests.post(
|
||||||
|
api_base,
|
||||||
|
headers=headers,
|
||||||
|
data=json.dumps(data),
|
||||||
|
stream=optional_params["stream"],
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
raise AnthropicError(
|
||||||
|
status_code=response.status_code, message=response.text
|
||||||
|
)
|
||||||
|
|
||||||
|
return response.iter_lines()
|
||||||
|
else:
|
||||||
|
response = requests.post(api_base, headers=headers, data=json.dumps(data))
|
||||||
|
if response.status_code != 200:
|
||||||
|
raise AnthropicError(
|
||||||
|
status_code=response.status_code, message=response.text
|
||||||
|
)
|
||||||
|
|
||||||
|
## LOGGING
|
||||||
|
logging_obj.post_call(
|
||||||
|
input=prompt,
|
||||||
|
api_key=api_key,
|
||||||
|
original_response=response.text,
|
||||||
|
additional_args={"complete_input_dict": data},
|
||||||
|
)
|
||||||
|
print_verbose(f"raw model_response: {response.text}")
|
||||||
|
## RESPONSE OBJECT
|
||||||
|
try:
|
||||||
|
completion_response = response.json()
|
||||||
|
except:
|
||||||
|
raise AnthropicError(
|
||||||
|
message=response.text, status_code=response.status_code
|
||||||
|
)
|
||||||
|
if "error" in completion_response:
|
||||||
|
raise AnthropicError(
|
||||||
|
message=str(completion_response["error"]),
|
||||||
|
status_code=response.status_code,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if len(completion_response["completion"]) > 0:
|
||||||
|
model_response["choices"][0]["message"]["content"] = (
|
||||||
|
completion_response["completion"]
|
||||||
|
)
|
||||||
|
model_response.choices[0].finish_reason = completion_response["stop_reason"]
|
||||||
|
|
||||||
|
## CALCULATING USAGE
|
||||||
|
prompt_tokens = len(
|
||||||
|
encoding.encode(prompt)
|
||||||
|
) ##[TODO] use the anthropic tokenizer here
|
||||||
|
completion_tokens = len(
|
||||||
|
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||||
|
) ##[TODO] use the anthropic tokenizer here
|
||||||
|
|
||||||
|
model_response["created"] = int(time.time())
|
||||||
|
model_response["model"] = model
|
||||||
|
usage = Usage(
|
||||||
|
prompt_tokens=prompt_tokens,
|
||||||
|
completion_tokens=completion_tokens,
|
||||||
|
total_tokens=prompt_tokens + completion_tokens,
|
||||||
|
)
|
||||||
|
model_response.usage = usage
|
||||||
|
return model_response
|
||||||
|
|
||||||
|
|
||||||
|
def embedding():
|
||||||
|
# logic for parsing in - calling - parsing out model embedding calls
|
||||||
|
pass
|
|
@ -7,13 +7,15 @@ from litellm.utils import (
|
||||||
Message,
|
Message,
|
||||||
CustomStreamWrapper,
|
CustomStreamWrapper,
|
||||||
convert_to_model_response_object,
|
convert_to_model_response_object,
|
||||||
|
TranscriptionResponse,
|
||||||
)
|
)
|
||||||
from typing import Callable, Optional
|
from typing import Callable, Optional, BinaryIO
|
||||||
from litellm import OpenAIConfig
|
from litellm import OpenAIConfig
|
||||||
import litellm, json
|
import litellm, json
|
||||||
import httpx
|
import httpx
|
||||||
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
|
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
|
||||||
from openai import AzureOpenAI, AsyncAzureOpenAI
|
from openai import AzureOpenAI, AsyncAzureOpenAI
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
|
||||||
class AzureOpenAIError(Exception):
|
class AzureOpenAIError(Exception):
|
||||||
|
@ -270,6 +272,14 @@ class AzureChatCompletion(BaseLLM):
|
||||||
azure_client = AzureOpenAI(**azure_client_params)
|
azure_client = AzureOpenAI(**azure_client_params)
|
||||||
else:
|
else:
|
||||||
azure_client = client
|
azure_client = client
|
||||||
|
if api_version is not None and isinstance(
|
||||||
|
azure_client._custom_query, dict
|
||||||
|
):
|
||||||
|
# set api_version to version passed by user
|
||||||
|
azure_client._custom_query.setdefault(
|
||||||
|
"api-version", api_version
|
||||||
|
)
|
||||||
|
|
||||||
response = azure_client.chat.completions.create(**data, timeout=timeout) # type: ignore
|
response = azure_client.chat.completions.create(**data, timeout=timeout) # type: ignore
|
||||||
stringified_response = response.model_dump()
|
stringified_response = response.model_dump()
|
||||||
## LOGGING
|
## LOGGING
|
||||||
|
@ -333,10 +343,17 @@ class AzureChatCompletion(BaseLLM):
|
||||||
azure_client_params["api_key"] = api_key
|
azure_client_params["api_key"] = api_key
|
||||||
elif azure_ad_token is not None:
|
elif azure_ad_token is not None:
|
||||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||||
|
|
||||||
|
# setting Azure client
|
||||||
if client is None:
|
if client is None:
|
||||||
azure_client = AsyncAzureOpenAI(**azure_client_params)
|
azure_client = AsyncAzureOpenAI(**azure_client_params)
|
||||||
else:
|
else:
|
||||||
azure_client = client
|
azure_client = client
|
||||||
|
if api_version is not None and isinstance(
|
||||||
|
azure_client._custom_query, dict
|
||||||
|
):
|
||||||
|
# set api_version to version passed by user
|
||||||
|
azure_client._custom_query.setdefault("api-version", api_version)
|
||||||
## LOGGING
|
## LOGGING
|
||||||
logging_obj.pre_call(
|
logging_obj.pre_call(
|
||||||
input=data["messages"],
|
input=data["messages"],
|
||||||
|
@ -401,6 +418,9 @@ class AzureChatCompletion(BaseLLM):
|
||||||
azure_client = AzureOpenAI(**azure_client_params)
|
azure_client = AzureOpenAI(**azure_client_params)
|
||||||
else:
|
else:
|
||||||
azure_client = client
|
azure_client = client
|
||||||
|
if api_version is not None and isinstance(azure_client._custom_query, dict):
|
||||||
|
# set api_version to version passed by user
|
||||||
|
azure_client._custom_query.setdefault("api-version", api_version)
|
||||||
## LOGGING
|
## LOGGING
|
||||||
logging_obj.pre_call(
|
logging_obj.pre_call(
|
||||||
input=data["messages"],
|
input=data["messages"],
|
||||||
|
@ -454,6 +474,11 @@ class AzureChatCompletion(BaseLLM):
|
||||||
azure_client = AsyncAzureOpenAI(**azure_client_params)
|
azure_client = AsyncAzureOpenAI(**azure_client_params)
|
||||||
else:
|
else:
|
||||||
azure_client = client
|
azure_client = client
|
||||||
|
if api_version is not None and isinstance(
|
||||||
|
azure_client._custom_query, dict
|
||||||
|
):
|
||||||
|
# set api_version to version passed by user
|
||||||
|
azure_client._custom_query.setdefault("api-version", api_version)
|
||||||
## LOGGING
|
## LOGGING
|
||||||
logging_obj.pre_call(
|
logging_obj.pre_call(
|
||||||
input=data["messages"],
|
input=data["messages"],
|
||||||
|
@ -757,6 +782,158 @@ class AzureChatCompletion(BaseLLM):
|
||||||
else:
|
else:
|
||||||
raise AzureOpenAIError(status_code=500, message=str(e))
|
raise AzureOpenAIError(status_code=500, message=str(e))
|
||||||
|
|
||||||
|
def audio_transcriptions(
|
||||||
|
self,
|
||||||
|
model: str,
|
||||||
|
audio_file: BinaryIO,
|
||||||
|
optional_params: dict,
|
||||||
|
model_response: TranscriptionResponse,
|
||||||
|
timeout: float,
|
||||||
|
api_key: Optional[str] = None,
|
||||||
|
api_base: Optional[str] = None,
|
||||||
|
api_version: Optional[str] = None,
|
||||||
|
client=None,
|
||||||
|
azure_ad_token: Optional[str] = None,
|
||||||
|
logging_obj=None,
|
||||||
|
atranscription: bool = False,
|
||||||
|
):
|
||||||
|
data = {"model": model, "file": audio_file, **optional_params}
|
||||||
|
|
||||||
|
# init AzureOpenAI Client
|
||||||
|
azure_client_params = {
|
||||||
|
"api_version": api_version,
|
||||||
|
"azure_endpoint": api_base,
|
||||||
|
"azure_deployment": model,
|
||||||
|
"timeout": timeout,
|
||||||
|
}
|
||||||
|
|
||||||
|
max_retries = optional_params.pop("max_retries", None)
|
||||||
|
|
||||||
|
azure_client_params = select_azure_base_url_or_endpoint(
|
||||||
|
azure_client_params=azure_client_params
|
||||||
|
)
|
||||||
|
if api_key is not None:
|
||||||
|
azure_client_params["api_key"] = api_key
|
||||||
|
elif azure_ad_token is not None:
|
||||||
|
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||||
|
|
||||||
|
if max_retries is not None:
|
||||||
|
             azure_client_params["max_retries"] = max_retries
+
+        if atranscription == True:
+            return self.async_audio_transcriptions(
+                audio_file=audio_file,
+                data=data,
+                model_response=model_response,
+                timeout=timeout,
+                api_key=api_key,
+                api_base=api_base,
+                client=client,
+                azure_client_params=azure_client_params,
+                max_retries=max_retries,
+                logging_obj=logging_obj,
+            )
+        if client is None:
+            azure_client = AzureOpenAI(http_client=litellm.client_session, **azure_client_params)  # type: ignore
+        else:
+            azure_client = client
+
+        ## LOGGING
+        logging_obj.pre_call(
+            input=f"audio_file_{uuid.uuid4()}",
+            api_key=azure_client.api_key,
+            additional_args={
+                "headers": {"Authorization": f"Bearer {azure_client.api_key}"},
+                "api_base": azure_client._base_url._uri_reference,
+                "atranscription": True,
+                "complete_input_dict": data,
+            },
+        )
+
+        response = azure_client.audio.transcriptions.create(
+            **data, timeout=timeout  # type: ignore
+        )
+        stringified_response = response.model_dump()
+        ## LOGGING
+        logging_obj.post_call(
+            input=audio_file.name,
+            api_key=api_key,
+            additional_args={"complete_input_dict": data},
+            original_response=stringified_response,
+        )
+        hidden_params = {"model": "whisper-1", "custom_llm_provider": "azure"}
+        final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
+        return final_response
+
+    async def async_audio_transcriptions(
+        self,
+        audio_file: BinaryIO,
+        data: dict,
+        model_response: TranscriptionResponse,
+        timeout: float,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+        client=None,
+        azure_client_params=None,
+        max_retries=None,
+        logging_obj=None,
+    ):
+        response = None
+        try:
+            if client is None:
+                async_azure_client = AsyncAzureOpenAI(
+                    **azure_client_params,
+                    http_client=litellm.aclient_session,
+                )
+            else:
+                async_azure_client = client
+
+            ## LOGGING
+            logging_obj.pre_call(
+                input=f"audio_file_{uuid.uuid4()}",
+                api_key=async_azure_client.api_key,
+                additional_args={
+                    "headers": {
+                        "Authorization": f"Bearer {async_azure_client.api_key}"
+                    },
+                    "api_base": async_azure_client._base_url._uri_reference,
+                    "atranscription": True,
+                    "complete_input_dict": data,
+                },
+            )
+
+            response = await async_azure_client.audio.transcriptions.create(
+                **data, timeout=timeout
+            )  # type: ignore
+
+            stringified_response = response.model_dump()
+
+            ## LOGGING
+            logging_obj.post_call(
+                input=audio_file.name,
+                api_key=api_key,
+                additional_args={
+                    "headers": {
+                        "Authorization": f"Bearer {async_azure_client.api_key}"
+                    },
+                    "api_base": async_azure_client._base_url._uri_reference,
+                    "atranscription": True,
+                    "complete_input_dict": data,
+                },
+                original_response=stringified_response,
+            )
+            hidden_params = {"model": "whisper-1", "custom_llm_provider": "azure"}
+            response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
+            return response
+        except Exception as e:
+            ## LOGGING
+            logging_obj.post_call(
+                input=input,
+                api_key=api_key,
+                original_response=str(e),
+            )
+            raise e
+
     async def ahealth_check(
         self,
         model: Optional[str],
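The sync `audio_transcriptions` method above returns the coroutine from `async_audio_transcriptions` when `atranscription == True`, so the caller decides whether to await it. A minimal, hedged sketch of the two call paths as exposed by the `litellm.transcription` / `litellm.atranscription` entry points added later in this diff (model alias and file name are illustrative only):

    # hypothetical illustration, not part of the diff
    import litellm

    def sync_path(audio_file):
        # atranscription defaults to False -> blocking call, TranscriptionResponse returned
        return litellm.transcription(model="azure/whisper-1", file=audio_file)

    async def async_path(audio_file):
        # atranscription=True -> the azure handler hands back a coroutine that gets awaited
        return await litellm.atranscription(model="azure/whisper-1", file=audio_file)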
@@ -1,11 +1,17 @@
 import json, copy, types
 import os
 from enum import Enum
-import time
+import time, uuid
 from typing import Callable, Optional, Any, Union, List
 import litellm
 from litellm.utils import ModelResponse, get_secret, Usage, ImageResponse
-from .prompt_templates.factory import prompt_factory, custom_prompt
+from .prompt_templates.factory import (
+    prompt_factory,
+    custom_prompt,
+    construct_tool_use_system_prompt,
+    extract_between_tags,
+    parse_xml_params,
+)
 import httpx


@@ -70,6 +76,61 @@ class AmazonTitanConfig:
         }


+class AmazonAnthropicClaude3Config:
+    """
+    Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=claude
+
+    Supported Params for the Amazon / Anthropic Claude 3 models:
+
+    - `max_tokens` (integer) max tokens,
+    - `anthropic_version` (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
+    """
+
+    max_tokens: Optional[int] = litellm.max_tokens
+    anthropic_version: Optional[str] = "bedrock-2023-05-31"
+
+    def __init__(
+        self,
+        max_tokens: Optional[int] = None,
+        anthropic_version: Optional[str] = None,
+    ) -> None:
+        locals_ = locals()
+        for key, value in locals_.items():
+            if key != "self" and value is not None:
+                setattr(self.__class__, key, value)
+
+    @classmethod
+    def get_config(cls):
+        return {
+            k: v
+            for k, v in cls.__dict__.items()
+            if not k.startswith("__")
+            and not isinstance(
+                v,
+                (
+                    types.FunctionType,
+                    types.BuiltinFunctionType,
+                    classmethod,
+                    staticmethod,
+                ),
+            )
+            and v is not None
+        }
+
+    def get_supported_openai_params(self):
+        return ["max_tokens", "tools", "tool_choice", "stream"]
+
+    def map_openai_params(self, non_default_params: dict, optional_params: dict):
+        for param, value in non_default_params.items():
+            if param == "max_tokens":
+                optional_params["max_tokens"] = value
+            if param == "tools":
+                optional_params["tools"] = value
+            if param == "stream":
+                optional_params["stream"] = value
+        return optional_params
+
+
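A quick sketch of how the new config class is meant to be used when translating OpenAI-style arguments into a Bedrock Claude 3 payload (the argument values below are invented for illustration):

    # hypothetical usage of AmazonAnthropicClaude3Config added above
    import litellm

    optional_params = litellm.AmazonAnthropicClaude3Config().map_openai_params(
        non_default_params={"max_tokens": 256, "stream": True},
        optional_params={},
    )
    # optional_params -> {"max_tokens": 256, "stream": True}
    # get_config() later backfills defaults such as anthropic_version="bedrock-2023-05-31"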
 class AmazonAnthropicConfig:
     """
     Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=claude

@@ -123,6 +184,25 @@ class AmazonAnthropicConfig:
             and v is not None
         }

+    def get_supported_openai_params(
+        self,
+    ):
+        return ["max_tokens", "temperature", "stop", "top_p", "stream"]
+
+    def map_openai_params(self, non_default_params: dict, optional_params: dict):
+        for param, value in non_default_params.items():
+            if param == "max_tokens":
+                optional_params["max_tokens_to_sample"] = value
+            if param == "temperature":
+                optional_params["temperature"] = value
+            if param == "top_p":
+                optional_params["top_p"] = value
+            if param == "stop":
+                optional_params["stop_sequences"] = value
+            if param == "stream" and value == True:
+                optional_params["stream"] = value
+        return optional_params
+
+
 class AmazonCohereConfig:
     """

@@ -282,6 +362,56 @@ class AmazonLlamaConfig:
         }


+class AmazonMistralConfig:
+    """
+    Reference: https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-mistral.html
+    Supported Params for the Amazon / Mistral models:
+
+    - `max_tokens` (integer) max tokens,
+    - `temperature` (float) temperature for model,
+    - `top_p` (float) top p for model
+    - `stop` [string] A list of stop sequences that if generated by the model, stops the model from generating further output.
+    - `top_k` (float) top k for model
+    """
+
+    max_tokens: Optional[int] = None
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    top_k: Optional[float] = None
+    stop: Optional[List[str]] = None
+
+    def __init__(
+        self,
+        max_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[int] = None,
+        top_k: Optional[float] = None,
+        stop: Optional[List[str]] = None,
+    ) -> None:
+        locals_ = locals()
+        for key, value in locals_.items():
+            if key != "self" and value is not None:
+                setattr(self.__class__, key, value)
+
+    @classmethod
+    def get_config(cls):
+        return {
+            k: v
+            for k, v in cls.__dict__.items()
+            if not k.startswith("__")
+            and not isinstance(
+                v,
+                (
+                    types.FunctionType,
+                    types.BuiltinFunctionType,
+                    classmethod,
+                    staticmethod,
+                ),
+            )
+            and v is not None
+        }
+
+
 class AmazonStabilityConfig:
     """
     Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=stability.stable-diffusion-xl-v0

@@ -492,6 +622,10 @@ def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
         prompt = prompt_factory(
             model=model, messages=messages, custom_llm_provider="bedrock"
         )
+    elif provider == "mistral":
+        prompt = prompt_factory(
+            model=model, messages=messages, custom_llm_provider="bedrock"
+        )
     else:
         prompt = ""
         for message in messages:

@@ -568,6 +702,39 @@ def completion(
     inference_params = copy.deepcopy(optional_params)
     stream = inference_params.pop("stream", False)
     if provider == "anthropic":
+        if model.startswith("anthropic.claude-3"):
+            # Separate system prompt from rest of message
+            system_prompt_idx: Optional[int] = None
+            for idx, message in enumerate(messages):
+                if message["role"] == "system":
+                    inference_params["system"] = message["content"]
+                    system_prompt_idx = idx
+                    break
+            if system_prompt_idx is not None:
+                messages.pop(system_prompt_idx)
+            # Format rest of message according to anthropic guidelines
+            messages = prompt_factory(
+                model=model, messages=messages, custom_llm_provider="anthropic"
+            )
+            ## LOAD CONFIG
+            config = litellm.AmazonAnthropicClaude3Config.get_config()
+            for k, v in config.items():
+                if (
+                    k not in inference_params
+                ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
+                    inference_params[k] = v
+            ## Handle Tool Calling
+            if "tools" in inference_params:
+                tool_calling_system_prompt = construct_tool_use_system_prompt(
+                    tools=inference_params["tools"]
+                )
+                inference_params["system"] = (
+                    inference_params.get("system", "\n")
+                    + tool_calling_system_prompt
+                )  # add the anthropic tool calling prompt to the system prompt
+                inference_params.pop("tools")
+            data = json.dumps({"messages": messages, **inference_params})
+        else:
             ## LOAD CONFIG
             config = litellm.AmazonAnthropicConfig.get_config()
             for k, v in config.items():

@@ -595,9 +762,9 @@ def completion(
         ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
             inference_params[k] = v
         if optional_params.get("stream", False) == True:
-            inference_params[
-                "stream"
-            ] = True  # cohere requires stream = True in inference params
+            inference_params["stream"] = (
+                True  # cohere requires stream = True in inference params
+            )
         data = json.dumps({"prompt": prompt, **inference_params})
     elif provider == "meta":
         ## LOAD CONFIG

@@ -623,7 +790,16 @@ def completion(
                 "textGenerationConfig": inference_params,
             }
         )
+    elif provider == "mistral":
+        ## LOAD CONFIG
+        config = litellm.AmazonMistralConfig.get_config()
+        for k, v in config.items():
+            if (
+                k not in inference_params
+            ):  # completion(top_k=3) > amazon_config(top_k=3) <- allows for dynamic variables to be passed in
+                inference_params[k] = v
+
+        data = json.dumps({"prompt": prompt, **inference_params})
     else:
         data = json.dumps({})

@@ -723,12 +899,49 @@ def completion(
         if provider == "ai21":
             outputText = response_body.get("completions")[0].get("data").get("text")
         elif provider == "anthropic":
+            if model.startswith("anthropic.claude-3"):
+                outputText = response_body.get("content")[0].get("text", None)
+                if "<invoke>" in outputText:  # OUTPUT PARSE FUNCTION CALL
+                    function_name = extract_between_tags("tool_name", outputText)[0]
+                    function_arguments_str = extract_between_tags("invoke", outputText)[
+                        0
+                    ].strip()
+                    function_arguments_str = (
+                        f"<invoke>{function_arguments_str}</invoke>"
+                    )
+                    function_arguments = parse_xml_params(function_arguments_str)
+                    _message = litellm.Message(
+                        tool_calls=[
+                            {
+                                "id": f"call_{uuid.uuid4()}",
+                                "type": "function",
+                                "function": {
+                                    "name": function_name,
+                                    "arguments": json.dumps(function_arguments),
+                                },
+                            }
+                        ],
+                        content=None,
+                    )
+                    model_response.choices[0].message = _message  # type: ignore
+                model_response["finish_reason"] = response_body["stop_reason"]
+                _usage = litellm.Usage(
+                    prompt_tokens=response_body["usage"]["input_tokens"],
+                    completion_tokens=response_body["usage"]["output_tokens"],
+                    total_tokens=response_body["usage"]["input_tokens"]
+                    + response_body["usage"]["output_tokens"],
+                )
+                model_response.usage = _usage
+            else:
                 outputText = response_body["completion"]
                 model_response["finish_reason"] = response_body["stop_reason"]
         elif provider == "cohere":
             outputText = response_body["generations"][0]["text"]
         elif provider == "meta":
             outputText = response_body["generation"]
+        elif provider == "mistral":
+            outputText = response_body["outputs"][0]["text"]
+            model_response["finish_reason"] = response_body["outputs"][0]["stop_reason"]
         else:  # amazon titan
             outputText = response_body.get("results")[0].get("outputText")

@@ -740,8 +953,19 @@ def completion(
             )
         else:
             try:
-                if len(outputText) > 0:
+                if (
+                    len(outputText) > 0
+                    and hasattr(model_response.choices[0], "message")
+                    and getattr(model_response.choices[0].message, "tool_calls", None)
+                    is None
+                ):
                     model_response["choices"][0]["message"]["content"] = outputText
+                elif (
+                    hasattr(model_response.choices[0], "message")
+                    and getattr(model_response.choices[0].message, "tool_calls", None)
+                    is not None
+                ):
+                    pass
                 else:
                     raise Exception()
             except:

@@ -751,6 +975,7 @@ def completion(
             )

     ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
+    if getattr(model_response.usage, "total_tokens", None) is None:
         prompt_tokens = response_metadata.get(
             "x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
         )

@@ -762,15 +987,16 @@ def completion(
                 )
             ),
         )
-    model_response["created"] = int(time.time())
-    model_response["model"] = model
         usage = Usage(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
             total_tokens=prompt_tokens + completion_tokens,
         )
         model_response.usage = usage
+
+    model_response["created"] = int(time.time())
+    model_response["model"] = model
+
     model_response._hidden_params["region_name"] = client.meta.region_name
     print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
     return model_response
@@ -634,8 +634,43 @@ class Huggingface(BaseLLM):
                     status_code=r.status_code,
                     message=str(text),
                 )
+            """
+            Check first chunk for error message.
+            If error message, raise error.
+            If not - add back to stream
+            """
+            # Async iterator over the lines in the response body
+            response_iterator = r.aiter_lines()
+
+            # Attempt to get the first line/chunk from the response
+            try:
+                first_chunk = await response_iterator.__anext__()
+            except StopAsyncIteration:
+                # Handle the case where there are no lines to read (empty response)
+                first_chunk = ""
+
+            # Check the first chunk for an error message
+            if (
+                "error" in first_chunk.lower()
+            ):  # Adjust this condition based on how error messages are structured
+                raise HuggingfaceError(
+                    status_code=400,
+                    message=first_chunk,
+                )
+
+            # Create a new async generator that begins with the first_chunk and includes the remaining items
+            async def custom_stream_with_first_chunk():
+                yield first_chunk  # Yield back the first chunk
+                async for (
+                    chunk
+                ) in response_iterator:  # Continue yielding the rest of the chunks
+                    yield chunk
+
+            # Creating a new completion stream that starts with the first chunk
+            completion_stream = custom_stream_with_first_chunk()
+
             streamwrapper = CustomStreamWrapper(
-                completion_stream=r.aiter_lines(),
+                completion_stream=completion_stream,
                 model=model,
                 custom_llm_provider="huggingface",
                 logging_obj=logging_obj,
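The "peek at the first chunk, then re-yield it" technique above is a generic async-iterator pattern; a minimal standalone sketch of the same idea, with names invented for illustration:

    # hypothetical sketch of the first-chunk peek pattern used above
    async def peek_and_requeue(aiter):
        try:
            first = await aiter.__anext__()
        except StopAsyncIteration:
            return  # empty stream: nothing to yield
        if "error" in first.lower():
            raise RuntimeError(first)  # surface the provider error eagerly
        yield first  # put the peeked chunk back at the front of the stream
        async for item in aiter:
            yield item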
@@ -18,7 +18,7 @@ class OllamaError(Exception):
         )  # Call the base class constructor with the parameters it needs


-class OllamaConfig:
+class OllamaChatConfig:
     """
     Reference: https://github.com/jmorganca/ollama/blob/main/docs/api.md#parameters

@@ -108,6 +108,7 @@ class OllamaConfig:
             k: v
             for k, v in cls.__dict__.items()
             if not k.startswith("__")
+            and k != "function_name"  # special param for function calling
             and not isinstance(
                 v,
                 (

@@ -120,6 +121,61 @@ class OllamaConfig:
             and v is not None
         }

+    def get_supported_openai_params(
+        self,
+    ):
+        return [
+            "max_tokens",
+            "stream",
+            "top_p",
+            "temperature",
+            "frequency_penalty",
+            "stop",
+            "tools",
+            "tool_choice",
+            "functions",
+        ]
+
+    def map_openai_params(self, non_default_params: dict, optional_params: dict):
+        for param, value in non_default_params.items():
+            if param == "max_tokens":
+                optional_params["num_predict"] = value
+            if param == "stream":
+                optional_params["stream"] = value
+            if param == "temperature":
+                optional_params["temperature"] = value
+            if param == "top_p":
+                optional_params["top_p"] = value
+            if param == "frequency_penalty":
+                optional_params["repeat_penalty"] = param
+            if param == "stop":
+                optional_params["stop"] = value
+            ### FUNCTION CALLING LOGIC ###
+            if param == "tools":
+                # ollama actually supports json output
+                optional_params["format"] = "json"
+                litellm.add_function_to_prompt = (
+                    True  # so that main.py adds the function call to the prompt
+                )
+                optional_params["functions_unsupported_model"] = value
+
+                if len(optional_params["functions_unsupported_model"]) == 1:
+                    optional_params["function_name"] = optional_params[
+                        "functions_unsupported_model"
+                    ][0]["function"]["name"]
+
+            if param == "functions":
+                # ollama actually supports json output
+                optional_params["format"] = "json"
+                litellm.add_function_to_prompt = (
+                    True  # so that main.py adds the function call to the prompt
+                )
+                optional_params["functions_unsupported_model"] = non_default_params.pop(
+                    "functions"
+                )
+        non_default_params.pop("tool_choice", None)  # causes ollama requests to hang
+        return optional_params
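For context, a short sketch of what the parameter mapping above produces when an OpenAI-style `tools` list is passed to an Ollama chat model (tool definition and values are illustrative only):

    # hypothetical illustration of OllamaChatConfig.map_openai_params
    import litellm

    tools = [{"type": "function", "function": {"name": "get_weather", "parameters": {}}}]
    optional_params = litellm.OllamaChatConfig().map_openai_params(
        non_default_params={"max_tokens": 128, "tools": tools},
        optional_params={},
    )
    # -> {"num_predict": 128, "format": "json",
    #     "functions_unsupported_model": tools, "function_name": "get_weather"}
    # (it also flips litellm.add_function_to_prompt so main.py injects the tool into the prompt)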
 # ollama implementation
 def get_ollama_response(

@@ -138,7 +194,7 @@ def get_ollama_response(
     url = f"{api_base}/api/chat"

     ## Load Config
-    config = litellm.OllamaConfig.get_config()
+    config = litellm.OllamaChatConfig.get_config()
     for k, v in config.items():
         if (
             k not in optional_params

@@ -147,6 +203,7 @@ def get_ollama_response(

     stream = optional_params.pop("stream", False)
     format = optional_params.pop("format", None)
+    function_name = optional_params.pop("function_name", None)

     for m in messages:
         if "role" in m and m["role"] == "tool":

@@ -187,6 +244,7 @@ def get_ollama_response(
             model_response=model_response,
             encoding=encoding,
             logging_obj=logging_obj,
+            function_name=function_name,
         )
         return response
     elif stream == True:

@@ -290,7 +348,9 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
         traceback.print_exc()


-async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
+async def ollama_acompletion(
+    url, data, model_response, encoding, logging_obj, function_name
+):
     data["stream"] = False
     try:
         timeout = aiohttp.ClientTimeout(total=litellm.request_timeout)  # 10 minutes

@@ -324,7 +384,7 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
                         "id": f"call_{str(uuid.uuid4())}",
                         "function": {
                             "arguments": response_json["message"]["content"],
-                            "name": "",
+                            "name": function_name or "",
                         },
                         "type": "function",
                     }
@@ -1,4 +1,4 @@
-from typing import Optional, Union, Any
+from typing import Optional, Union, Any, BinaryIO
 import types, time, json, traceback
 import httpx
 from .base import BaseLLM

@@ -9,6 +9,7 @@ from litellm.utils import (
     CustomStreamWrapper,
     convert_to_model_response_object,
     Usage,
+    TranscriptionResponse,
 )
 from typing import Callable, Optional
 import aiohttp, requests

@@ -237,6 +238,8 @@ class OpenAIChatCompletion(BaseLLM):
                     status_code=422, message=f"Timeout needs to be a float"
                 )

+            if custom_llm_provider != "openai":
+                # process all OpenAI compatible provider logic here
                 if custom_llm_provider == "mistral":
                     # check if message content passed in as list, and not string
                     messages = prompt_factory(

@@ -244,7 +247,13 @@ class OpenAIChatCompletion(BaseLLM):
                         messages=messages,
                         custom_llm_provider=custom_llm_provider,
                     )
+                if custom_llm_provider == "perplexity" and messages is not None:
+                    # check if messages.name is passed + supported, if not supported remove
+                    messages = prompt_factory(
+                        model=model,
+                        messages=messages,
+                        custom_llm_provider=custom_llm_provider,
+                    )
             for _ in range(
                 2
             ):  # if call fails due to alternating messages, retry with reformatted message

@@ -744,6 +753,7 @@ class OpenAIChatCompletion(BaseLLM):
             # return response
             return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation")  # type: ignore
         except OpenAIError as e:
+
             exception_mapping_worked = True
             ## LOGGING
             logging_obj.post_call(

@@ -766,6 +776,105 @@ class OpenAIChatCompletion(BaseLLM):
         else:
             raise OpenAIError(status_code=500, message=str(e))

+    def audio_transcriptions(
+        self,
+        model: str,
+        audio_file: BinaryIO,
+        optional_params: dict,
+        model_response: TranscriptionResponse,
+        timeout: float,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+        client=None,
+        max_retries=None,
+        logging_obj=None,
+        atranscription: bool = False,
+    ):
+        data = {"model": model, "file": audio_file, **optional_params}
+        if atranscription == True:
+            return self.async_audio_transcriptions(
+                audio_file=audio_file,
+                data=data,
+                model_response=model_response,
+                timeout=timeout,
+                api_key=api_key,
+                api_base=api_base,
+                client=client,
+                max_retries=max_retries,
+                logging_obj=logging_obj,
+            )
+        if client is None:
+            openai_client = OpenAI(
+                api_key=api_key,
+                base_url=api_base,
+                http_client=litellm.client_session,
+                timeout=timeout,
+                max_retries=max_retries,
+            )
+        else:
+            openai_client = client
+        response = openai_client.audio.transcriptions.create(
+            **data, timeout=timeout  # type: ignore
+        )
+
+        stringified_response = response.model_dump()
+        ## LOGGING
+        logging_obj.post_call(
+            input=audio_file.name,
+            api_key=api_key,
+            additional_args={"complete_input_dict": data},
+            original_response=stringified_response,
+        )
+        hidden_params = {"model": "whisper-1", "custom_llm_provider": "openai"}
+        final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
+        return final_response
+
+    async def async_audio_transcriptions(
+        self,
+        audio_file: BinaryIO,
+        data: dict,
+        model_response: TranscriptionResponse,
+        timeout: float,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+        client=None,
+        max_retries=None,
+        logging_obj=None,
+    ):
+        response = None
+        try:
+            if client is None:
+                openai_aclient = AsyncOpenAI(
+                    api_key=api_key,
+                    base_url=api_base,
+                    http_client=litellm.aclient_session,
+                    timeout=timeout,
+                    max_retries=max_retries,
+                )
+            else:
+                openai_aclient = client
+            response = await openai_aclient.audio.transcriptions.create(
+                **data, timeout=timeout
+            )  # type: ignore
+            stringified_response = response.model_dump()
+            ## LOGGING
+            logging_obj.post_call(
+                input=audio_file.name,
+                api_key=api_key,
+                additional_args={"complete_input_dict": data},
+                original_response=stringified_response,
+            )
+            hidden_params = {"model": "whisper-1", "custom_llm_provider": "openai"}
+            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
+        except Exception as e:
+            ## LOGGING
+            logging_obj.post_call(
+                input=input,
+                api_key=api_key,
+                original_response=str(e),
+            )
+            raise e
+
     async def ahealth_check(
         self,
         model: Optional[str],
@@ -1,8 +1,10 @@
 from enum import Enum
 import requests, traceback
-import json
+import json, re, xml.etree.ElementTree as ET
 from jinja2 import Template, exceptions, Environment, meta
 from typing import Optional, Any
+import imghdr, base64
+from typing import List


 def default_pt(messages):

@@ -110,9 +112,9 @@ def mistral_instruct_pt(messages):
             "post_message": " [/INST]\n",
         },
         "user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
-        "assistant": {"pre_message": " ", "post_message": " "},
+        "assistant": {"pre_message": " ", "post_message": "</s> "},
     },
-    final_prompt_value="</s>",
+    final_prompt_value="",
     messages=messages,
 )
 return prompt

@@ -390,7 +392,7 @@ def format_prompt_togetherai(messages, prompt_format, chat_template):
     return prompt


-###
+### ANTHROPIC ###


 def anthropic_pt(

@@ -424,6 +426,232 @@ def anthropic_pt(
     return prompt


+def construct_format_parameters_prompt(parameters: dict):
+    parameter_str = "<parameter>\n"
+    for k, v in parameters.items():
+        parameter_str += f"<{k}>"
+        parameter_str += f"{v}"
+        parameter_str += f"</{k}>"
+    parameter_str += "\n</parameter>"
+    return parameter_str
+
+
+def construct_format_tool_for_claude_prompt(name, description, parameters):
+    constructed_prompt = (
+        "<tool_description>\n"
+        f"<tool_name>{name}</tool_name>\n"
+        "<description>\n"
+        f"{description}\n"
+        "</description>\n"
+        "<parameters>\n"
+        f"{construct_format_parameters_prompt(parameters)}\n"
+        "</parameters>\n"
+        "</tool_description>"
+    )
+    return constructed_prompt
+
+
+def construct_tool_use_system_prompt(
+    tools,
+):  # from https://github.com/anthropics/anthropic-cookbook/blob/main/function_calling/function_calling.ipynb
+    tool_str_list = []
+    for tool in tools:
+        tool_str = construct_format_tool_for_claude_prompt(
+            tool["function"]["name"],
+            tool["function"].get("description", ""),
+            tool["function"].get("parameters", {}),
+        )
+        tool_str_list.append(tool_str)
+    tool_use_system_prompt = (
+        "In this environment you have access to a set of tools you can use to answer the user's question.\n"
+        "\n"
+        "You may call them like this:\n"
+        "<function_calls>\n"
+        "<invoke>\n"
+        "<tool_name>$TOOL_NAME</tool_name>\n"
+        "<parameters>\n"
+        "<$PARAMETER_NAME>$PARAMETER_VALUE</$PARAMETER_NAME>\n"
+        "...\n"
+        "</parameters>\n"
+        "</invoke>\n"
+        "</function_calls>\n"
+        "\n"
+        "Here are the tools available:\n"
+        "<tools>\n" + "\n".join([tool_str for tool_str in tool_str_list]) + "\n</tools>"
+    )
+    return tool_use_system_prompt
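As a rough illustration (output abridged and formatting approximate, not a verbatim capture), the helper above turns an OpenAI-style tool definition into Claude's XML tool-use preamble:

    # hypothetical example of the system prompt built above
    tools = [{"type": "function", "function": {"name": "get_weather",
              "description": "Get weather", "parameters": {"location": "string"}}}]
    print(construct_tool_use_system_prompt(tools))
    # In this environment you have access to a set of tools ...
    # <tools>
    # <tool_description>
    # <tool_name>get_weather</tool_name>
    # ...
    # </tool_description>
    # </tools>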
+
+
+def convert_url_to_base64(url):
+    import requests
+    import base64
+
+    for _ in range(3):
+        try:
+            response = requests.get(url)
+            break
+        except:
+            pass
+    if response.status_code == 200:
+        image_bytes = response.content
+        base64_image = base64.b64encode(image_bytes).decode("utf-8")
+
+        img_type = url.split(".")[-1].lower()
+        if img_type == "jpg" or img_type == "jpeg":
+            img_type = "image/jpeg"
+        elif img_type == "png":
+            img_type = "image/png"
+        elif img_type == "gif":
+            img_type = "image/gif"
+        elif img_type == "webp":
+            img_type = "image/webp"
+        else:
+            raise Exception(
+                f"Error: Unsupported image format. Format={img_type}. Supported types = ['image/jpeg', 'image/png', 'image/gif', 'image/webp']"
+            )
+
+        return f"data:{img_type};base64,{base64_image}"
+    else:
+        raise Exception(f"Error: Unable to fetch image from URL. url={url}")
+
+
+def convert_to_anthropic_image_obj(openai_image_url: str):
+    """
+    Input:
+    "image_url": "data:image/jpeg;base64,{base64_image}",
+
+    Return:
+    "source": {
+      "type": "base64",
+      "media_type": "image/jpeg",
+      "data": {base64_image},
+    }
+    """
+    try:
+        if openai_image_url.startswith("http"):
+            openai_image_url = convert_url_to_base64(url=openai_image_url)
+        # Extract the base64 image data
+        base64_data = openai_image_url.split("data:image/")[1].split(";base64,")[1]
+
+        # Infer image format from the URL
+        image_format = openai_image_url.split("data:image/")[1].split(";base64,")[0]
+
+        return {
+            "type": "base64",
+            "media_type": f"image/{image_format}",
+            "data": base64_data,
+        }
+    except Exception as e:
+        if "Error: Unable to fetch image from URL" in str(e):
+            raise e
+        raise Exception(
+            """Image url not in expected format. Example Expected input - "image_url": "data:image/jpeg;base64,{base64_image}". Supported formats - ['image/jpeg', 'image/png', 'image/gif', 'image/webp'] """
+        )
+
+
+def anthropic_messages_pt(messages: list):
+    """
+    format messages for anthropic
+    1. Anthropic supports roles like "user" and "assistant", (here litellm translates system-> assistant)
+    2. The first message always needs to be of role "user"
+    3. Each message must alternate between "user" and "assistant" (this is not addressed as now by litellm)
+    4. final assistant content cannot end with trailing whitespace (anthropic raises an error otherwise)
+    5. System messages are a separate param to the Messages API (used for tool calling)
+    6. Ensure we only accept role, content. (message.name is not supported)
+    """
+    ## Ensure final assistant message has no trailing whitespace
+    last_assistant_message_idx: Optional[int] = None
+    # reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, add a blank 'user' or 'assistant' message to ensure compatibility
+    new_messages = []
+    if len(messages) == 1:
+        # check if the message is a user message
+        if messages[0]["role"] == "assistant":
+            new_messages.append({"role": "user", "content": ""})
+
+        # check if content is a list (vision)
+        if isinstance(messages[0]["content"], list):  # vision input
+            new_content = []
+            for m in messages[0]["content"]:
+                if m.get("type", "") == "image_url":
+                    new_content.append(
+                        {
+                            "type": "image",
+                            "source": convert_to_anthropic_image_obj(
+                                m["image_url"]["url"]
+                            ),
+                        }
+                    )
+                elif m.get("type", "") == "text":
+                    new_content.append({"type": "text", "text": m["text"]})
+            new_messages.append({"role": messages[0]["role"], "content": new_content})  # type: ignore
+        else:
+            new_messages.append(
+                {"role": messages[0]["role"], "content": messages[0]["content"]}
+            )
+
+        return new_messages
+
+    for i in range(len(messages) - 1):  # type: ignore
+        if i == 0 and messages[i]["role"] == "assistant":
+            new_messages.append({"role": "user", "content": ""})
+        if isinstance(messages[i]["content"], list):  # vision input
+            new_content = []
+            for m in messages[i]["content"]:
+                if m.get("type", "") == "image_url":
+                    new_content.append(
+                        {
+                            "type": "image",
+                            "source": convert_to_anthropic_image_obj(
+                                m["image_url"]["url"]
+                            ),
+                        }
+                    )
+                elif m.get("type", "") == "text":
+                    new_content.append({"type": "text", "content": m["text"]})
+            new_messages.append({"role": messages[i]["role"], "content": new_content})  # type: ignore
+        else:
+            new_messages.append(
+                {"role": messages[i]["role"], "content": messages[i]["content"]}
+            )
+
+        if messages[i]["role"] == messages[i + 1]["role"]:
+            if messages[i]["role"] == "user":
+                new_messages.append({"role": "assistant", "content": ""})
+            else:
+                new_messages.append({"role": "user", "content": ""})
+
+        if messages[i]["role"] == "assistant":
+            last_assistant_message_idx = i
+
+    new_messages.append(messages[-1])
+    if last_assistant_message_idx is not None:
+        new_messages[last_assistant_message_idx]["content"] = new_messages[
+            last_assistant_message_idx
+        ][
+            "content"
+        ].strip()  # no trailing whitespace for final assistant message
+
+    return new_messages
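A small illustration of the alternation fix that `anthropic_messages_pt` performs (input messages are hypothetical, not taken from the diff):

    # hypothetical usage of anthropic_messages_pt
    msgs = [
        {"role": "user", "content": "hi"},
        {"role": "user", "content": "are you there?"},  # two consecutive user turns
    ]
    print(anthropic_messages_pt(msgs))
    # [{'role': 'user', 'content': 'hi'},
    #  {'role': 'assistant', 'content': ''},   # blank turn inserted to keep roles alternating
    #  {'role': 'user', 'content': 'are you there?'}]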
+
+
+def extract_between_tags(tag: str, string: str, strip: bool = False) -> List[str]:
+    ext_list = re.findall(f"<{tag}>(.+?)</{tag}>", string, re.DOTALL)
+    if strip:
+        ext_list = [e.strip() for e in ext_list]
+    return ext_list
+
+
+def parse_xml_params(xml_content):
+    root = ET.fromstring(xml_content)
+    params = {}
+    for child in root.findall(".//parameters/*"):
+        params[child.tag] = child.text
+    return params
+
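These two helpers are what the Bedrock Claude 3 path earlier in this diff uses to turn the model's XML `<invoke>` block back into an OpenAI-style tool call. A hedged, self-contained example of that round trip (the sample XML string is invented for illustration):

    # hypothetical example of parsing a Claude-style tool invocation
    output = ("<function_calls><invoke><tool_name>get_weather</tool_name>"
              "<parameters><location>Paris</location></parameters></invoke></function_calls>")
    name = extract_between_tags("tool_name", output)[0]                          # "get_weather"
    invoke = f"<invoke>{extract_between_tags('invoke', output)[0].strip()}</invoke>"
    args = parse_xml_params(invoke)                                              # {"location": "Paris"}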
+
+###
+
+
 def amazon_titan_pt(
     messages: list,
 ):  # format - https://github.com/BerriAI/litellm/issues/1896

@@ -650,10 +878,9 @@ def prompt_factory(
     if custom_llm_provider == "ollama":
         return ollama_pt(model=model, messages=messages)
     elif custom_llm_provider == "anthropic":
-        if any(_ in model for _ in ["claude-2.1", "claude-v2:1"]):
-            return claude_2_1_pt(messages=messages)
-        else:
+        if model == "claude-instant-1" or model == "claude-2":
             return anthropic_pt(messages=messages)
+        return anthropic_messages_pt(messages=messages)
     elif custom_llm_provider == "together_ai":
         prompt_format, chat_template = get_model_info(token=api_key, model=model)
         return format_prompt_togetherai(

@@ -674,6 +901,12 @@ def prompt_factory(
             return claude_2_1_pt(messages=messages)
         else:
             return anthropic_pt(messages=messages)
+    elif "mistral." in model:
+        return mistral_instruct_pt(messages=messages)
+    elif custom_llm_provider == "perplexity":
+        for message in messages:
+            message.pop("name", None)
+        return messages
     try:
         if "meta-llama/llama-2" in model and "chat" in model:
             return llama_2_chat_pt(messages=messages)
@@ -104,7 +104,8 @@
         version_id = version_id.replace("deployments/", "")
         base_url = f"https://api.replicate.com/v1/deployments/{version_id}"
         print_verbose(f"Deployment base URL: {base_url}\n")
+    else:  # assume it's a model
+        base_url = f"https://api.replicate.com/v1/models/{version_id}"
     headers = {
         "Authorization": f"Token {api_token}",
         "Content-Type": "application/json",

@@ -306,9 +307,9 @@
     result, logs = handle_prediction_response(
         prediction_url, api_key, print_verbose
     )
-    model_response[
-        "ended"
-    ] = time.time()  # for pricing this must remain right after calling api
+    model_response["ended"] = (
+        time.time()
+    )  # for pricing this must remain right after calling api
     ## LOGGING
     logging_obj.post_call(
         input=prompt,
@@ -1047,6 +1047,7 @@ def embedding(
     vertex_project=None,
     vertex_location=None,
     aembedding=False,
+    print_verbose=None,
 ):
     # logic for parsing in - calling - parsing out model embedding calls
     try:

@@ -1062,7 +1063,13 @@ def embedding(

         ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
         try:
+            print_verbose(
+                f"VERTEX AI: vertex_project={vertex_project}; vertex_location={vertex_location}"
+            )
             creds, _ = google.auth.default(quota_project_id=vertex_project)
+            print_verbose(
+                f"VERTEX AI: creds={creds}; google application credentials: {os.getenv('GOOGLE_APPLICATION_CREDENTIALS')}"
+            )
             vertexai.init(
                 project=vertex_project, location=vertex_location, credentials=creds
             )
 189  litellm/main.py

@@ -8,10 +8,11 @@
 # Thank you ! We ❤️ you! - Krrish & Ishaan

 import os, openai, sys, json, inspect, uuid, datetime, threading
-from typing import Any, Literal, Union
+from typing import Any, Literal, Union, BinaryIO
 from functools import partial
 import dotenv, traceback, random, asyncio, time, contextvars
 from copy import deepcopy

 import httpx
 import litellm
 from ._logging import verbose_logger

@@ -39,6 +40,7 @@ from litellm.utils import (
 )
 from .llms import (
     anthropic,
+    anthropic_text,
     together_ai,
     ai21,
     sagemaker,

@@ -87,6 +89,7 @@ from litellm.utils import (
     read_config_args,
     Choices,
     Message,
+    TranscriptionResponse,
 )

 ####### ENVIRONMENT VARIABLES ###################

@@ -486,6 +489,8 @@ def completion(
     ### ASYNC CALLS ###
     acompletion = kwargs.get("acompletion", False)
     client = kwargs.get("client", None)
+    ### Admin Controls ###
+    no_log = kwargs.get("no-log", False)
     ######## end of unpacking kwargs ###########
     openai_params = [
         "functions",

@@ -561,7 +566,8 @@ def completion(
         "preset_cache_key",
         "caching_groups",
         "ttl",
-        "cache"
+        "cache",
+        "no-log",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {

@@ -725,6 +731,7 @@ def completion(
             model_info=model_info,
             proxy_server_request=proxy_server_request,
             preset_cache_key=preset_cache_key,
+            no_log=no_log,
         )
         logging.update_environment_variables(
             model=model,

@@ -1018,13 +1025,40 @@ def completion(
             or litellm.api_key
             or os.environ.get("ANTHROPIC_API_KEY")
         )
+        custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
+
+        if (model == "claude-2") or (model == "claude-instant-1"):
+            # call anthropic /completion, only use this route for claude-2, claude-instant-1
             api_base = (
                 api_base
                 or litellm.api_base
                 or get_secret("ANTHROPIC_API_BASE")
                 or "https://api.anthropic.com/v1/complete"
             )
-        custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
+            response = anthropic_text.completion(
+                model=model,
+                messages=messages,
+                api_base=api_base,
+                custom_prompt_dict=litellm.custom_prompt_dict,
+                model_response=model_response,
+                print_verbose=print_verbose,
+                optional_params=optional_params,
+                litellm_params=litellm_params,
+                logger_fn=logger_fn,
+                encoding=encoding,  # for calculating input/output tokens
+                api_key=api_key,
+                logging_obj=logging,
+                headers=headers,
+            )
+        else:
+            # call /messages
+            # default route for all anthropic models
+            api_base = (
+                api_base
+                or litellm.api_base
+                or get_secret("ANTHROPIC_API_BASE")
+                or "https://api.anthropic.com/v1/messages"
+            )
             response = anthropic.completion(
                 model=model,
                 messages=messages,

@@ -2389,6 +2423,7 @@ def embedding(
         "caching_groups",
         "ttl",
         "cache",
+        "no-log",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {

@@ -2589,6 +2624,7 @@ def embedding(
             vertex_project=vertex_ai_project,
             vertex_location=vertex_ai_location,
             aembedding=aembedding,
+            print_verbose=print_verbose,
         )
     elif custom_llm_provider == "oobabooga":
         response = oobabooga.embedding(

@@ -3014,7 +3050,6 @@ def moderation(
     return response


-##### Moderation #######################
 @client
 async def amoderation(input: str, model: str, api_key: Optional[str] = None, **kwargs):
     # only supports open ai for now

@@ -3037,11 +3072,11 @@ async def aimage_generation(*args, **kwargs):
     Asynchronously calls the `image_generation` function with the given arguments and keyword arguments.

     Parameters:
-    - `args` (tuple): Positional arguments to be passed to the `embedding` function.
-    - `kwargs` (dict): Keyword arguments to be passed to the `embedding` function.
+    - `args` (tuple): Positional arguments to be passed to the `image_generation` function.
+    - `kwargs` (dict): Keyword arguments to be passed to the `image_generation` function.

     Returns:
-    - `response` (Any): The response returned by the `embedding` function.
+    - `response` (Any): The response returned by the `image_generation` function.
     """
     loop = asyncio.get_event_loop()
     model = args[0] if len(args) > 0 else kwargs["model"]

@@ -3063,7 +3098,7 @@ async def aimage_generation(*args, **kwargs):
     # Await normally
     init_response = await loop.run_in_executor(None, func_with_context)
     if isinstance(init_response, dict) or isinstance(
-        init_response, ModelResponse
+        init_response, ImageResponse
     ):  ## CACHING SCENARIO
         response = init_response
     elif asyncio.iscoroutine(init_response):

@@ -3281,6 +3316,144 @@ def image_generation(
     )


+##### Transcription #######################
+
+
+@client
+async def atranscription(*args, **kwargs):
+    """
+    Calls openai + azure whisper endpoints.
+
+    Allows router to load balance between them
+    """
+    loop = asyncio.get_event_loop()
+    model = args[0] if len(args) > 0 else kwargs["model"]
+    ### PASS ARGS TO Image Generation ###
+    kwargs["atranscription"] = True
+    custom_llm_provider = None
+    try:
+        # Use a partial function to pass your keyword arguments
+        func = partial(transcription, *args, **kwargs)
+
+        # Add the context to the function
+        ctx = contextvars.copy_context()
+        func_with_context = partial(ctx.run, func)
+
+        _, custom_llm_provider, _, _ = get_llm_provider(
+            model=model, api_base=kwargs.get("api_base", None)
+        )
+
+        # Await normally
+        init_response = await loop.run_in_executor(None, func_with_context)
+        if isinstance(init_response, dict) or isinstance(
+            init_response, TranscriptionResponse
+        ):  ## CACHING SCENARIO
+            response = init_response
+        elif asyncio.iscoroutine(init_response):
+            response = await init_response
+        else:
+            # Call the synchronous function using run_in_executor
+            response = await loop.run_in_executor(None, func_with_context)
+        return response
+    except Exception as e:
+        custom_llm_provider = custom_llm_provider or "openai"
+        raise exception_type(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            original_exception=e,
+            completion_kwargs=args,
+        )
+
+
+@client
+def transcription(
+    model: str,
+    file: BinaryIO,
+    ## OPTIONAL OPENAI PARAMS ##
+    language: Optional[str] = None,
+    prompt: Optional[str] = None,
+    response_format: Optional[
+        Literal["json", "text", "srt", "verbose_json", "vtt"]
+    ] = None,
+    temperature: Optional[int] = None,  # openai defaults this to 0
+    ## LITELLM PARAMS ##
+    user: Optional[str] = None,
+    timeout=600,  # default to 10 minutes
+    api_key: Optional[str] = None,
+    api_base: Optional[str] = None,
+    api_version: Optional[str] = None,
+    litellm_logging_obj=None,
+    custom_llm_provider=None,
+    **kwargs,
+):
+    """
+    Calls openai + azure whisper endpoints.
+
+    Allows router to load balance between them
+    """
+    atranscription = kwargs.get("atranscription", False)
+    litellm_call_id = kwargs.get("litellm_call_id", None)
+    logger_fn = kwargs.get("logger_fn", None)
+    proxy_server_request = kwargs.get("proxy_server_request", None)
+    model_info = kwargs.get("model_info", None)
+    metadata = kwargs.get("metadata", {})
+
+    model_response = litellm.utils.TranscriptionResponse()
+
+    model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base)  # type: ignore
+
+    optional_params = {
+        "language": language,
+        "prompt": prompt,
+        "response_format": response_format,
+        "temperature": None,  # openai defaults this to 0
+    }
+
+    if custom_llm_provider == "azure":
+        # azure configs
+        api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE")
+
+        api_version = (
+            api_version or litellm.api_version or get_secret("AZURE_API_VERSION")
+        )
+
+        azure_ad_token = kwargs.pop("azure_ad_token", None) or get_secret(
+            "AZURE_AD_TOKEN"
+        )
+
+        api_key = (
+            api_key
+            or litellm.api_key
+            or litellm.azure_key
+            or get_secret("AZURE_API_KEY")
+        )
+
+        response = azure_chat_completions.audio_transcriptions(
+            model=model,
+            audio_file=file,
+            optional_params=optional_params,
+            model_response=model_response,
+            atranscription=atranscription,
+            timeout=timeout,
+            logging_obj=litellm_logging_obj,
+            api_base=api_base,
+            api_key=api_key,
+            api_version=api_version,
+            azure_ad_token=azure_ad_token,
+        )
+    elif custom_llm_provider == "openai":
+        response = openai_chat_completions.audio_transcriptions(
+            model=model,
+            audio_file=file,
+            optional_params=optional_params,
+            model_response=model_response,
+            atranscription=atranscription,
+            timeout=timeout,
+            logging_obj=litellm_logging_obj,
+        )
+    return response
||||||
##### Health Endpoints #######################
|
##### Health Endpoints #######################
|
||||||
|
|
||||||
|
|
||||||
|
|
|
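A minimal usage sketch of the endpoints added above, mirroring the transcription()/atranscription() signatures. It assumes OPENAI_API_KEY is set and that audio.mp3 exists locally; the file path and model name are illustrative.

    import asyncio
    import litellm

    # synchronous call, same keyword arguments as the new transcription() function
    with open("audio.mp3", "rb") as audio_file:
        resp = litellm.transcription(model="whisper-1", file=audio_file, response_format="json")
        print(resp.text)

    # async variant, routed through atranscription()
    async def main():
        with open("audio.mp3", "rb") as audio_file:
            resp = await litellm.atranscription(model="whisper-1", file=audio_file)
            print(resp.text)

    asyncio.run(main())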
@@ -108,7 +108,7 @@
     },
     "gpt-3.5-turbo": {
         "max_tokens": 4097,
-        "max_input_tokens": 4097,
+        "max_input_tokens": 16385,
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.0000015,
         "output_cost_per_token": 0.000002,

@@ -293,6 +293,18 @@
         "output_cost_per_pixel": 0.0,
         "litellm_provider": "openai"
     },
+    "whisper-1": {
+        "mode": "audio_transcription",
+        "input_cost_per_second": 0,
+        "output_cost_per_second": 0.0001,
+        "litellm_provider": "openai"
+    },
+    "azure/whisper-1": {
+        "mode": "audio_transcription",
+        "input_cost_per_second": 0,
+        "output_cost_per_second": 0.0001,
+        "litellm_provider": "azure"
+    },
     "azure/gpt-4-0125-preview": {
         "max_tokens": 128000,
         "max_input_tokens": 128000,

@@ -424,6 +436,23 @@
         "mode": "chat",
         "supports_function_calling": true
     },
+    "azure/gpt-3.5-turbo-instruct-0914": {
+        "max_tokens": 4097,
+        "input_cost_per_token": 0.0000015,
+        "output_cost_per_token": 0.000002,
+        "litellm_provider": "text-completion-openai",
+        "mode": "completion"
+    },
+    "azure/gpt-35-turbo-instruct": {
+        "max_tokens": 4097,
+        "input_cost_per_token": 0.0000015,
+        "output_cost_per_token": 0.000002,
+        "litellm_provider": "text-completion-openai",
+        "mode": "completion"
+    },
     "azure/mistral-large-latest": {
         "max_tokens": 32000,
         "input_cost_per_token": 0.000008,

@@ -537,6 +566,14 @@
         "litellm_provider": "text-completion-openai",
         "mode": "completion"
     },
+    "gpt-3.5-turbo-instruct-0914": {
+        "max_tokens": 4097,
+        "input_cost_per_token": 0.0000015,
+        "output_cost_per_token": 0.000002,
+        "litellm_provider": "text-completion-openai",
+        "mode": "completion"
+    },
     "claude-instant-1": {
         "max_tokens": 100000,
         "max_output_tokens": 8191,

@@ -618,6 +655,22 @@
         "litellm_provider": "anthropic",
         "mode": "chat"
     },
+    "claude-3-opus-20240229": {
+        "max_tokens": 200000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.000015,
+        "output_cost_per_token": 0.000075,
+        "litellm_provider": "anthropic",
+        "mode": "chat"
+    },
+    "claude-3-sonnet-20240229": {
+        "max_tokens": 200000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.000003,
+        "output_cost_per_token": 0.000015,
+        "litellm_provider": "anthropic",
+        "mode": "chat"
+    },
     "text-bison": {
         "max_tokens": 8192,
         "input_cost_per_token": 0.000000125,

@@ -1211,6 +1264,29 @@
         "litellm_provider": "bedrock",
         "mode": "embedding"
     },
+    "bedrock/us-west-2/mistral.mixtral-8x7b-instruct": {
+        "max_tokens": 32000,
+        "input_cost_per_token": 0.00000045,
+        "output_cost_per_token": 0.0000007,
+        "litellm_provider": "bedrock",
+        "mode": "completion"
+    },
+    "bedrock/us-west-2/mistral.mistral-7b-instruct": {
+        "max_tokens": 32000,
+        "input_cost_per_token": 0.00000015,
+        "output_cost_per_token": 0.0000002,
+        "litellm_provider": "bedrock",
+        "mode": "completion"
+    },
+    "anthropic.claude-3-sonnet-20240229-v1:0": {
+        "max_tokens": 200000,
+        "max_input_tokens": 200000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.000003,
+        "output_cost_per_token": 0.000015,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "anthropic.claude-v1": {
         "max_tokens": 100000,
         "max_output_tokens": 8191,
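These entries feed litellm's packaged cost map. A small sketch of reading the new per-second whisper pricing and the claude-3 per-token pricing back out of litellm.model_cost (the 60-second duration is illustrative):

    import litellm

    whisper = litellm.model_cost["whisper-1"]
    duration_seconds = 60
    estimated_cost = duration_seconds * whisper["output_cost_per_second"]
    print(f"~${estimated_cost:.4f} for a {duration_seconds}s transcription")

    claude_opus = litellm.model_cost["claude-3-opus-20240229"]
    print(claude_opus["input_cost_per_token"], claude_opus["output_cost_per_token"])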
File diff suppressed because one or more lines are too long

(Regenerated LiteLLM Admin UI build output: the webpack runtime, prerendered HTML entrypoint, and RSC payload now reference webpack-59d9232c3e7a8be6.js, static/chunks/app/page-2ed0bc91ffef505b.js, static/css/32e93a3d13512de5.css, and buildId ZF-EluyKCEJoZptE3dOXT in place of webpack-6b93c4e1d000ff14.js, page-ad3e13d2fec661b5.js, static/css/16eb955147cb6b2f.css, and buildId SP1Cm97dc_3zo4HlsJJjg.)
@@ -33,6 +33,9 @@ class LiteLLMBase(BaseModel):
         # if using pydantic v1
         return self.__fields_set__

+    class Config:
+        protected_namespaces = ()
+

 ######### Request Class Definition ######
 class ProxyChatCompletionRequest(LiteLLMBase):

@@ -151,6 +154,7 @@ class GenerateRequestBase(LiteLLMBase):
     rpm_limit: Optional[int] = None
     budget_duration: Optional[str] = None
     allowed_cache_controls: Optional[list] = []
+    soft_budget: Optional[float] = None


 class GenerateKeyRequest(GenerateRequestBase):

@@ -208,6 +212,12 @@ class KeyRequest(LiteLLMBase):
     keys: List[str]


+class LiteLLM_ModelTable(LiteLLMBase):
+    model_aliases: Optional[str] = None  # json dump the dict
+    created_by: str
+    updated_by: str
+
+
 class NewUserRequest(GenerateKeyRequest):
     max_budget: Optional[float] = None
     user_email: Optional[str] = None

@@ -247,9 +257,10 @@ class Member(LiteLLMBase):
         return values


-class NewTeamRequest(LiteLLMBase):
+class TeamBase(LiteLLMBase):
     team_alias: Optional[str] = None
     team_id: Optional[str] = None
+    organization_id: Optional[str] = None
     admins: list = []
     members: list = []
     members_with_roles: List[Member] = []

@@ -260,6 +271,14 @@ class NewTeamRequest(LiteLLMBase):
     models: list = []


+class NewTeamRequest(TeamBase):
+    model_aliases: Optional[dict] = None
+
+
+class GlobalEndUsersSpend(LiteLLMBase):
+    api_key: Optional[str] = None
+
+
 class TeamMemberAddRequest(LiteLLMBase):
     team_id: str
     member: Member

@@ -290,11 +309,12 @@ class DeleteTeamRequest(LiteLLMBase):
     team_ids: List[str]  # required


-class LiteLLM_TeamTable(NewTeamRequest):
+class LiteLLM_TeamTable(TeamBase):
     spend: Optional[float] = None
     max_parallel_requests: Optional[int] = None
     budget_duration: Optional[str] = None
     budget_reset_at: Optional[datetime] = None
+    model_id: Optional[int] = None

     @root_validator(pre=True)
     def set_model_info(cls, values):

@@ -304,6 +324,7 @@ class LiteLLM_TeamTable(NewTeamRequest):
             "config",
             "permissions",
             "model_max_budget",
+            "model_aliases",
         ]
         for field in dict_fields:
             value = values.get(field)

@@ -320,6 +341,49 @@ class TeamRequest(LiteLLMBase):
     teams: List[str]


+class LiteLLM_BudgetTable(LiteLLMBase):
+    """Represents user-controllable params for a LiteLLM_BudgetTable record"""
+
+    soft_budget: Optional[float] = None
+    max_budget: Optional[float] = None
+    max_parallel_requests: Optional[int] = None
+    tpm_limit: Optional[int] = None
+    rpm_limit: Optional[int] = None
+    model_max_budget: Optional[dict] = None
+    budget_duration: Optional[str] = None
+
+
+class NewOrganizationRequest(LiteLLM_BudgetTable):
+    organization_alias: str
+    models: List = []
+    budget_id: Optional[str] = None
+
+
+class LiteLLM_OrganizationTable(LiteLLMBase):
+    """Represents user-controllable params for a LiteLLM_OrganizationTable record"""
+
+    organization_alias: Optional[str] = None
+    budget_id: str
+    metadata: Optional[dict] = None
+    models: List[str]
+    created_by: str
+    updated_by: str
+
+
+class NewOrganizationResponse(LiteLLM_OrganizationTable):
+    organization_id: str
+    created_at: datetime
+    updated_at: datetime
+
+
+class OrganizationRequest(LiteLLMBase):
+    organizations: List[str]
+
+
+class BudgetRequest(LiteLLMBase):
+    budgets: List[str]
+
+
 class KeyManagementSystem(enum.Enum):
     GOOGLE_KMS = "google_kms"
     AZURE_KEY_VAULT = "azure_key_vault"

@@ -489,6 +553,8 @@ class LiteLLM_VerificationTokenView(LiteLLM_VerificationToken):
     team_tpm_limit: Optional[int] = None
     team_rpm_limit: Optional[int] = None
     team_max_budget: Optional[float] = None
+    soft_budget: Optional[float] = None
+    team_model_aliases: Optional[Dict] = None


 class UserAPIKeyAuth(

@@ -538,6 +604,7 @@ class LiteLLM_SpendLogs(LiteLLMBase):
     request_id: str
     api_key: str
     model: Optional[str] = ""
+    api_base: Optional[str] = ""
     call_type: str
     spend: Optional[float] = 0.0
     total_tokens: Optional[int] = 0
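A brief sketch of how the new organization/budget request models compose: budget limits are inherited by NewOrganizationRequest via LiteLLM_BudgetTable, and teams can now carry an organization_id plus model aliases. The field values and the assumption that litellm.proxy._types is importable as shown are illustrative.

    from litellm.proxy._types import NewOrganizationRequest, NewTeamRequest

    # budget params (max_budget, soft_budget, tpm_limit, ...) come from LiteLLM_BudgetTable
    org_req = NewOrganizationRequest(
        organization_alias="engineering",
        models=["gpt-3.5-turbo"],
        max_budget=100.0,
        soft_budget=80.0,
        tpm_limit=100000,
    )

    # teams now accept an organization_id and a model_aliases mapping
    team_req = NewTeamRequest(
        team_alias="search-team",
        organization_id="org-123",
        model_aliases={"prod-gpt": "gpt-3.5-turbo"},
    )
    print(org_req)
    print(team_req)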
@@ -71,7 +71,9 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
     ):
         self.print_verbose(f"Inside Max Parallel Request Pre-Call Hook")
         api_key = user_api_key_dict.api_key
-        max_parallel_requests = user_api_key_dict.max_parallel_requests or sys.maxsize
+        max_parallel_requests = user_api_key_dict.max_parallel_requests
+        if max_parallel_requests is None:
+            max_parallel_requests = sys.maxsize
         tpm_limit = getattr(user_api_key_dict, "tpm_limit", sys.maxsize)
         if tpm_limit is None:
             tpm_limit = sys.maxsize

@@ -105,6 +107,10 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
             and rpm_limit == sys.maxsize
         ):
             pass
+        elif max_parallel_requests == 0 or tpm_limit == 0 or rpm_limit == 0:
+            raise HTTPException(
+                status_code=429, detail="Max parallel request limit reached."
+            )
         elif current is None:
             new_val = {
                 "current_requests": 1,
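A standalone sketch of the limit-resolution behavior these hunks introduce: None still means "unlimited" (sys.maxsize fallback), while an explicit 0 on any limit now rejects the request with a 429. The function name and shape are illustrative, not the handler's real interface.

    import sys
    from fastapi import HTTPException

    def resolve_limits(max_parallel_requests, tpm_limit, rpm_limit):
        # None -> effectively unlimited, matching the handler's sys.maxsize fallback
        max_parallel_requests = sys.maxsize if max_parallel_requests is None else max_parallel_requests
        tpm_limit = sys.maxsize if tpm_limit is None else tpm_limit
        rpm_limit = sys.maxsize if rpm_limit is None else rpm_limit

        # an explicit 0 on any limit now blocks the call outright
        if max_parallel_requests == 0 or tpm_limit == 0 or rpm_limit == 0:
            raise HTTPException(status_code=429, detail="Max parallel request limit reached.")
        return max_parallel_requests, tpm_limit, rpm_limit

    print(resolve_limits(None, None, None))  # (maxsize, maxsize, maxsize) -> allowed
    # resolve_limits(0, None, None) would raise HTTPException(status_code=429)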
@@ -16,9 +16,18 @@ from importlib import resources
 import shutil

 telemetry = None
+default_num_workers = 1
+try:
+    default_num_workers = os.cpu_count() or 1
+    if default_num_workers is not None and default_num_workers > 0:
+        default_num_workers -= 1
+except:
+    pass
+

 def append_query_params(url, params):
+    print(f"url: {url}")
+    print(f"params: {params}")
     parsed_url = urlparse.urlparse(url)
     parsed_query = urlparse.parse_qs(parsed_url.query)
     parsed_query.update(params)

@@ -52,10 +61,10 @@ def is_port_in_use(port):
 @click.option(
     "--host", default="0.0.0.0", help="Host for the server to listen on.", envvar="HOST"
 )
-@click.option("--port", default=8000, help="Port to bind the server to.", envvar="PORT")
+@click.option("--port", default=4000, help="Port to bind the server to.", envvar="PORT")
 @click.option(
     "--num_workers",
-    default=1,
+    default=default_num_workers,
     help="Number of gunicorn workers to spin up",
     envvar="NUM_WORKERS",
 )

@@ -264,7 +273,7 @@ def run_server(
             ],
         }

-        response = requests.post("http://0.0.0.0:8000/queue/request", json=data)
+        response = requests.post("http://0.0.0.0:4000/queue/request", json=data)

         response = response.json()

@@ -498,7 +507,7 @@ def run_server(
                 print(
                     f"Unable to connect to DB. DATABASE_URL found in environment, but prisma package not found."
                 )
-        if port == 4000 and is_port_in_use(port):
+        if port == 4000 and is_port_in_use(port):
             port = random.randint(1024, 49152)

     from litellm.proxy.proxy_server import app
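With these defaults, a locally started proxy now listens on port 4000 and sizes its worker pool from the host's CPU count. A hedged client-side sketch, assuming the proxy is already running locally and the openai v1 SDK is installed (the api_key value is a placeholder for whatever the proxy expects):

    from openai import OpenAI

    # the proxy's new default bind address
    client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")

    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "ping"}],
    )
    print(resp.choices[0].message.content)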
@@ -5,63 +5,9 @@ model_list:
       api_base: os.environ/AZURE_API_BASE
       api_key: os.environ/AZURE_API_KEY
       api_version: "2023-07-01-preview"
-    model_info:
-      mode: chat
-      max_tokens: 4096
-      base_model: azure/gpt-4-1106-preview
-      access_groups: ["public"]
-  - model_name: openai-gpt-3.5
-    litellm_params:
-      model: gpt-3.5-turbo
-      api_key: os.environ/OPENAI_API_KEY
-    model_info:
-      access_groups: ["public"]
-  - model_name: anthropic-claude-v2.1
-    litellm_params:
-      model: bedrock/anthropic.claude-v2:1
-      timeout: 300 # sets a 5 minute timeout
-    model_info:
-      access_groups: ["private"]
-  - model_name: anthropic-claude-v2
-    litellm_params:
-      model: bedrock/anthropic.claude-v2
-  - model_name: bedrock-cohere
-    litellm_params:
-      model: bedrock/cohere.command-text-v14
-      timeout: 0.0001
-  - model_name: gpt-4
-    litellm_params:
-      model: azure/chatgpt-v-2
-      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
-      api_version: "2023-05-15"
-      api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
-    model_info:
-      base_model: azure/gpt-4
-  - model_name: text-moderation-stable
-    litellm_params:
-      model: text-moderation-stable
-      api_key: os.environ/OPENAI_API_KEY
 litellm_settings:
-  fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
-  success_callback: ['langfuse']
-  # setting callback class
-  callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
-
-general_settings:
-  master_key: sk-1234
-  alerting: ["slack"]
-  alerting_threshold: 10 # sends alerts if requests hang for 2 seconds
-  # database_type: "dynamo_db"
-  # database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
-  #   "billing_mode": "PAY_PER_REQUEST",
-  #   "region_name": "us-west-2",
-  #   "ssl_verify": False
-  # }
-
-environment_variables:
-  # otel: True # OpenTelemetry Logger
-  # master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
+  set_verbose: True
+  success_callback: ["langfuse"]
+router_settings:
+  set_verbose: True
+  debug_level: "DEBUG"
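The trimmed config above maps onto the equivalent SDK-level settings; a small sketch of the same behavior outside the proxy, assuming Langfuse and provider credentials are set in the environment:

    import litellm

    litellm.set_verbose = True
    litellm.success_callback = ["langfuse"]  # same callback the proxy config enables

    response = litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hello"}],
    )
    print(response.choices[0].message.content)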
litellm/proxy/proxy_load_test/litellm_proxy_config.yaml (new file, 6 lines)
@@ -0,0 +1,6 @@
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: openai/my-fake-model
+      api_key: my-fake-key
+      api_base: http://0.0.0.0:8090

litellm/proxy/proxy_load_test/locustfile.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+from locust import HttpUser, task, between
+
+
+class MyUser(HttpUser):
+    wait_time = between(1, 5)
+
+    @task
+    def chat_completion(self):
+        headers = {
+            "Content-Type": "application/json",
+            # Include any additional headers you may need for authentication, etc.
+        }
+
+        # Customize the payload with "model" and "messages" keys
+        payload = {
+            "model": "gpt-3.5-turbo",
+            "messages": [
+                {"role": "system", "content": "You are a chat bot."},
+                {"role": "user", "content": "Hello, how are you?"},
+            ],
+            # Add more data as necessary
+        }
+
+        # Make a POST request to the "chat/completions" endpoint
+        response = self.client.post("chat/completions", json=payload, headers=headers)
+
+        # Print or log the response if needed

litellm/proxy/proxy_load_test/openai_endpoint.py (new file, 50 lines)
@@ -0,0 +1,50 @@
+# import sys, os
+# sys.path.insert(
+#     0, os.path.abspath("../")
+# ) # Adds the parent directory to the system path
+from fastapi import FastAPI, Request, status, HTTPException, Depends
+from fastapi.responses import StreamingResponse
+from fastapi.security import OAuth2PasswordBearer
+from fastapi.middleware.cors import CORSMiddleware
+
+app = FastAPI()
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+# for completion
+@app.post("/chat/completions")
+@app.post("/v1/chat/completions")
+async def completion(request: Request):
+    return {
+        "id": "chatcmpl-123",
+        "object": "chat.completion",
+        "created": 1677652288,
+        "model": "gpt-3.5-turbo-0125",
+        "system_fingerprint": "fp_44709d6fcb",
+        "choices": [
+            {
+                "index": 0,
+                "message": {
+                    "role": "assistant",
+                    "content": "\n\nHello there, how may I assist you today?",
+                },
+                "logprobs": None,
+                "finish_reason": "stop",
+            }
+        ],
+        "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21},
+    }
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    # run this on 8090, 8091, 8092 and 8093
+    uvicorn.run(app, host="0.0.0.0", port=8090)
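One plausible way to wire these three new files together for a local load test (the command order is an assumption based on the comments in the files): start the fake OpenAI endpoint with `python openai_endpoint.py`, point the proxy at it with `litellm --config litellm_proxy_config.yaml`, then drive traffic with `locust -f locustfile.py` against the proxy URL. The mock endpoint can also be smoke-tested directly:

    import requests

    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hello, how are you?"}],
    }
    # the mock server from openai_endpoint.py always returns the same canned completion
    r = requests.post("http://0.0.0.0:8090/chat/completions", json=payload)
    print(r.json()["choices"][0]["message"]["content"])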
File diff suppressed because it is too large

@@ -7,10 +7,57 @@ generator client {
   provider = "prisma-client-py"
 }

+// Budget / Rate Limits for an org
+model LiteLLM_BudgetTable {
+  budget_id String @id @default(uuid())
+  max_budget Float?
+  soft_budget Float?
+  max_parallel_requests Int?
+  tpm_limit BigInt?
+  rpm_limit BigInt?
+  model_max_budget Json?
+  budget_duration String?
+  budget_reset_at DateTime?
+  created_at DateTime @default(now()) @map("created_at")
+  created_by String
+  updated_at DateTime @default(now()) @updatedAt @map("updated_at")
+  updated_by String
+  organization LiteLLM_OrganizationTable[] // multiple orgs can have the same budget
+  keys LiteLLM_VerificationToken[] // multiple keys can have the same budget
+}
+
+model LiteLLM_OrganizationTable {
+  organization_id String @id @default(uuid())
+  organization_alias String
+  budget_id String
+  metadata Json @default("{}")
+  models String[]
+  spend Float @default(0.0)
+  model_spend Json @default("{}")
+  created_at DateTime @default(now()) @map("created_at")
+  created_by String
+  updated_at DateTime @default(now()) @updatedAt @map("updated_at")
+  updated_by String
+  litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
+  teams LiteLLM_TeamTable[]
+}
+
+// Model info for teams, just has model aliases for now.
+model LiteLLM_ModelTable {
+  id Int @id @default(autoincrement())
+  model_aliases Json? @map("aliases")
+  created_at DateTime @default(now()) @map("created_at")
+  created_by String
+  updated_at DateTime @default(now()) @updatedAt @map("updated_at")
+  updated_by String
+  team LiteLLM_TeamTable?
+}
+
 // Assign prod keys to groups, not individuals
 model LiteLLM_TeamTable {
-  team_id    String @unique
+  team_id    String @id @default(uuid())
   team_alias String?
+  organization_id String?
   admins String[]
   members String[]
   members_with_roles Json @default("{}")

@@ -27,11 +74,14 @@ model LiteLLM_TeamTable {
   updated_at DateTime @default(now()) @updatedAt @map("updated_at")
   model_spend Json @default("{}")
   model_max_budget Json @default("{}")
+  model_id Int? @unique
+  litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
+  litellm_model_table LiteLLM_ModelTable? @relation(fields: [model_id], references: [id])
 }

 // Track spend, rate limit, budget Users
 model LiteLLM_UserTable {
-  user_id    String @unique
+  user_id    String @id
   team_id    String?
   teams      String[] @default([])
   user_role  String?

@@ -51,9 +101,10 @@ model LiteLLM_UserTable {

 // Generate Tokens for Proxy
 model LiteLLM_VerificationToken {
-  token      String @unique
+  token      String @id
   key_name   String?
   key_alias  String?
+  soft_budget_cooldown Boolean @default(false) // key-level state on if budget alerts need to be cooled down
   spend      Float @default(0.0)
   expires    DateTime?
   models     String[]

@@ -72,6 +123,8 @@ model LiteLLM_VerificationToken {
   allowed_cache_controls String[] @default([])
   model_spend Json @default("{}")
   model_max_budget Json @default("{}")
+  budget_id String?
+  litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
 }

 // store proxy config.yaml

@@ -82,7 +135,7 @@ model LiteLLM_Config {

 // View spend, model, api_key per request
 model LiteLLM_SpendLogs {
-  request_id          String @unique
+  request_id          String @id
   call_type           String
   api_key             String @default ("")
   spend               Float @default(0.0)

@@ -92,6 +145,7 @@ model LiteLLM_SpendLogs {
   startTime           DateTime // Assuming start_time is a DateTime field
   endTime             DateTime // Assuming end_time is a DateTime field
   model               String @default("")
+  api_base            String @default("")
   user                String @default("")
   metadata            Json @default("{}")
   cache_hit           String @default("")

@@ -100,9 +154,10 @@ model LiteLLM_SpendLogs {
   team_id             String?
   end_user            String?
 }

 // Beta - allow team members to request access to a model
 model LiteLLM_UserNotifications {
-  request_id          String @unique
+  request_id          String @id
   user_id             String
   models              String[]
   justification       String
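A hedged sketch of how the new budget/organization relations might be exercised from prisma-client-py after `prisma generate` runs against this schema; the lowercase accessor names follow the generated-client convention and are assumptions, not taken from this diff.

    import asyncio
    from prisma import Prisma  # generated client

    async def main():
        db = Prisma()
        await db.connect()

        # create a budget, then attach an organization to it via budget_id
        budget = await db.litellm_budgettable.create(
            data={"max_budget": 100.0, "soft_budget": 80.0, "created_by": "admin", "updated_by": "admin"}
        )
        org = await db.litellm_organizationtable.create(
            data={
                "organization_alias": "engineering",
                "budget_id": budget.budget_id,
                "models": ["gpt-3.5-turbo"],
                "created_by": "admin",
                "updated_by": "admin",
            }
        )
        print(org.organization_id)
        await db.disconnect()

    asyncio.run(main())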
litellm/proxy/tests/large_text.py (new file, 82 lines)
@@ -0,0 +1,82 @@
+text = """
{{Short description|Military commander and king of Macedon (356–323 BC)}}
|
||||||
|
{{About|the ancient king of Macedonia}}
|
||||||
|
{{Good article}}
|
||||||
|
{{pp-semi-indef}}
|
||||||
|
{{pp-move-indef}}
|
||||||
|
{{Use Oxford spelling|date=September 2020}}
|
||||||
|
{{Use dmy dates|date=January 2023}}
|
||||||
|
{{Infobox royalty
|
||||||
|
| name = Alexander the Great
|
||||||
|
| title = [[Basileus]]
|
||||||
|
| image = Alexander the Great mosaic (cropped).jpg
|
||||||
|
| caption = Alexander in the ''[[Alexander Mosaic]]''
|
||||||
|
| succession = [[King of Macedon]]
|
||||||
|
| reign = 336–323 BC
|
||||||
|
| predecessor = [[Philip II of Macedon|Philip II]]
|
||||||
|
| successor = {{hlist|
|
||||||
|
| [[Alexander IV of Macedon|Alexander IV]]
|
||||||
|
| [[Philip III of Macedon|Philip III]]
|
||||||
|
}}
|
||||||
|
| succession2 = [[Hegemony#8th–1st centuries BC|Hegemon]] of the [[League of Corinth|Hellenic League]]
|
||||||
|
| reign2 = 336–323 BC
|
||||||
|
| predecessor2 = Philip II
|
||||||
|
| successor2 = [[Demetrius I of Macedon]]
|
||||||
|
| succession3 = [[List of pharaohs|Pharaoh of Egypt]]
|
||||||
|
| reign3 = 332–323 BC
|
||||||
|
| predecessor3 = [[Darius III]]
|
||||||
|
| successor3 = {{hlist|
|
||||||
|
| Alexander IV
|
||||||
|
| Philip III
|
||||||
|
{{Ancient Egyptian royal titulary case |nomen={{ubl|{{transliteration|egy|ꜣrwksjndrs}}|{{transliteration|egy|Aluksindres}}|Alexandros}} |nomen_hiero=<hiero>A-rw:k:z-i-n:d:r:z</hiero> |horus={{ubl|{{transliteration|egy|mk-kmt}}|{{transliteration|egy|Mekemet}}|Protector of Egypt}} {{Infobox pharaoh/Serekh |Horus=<hiero>S-HqA-q:n:nw-D40</hiero>}}{{pb}}Second Horus name:{{ubl|{{transliteration|egy|ḥḳꜣ-ḳnj tkn-ḫꜣswt}}|{{transliteration|egy|Heqaqeni tekenkhasut}}|The brave ruler who has attacked foreign lands}} {{Infobox pharaoh/Serekh |Horus=<hiero>HqA-q:n:nw:D40-t:k:n:D54-N25:N25:N25</hiero>}}{{pb}}Third Horus name:{{ubl|{{transliteration|egy|ḥḳꜣ ḥḳꜣw nw tꜣ (r) ḏr-f}}|{{transliteration|egy|Heqa heqau nu ta (er) djeref}}|The ruler of the rulers of the entire land}} {{Infobox pharaoh/Serekh |Horus=<hiero>HqA-q-HqA-HqA-q-N33-nw-N33-N17:N34-r:f</hiero>}}Fourth Horus name:{{ubl|{{transliteration|egy|ṯmꜣ-ꜥ}}|{{transliteration|egy|Tjema'a}}|The sturdy-armed one}} {{Infobox pharaoh/Serekh |Horus=<hiero>T:mA-a</hiero>}} |nebty={{ubl|{{transliteration|egy|mꜣj wr-pḥty jṯ ḏww tꜣw ḫꜣswt}}|{{transliteration|egy|Mai werpehty itj dju tau khasut}}|The lion, great of might, who takes possession of mountains, lands, and deserts}} |nebty_hiero=<hiero>E23-wr:r-F9:F9-V15-N25:N25:N33-N17:N17:N33-N25:N25:N33</hiero> |golden={{ubl|{{transliteration|egy|kꜣ (nḫt) ḫwj bꜣḳ(t) ḥḳꜣ wꜣḏ(-wr) šnw n jtn}}|{{transliteration|egy|Ka (nakht) khui baq(et) heqa wadj(wer) shenu en Aten}}|The (strong) bull who protects Egypt, the ruler of the sea and of what the sun encircles}} |golden_hiero=<hiero>E1:n-i-w*x-D40-q:t-b-</hiero>{{pb}}<hiero>D10-HqA-M14-N35A-V9:Z1-i-t:n:HASH</hiero> |prenomen={{ubl|{{transliteration|egy|stp.n-rꜥ mrj-jmn}}|{{transliteration|egy|Setepenre meryamun}}|Chosen by Ra, beloved by Amun{{pb}}{{Infobox pharaoh/Prenomen |Prenomen=<hiero>C2\-C12-stp:n:N36</hiero>}}{{pb}}{{Infobox pharaoh/Prenomen |Prenomen=<hiero>mr\-C12\-C2-stp:n</hiero>}}}}}}
}}
| succession4 = [[King of Persia]]
| reign4 = 330–323 BC
| predecessor4 = Darius III
| successor4 = {{hlist|
| Alexander IV
| Philip III
}}
| full name =
| spouse = {{hlist|
| [[Roxana]]
| [[Stateira (wife of Alexander the Great)|Stateira]]
| [[Parysatis II|Parysatis]]
}}
| issue = {{plainlist|
* [[Alexander IV of Macedon|Alexander IV]]
* [[Heracles of Macedon|Heracles]]{{Cref2|a}}
}}
| native_lang1 = [[Ancient Greek|Greek]]
| native_lang1_name1 = {{lang|grc|Ἀλέξανδρος}}{{Cref2|b}}
| house = [[Argead dynasty|Argead]]
| house-type = Dynasty
| father = [[Philip II of Macedon]]
| mother = [[Olympias|Olympias of Epirus]]
| birth_date = 20 or 21 July 356 BC
| birth_place = [[Pella]], [[Macedonia (ancient kingdom)|Macedon]]
| death_date = 10 or 11 June 323 BC (aged 32)<!-- 32 years, 10 months and 20 days (approx.) -->
| death_place = [[Babylon]], [[Mesopotamia]], Macedonian Empire
| religion = [[Ancient Greek religion]]
}}

'''Alexander III of Macedon''' ({{lang-grc|[[wikt:Ἀλέξανδρος|Ἀλέξανδρος]]|Alexandros}}; 20/21 July 356 BC – 10/11 June 323 BC), most commonly known as '''Alexander the Great''',{{Cref2|c}} was a king of the [[Ancient Greece|ancient Greek]] kingdom of [[Macedonia (ancient kingdom)|Macedon]].{{Cref2|d}} He succeeded his father [[Philip II of Macedon|Philip II]] to the throne in 336 BC at the age of 20 and spent most of his ruling years conducting a lengthy [[military campaign]] throughout [[Western Asia]], [[Central Asia]], parts of [[South Asia]], and [[ancient Egypt|Egypt]]. By the age of 30, he had created one of the [[List of largest empires|largest empires]] in history, stretching from [[History of Greece|Greece]] to northwestern [[Historical India|India]].<ref>Bloom, Jonathan M.; Blair, Sheila S. (2009) ''The Grove Encyclopedia of Islamic Art and Architecture: Mosul to Zirid, Volume 3''. (Oxford University Press Incorporated, 2009), 385; "[Khojand, Tajikistan]; As the easternmost outpost of the empire of Alexander the Great, the city was renamed Alexandria Eschate ("furthest Alexandria") in 329 BCE."{{pb}}Golden, Peter B. ''Central Asia in World History'' (Oxford University Press, 2011), 25;"[...] his campaigns in Central Asia brought Khwarazm, Sogdia and Bactria under Graeco-Macedonian rule. As elsewhere, Alexander founded or renamed a number of cities, such as Alexandria Eschate ("Outernmost Alexandria", near modern Khojent in Tajikistan)."</ref> He was undefeated in battle and is widely considered to be one of history's greatest and most successful military commanders.{{Sfn |Yenne|2010 | page = 159}}<ref>{{cite encyclopedia|title=Alexander the Great's Achievements|encyclopedia=Britannica|url=https://www.britannica.com/summary/Alexander-the-Greats-Achievements|access-date=19 August 2021|archive-date=2 July 2021|archive-url=https://web.archive.org/web/20210702234248/https://www.britannica.com/summary/Alexander-the-Greats-Achievements|url-status=live}} "Alexander the Great was one of the greatest military strategists and leaders in world history."</ref>
Until the age of 16, Alexander was tutored by [[Aristotle]]. In 335 BC, shortly after his assumption of kingship over Macedon, he [[Alexander's Balkan campaign|campaigned in the Balkans]] and reasserted control over [[Thrace]] and parts of [[Illyria]] before marching on the city of [[Thebes, Greece|Thebes]], which was [[Battle of Thebes|subsequently destroyed in battle]]. Alexander then led the [[League of Corinth]], and used his authority to launch the [[Greek nationalism#History|pan-Hellenic project]] envisaged by his father, assuming leadership over all [[Greeks]] in their conquest of [[Greater Iran|Persia]].{{sfn|Heckel|Tritle|2009|p=99}}<ref>{{cite book |last1=Burger |first1=Michael |title=The Shaping of Western Civilization: From Antiquity to the Enlightenment |date=2008 |publisher=University of Toronto Press |isbn=978-1-55111-432-3 |page=76}}</ref>
In 334 BC, he invaded the [[Achaemenid Empire|Achaemenid Persian Empire]] and began [[Wars of Alexander the Great#Persia|a series of campaigns]] that lasted for 10 years. Following his conquest of [[Asia Minor]], Alexander broke the power of Achaemenid Persia in a series of decisive battles, including those at [[Battle of Issus|Issus]] and [[Battle of Gaugamela|Gaugamela]]; he subsequently overthrew [[Darius III]] and conquered the Achaemenid Empire in its entirety.{{Cref2|e}} After the fall of Persia, the [[Macedonian Empire]] held a vast swath of territory between the [[Adriatic Sea]] and the [[Indus River]]. Alexander endeavored to reach the "ends of the world and the Great Outer Sea" and [[Indian campaign of Alexander the Great|invaded India]] in 326 BC, achieving an important victory over [[Porus]], an ancient Indian king of present-day [[Punjab]], at the [[Battle of the Hydaspes]]. Due to the demand of his homesick troops, he eventually turned back at the [[Beas River]] and later died in 323 BC in [[Babylon]], the city of [[Mesopotamia]] that he had planned to establish as his empire's capital. [[Death of Alexander the Great|Alexander's death]] left unexecuted an additional series of planned military and mercantile campaigns that would have begun with a Greek invasion of [[Arabian Peninsula|Arabia]]. In the years following his death, [[Wars of the Diadochi|a series of civil wars]] broke out across the Macedonian Empire, eventually leading to its disintegration at the hands of the [[Diadochi]].
With his death marking the start of the [[Hellenistic period]], Alexander's legacy includes the [[cultural diffusion]] and [[syncretism]] that his conquests engendered, such as [[Greco-Buddhism]] and [[Hellenistic Judaism]]. [[List of cities founded by Alexander the Great|He founded more than twenty cities]], with the most prominent being the city of [[Alexandria]] in Egypt. Alexander's settlement of [[Greek colonisation|Greek colonists]] and the resulting spread of [[Culture of Greece|Greek culture]] led to the overwhelming dominance of [[Hellenistic civilization]] and influence as far east as the [[Indian subcontinent]]. The Hellenistic period developed through the [[Roman Empire]] into modern [[Western culture]]; the [[Greek language]] became the ''[[lingua franca]]'' of the region and was the predominant language of the [[Byzantine Empire]] up until its collapse in the mid-15th century AD. Alexander became legendary as a classical hero in the mould of [[Achilles]], featuring prominently in the historical and mythical traditions of both Greek and non-Greek cultures. His military achievements and unprecedented enduring successes in battle made him the measure against which many later military leaders would compare themselves,{{cref2|f}} and his tactics remain a significant subject of study in [[Military academy|military academies]] worldwide.{{Sfn|Yenne|2010|page=viii}}

{{TOC limit|3}}

==Early life==

===Lineage and childhood===

[[File:Archaeological Site of Pella by Joy of Museums.jpg|thumb|upright=1.2|Archaeological site of [[Pella]], Greece, Alexander's birthplace]]
{{Alexander the Great series}}
Alexander III was born in [[Pella]], the capital of the [[Macedonia (ancient kingdom)|Kingdom of Macedon]],<ref>{{cite book |last=Green |first=Peter |title=Alexander of Macedon, 356–323 B.C.: a historical biography |url=https://books.google.com/books?id=g6Wl4AKGQkIC&pg=PA559 |page=xxxiii |year=1970 |series=Hellenistic culture and society |edition=illustrated, revised reprint |publisher=University of California Press |isbn=978-0-520-07165-0 |quote=356 – Alexander born in Pella. The exact date is not known, but probably either 20 or 26 July. |access-date=20 June 2015}}</ref> on the sixth day of the [[Ancient Greek calendars|ancient Greek month]] of [[Attic calendar|Hekatombaion]], which probably corresponds to 20 July 356 BC (although the exact date is uncertain).<ref>Plutarch, ''Life of Alexander'' 3.5: {{cite web |url=https://www.livius.org/aj-al/alexander/alexander_t32.html#7 |title=The birth of Alexander the Great |work=Livius|archive-url=https://web.archive.org/web/20150320180439/https://www.livius.org/aj-al/alexander/alexander_t32.html|archive-date=20 March 2015|url-status = dead |access-date=16 December 2011 |quote=Alexander was born the sixth of [[Attic calendar|Hekatombaion]].}}</ref><ref>{{cite book |author=David George Hogarth |date=1897 |title=Philip and Alexander of Macedon : two essays in biography |url=https://archive.org/details/cu31924028251217/page/n321/mode/2up?view=theater |location=New York |publisher=Charles Scribner's Sons |pages=286–287 |access-date=9 November 2021}}</ref> He was the son of the erstwhile king of Macedon, [[Philip II of Macedon|Philip II]], and his fourth wife, [[Olympias]] (daughter of [[Neoptolemus I of Epirus|Neoptolemus I]], king of [[Epirus (ancient state)|Epirus]]).<ref>{{harvnb|McCarty|2004|p=10}}, {{harvnb|Renault|2001|p=28}}, {{harvnb|Durant|1966|p=538}}</ref>{{Cref2|g}} Although Philip had seven or eight wives, Olympias was his principal wife for some time, likely because she gave birth to Alexander.{{sfn|Roisman|Worthington|2010|p=171}}
Several legends surround Alexander's birth and childhood.{{sfn|Roisman|Worthington|2010|p=188}} According to the [[Ancient Greeks|ancient Greek]] biographer [[Plutarch]], on the eve of the consummation of her marriage to Philip, Olympias dreamed that her womb was struck by a thunderbolt that caused a flame to spread "far and wide" before dying away. Sometime after the wedding, Philip is said to have seen himself, in a dream, securing his wife's womb with a [[Seal (emblem)|seal]] engraved with a lion's image.<ref name="PA2" /> Plutarch offered a variety of interpretations for these dreams: that Olympias was pregnant before her marriage, indicated by the sealing of her womb; or that Alexander's father was [[Zeus]]. Ancient commentators were divided about whether the ambitious Olympias promulgated the story of Alexander's divine parentage, variously claiming that she had told Alexander, or that she dismissed the suggestion as impious.<ref name="PA2" />
"""

@ -1,20 +1,24 @@
|
||||||
import time, asyncio
|
import time, asyncio, os
|
||||||
from openai import AsyncOpenAI
|
from openai import AsyncOpenAI, AsyncAzureOpenAI
|
||||||
import uuid
|
import uuid
|
||||||
import traceback
|
import traceback
|
||||||
|
from large_text import text
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
|
||||||
litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:8000", api_key="any")
|
|
||||||
|
|
||||||
|
|
||||||
async def litellm_completion():
|
async def litellm_completion():
|
||||||
# Your existing code for litellm_completion goes here
|
# Your existing code for litellm_completion goes here
|
||||||
try:
|
try:
|
||||||
response = await litellm_client.chat.completions.create(
|
response = await litellm_client.chat.completions.create(
|
||||||
model="azure-gpt-3.5",
|
model="fake_openai",
|
||||||
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": f"{text}. Who was alexander the great? {uuid.uuid4()}",
|
||||||
|
}
|
||||||
|
],
|
||||||
)
|
)
|
||||||
print(response)
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -25,9 +29,9 @@ async def litellm_completion():
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
for i in range(150):
|
for i in range(6):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
n = 150 # Number of concurrent tasks
|
n = 20 # Number of concurrent tasks
|
||||||
tasks = [litellm_completion() for _ in range(n)]
|
tasks = [litellm_completion() for _ in range(n)]
|
||||||
|
|
||||||
chat_completions = await asyncio.gather(*tasks)
|
chat_completions = await asyncio.gather(*tasks)
|
||||||
|
@ -41,7 +45,6 @@ async def main():
|
||||||
error_log.write(completion + "\n")
|
error_log.write(completion + "\n")
|
||||||
|
|
||||||
print(n, time.time() - start, len(successful_completions))
|
print(n, time.time() - start, len(successful_completions))
|
||||||
time.sleep(10)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
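For context, a minimal standalone version of the load pattern this test script exercises: fan out N concurrent chat completions against a locally running proxy with asyncio.gather and time each batch. The base_url, api_key, and model alias below are placeholders mirroring the diff above, not values guaranteed to exist in any particular deployment.

import asyncio
import time
import uuid

from openai import AsyncOpenAI

# Assumed local proxy endpoint and model alias; adjust to your own setup.
client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")


async def one_completion() -> str:
    # Each request gets a unique suffix so responses are not served from cache.
    response = await client.chat.completions.create(
        model="fake_openai",
        messages=[{"role": "user", "content": f"Who was Alexander the Great? {uuid.uuid4()}"}],
    )
    return response.choices[0].message.content or ""


async def run_batches(batches: int = 6, concurrency: int = 20) -> None:
    for _ in range(batches):
        start = time.time()
        results = await asyncio.gather(
            *(one_completion() for _ in range(concurrency)), return_exceptions=True
        )
        ok = [r for r in results if not isinstance(r, Exception)]
        print(f"{concurrency} requests, {time.time() - start:.2f}s, {len(ok)} succeeded")


if __name__ == "__main__":
    asyncio.run(run_batches())
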
@ -64,6 +64,7 @@ class ProxyLogging:
|
||||||
litellm.callbacks.append(self.max_parallel_request_limiter)
|
litellm.callbacks.append(self.max_parallel_request_limiter)
|
||||||
litellm.callbacks.append(self.max_budget_limiter)
|
litellm.callbacks.append(self.max_budget_limiter)
|
||||||
litellm.callbacks.append(self.cache_control_check)
|
litellm.callbacks.append(self.cache_control_check)
|
||||||
|
litellm.success_callback.append(self.response_taking_too_long_callback)
|
||||||
for callback in litellm.callbacks:
|
for callback in litellm.callbacks:
|
||||||
if callback not in litellm.input_callback:
|
if callback not in litellm.input_callback:
|
||||||
litellm.input_callback.append(callback)
|
litellm.input_callback.append(callback)
|
||||||
|
@ -95,7 +96,11 @@ class ProxyLogging:
|
||||||
user_api_key_dict: UserAPIKeyAuth,
|
user_api_key_dict: UserAPIKeyAuth,
|
||||||
data: dict,
|
data: dict,
|
||||||
call_type: Literal[
|
call_type: Literal[
|
||||||
"completion", "embeddings", "image_generation", "moderation"
|
"completion",
|
||||||
|
"embeddings",
|
||||||
|
"image_generation",
|
||||||
|
"moderation",
|
||||||
|
"audio_transcription",
|
||||||
],
|
],
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
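As an aside, a tiny sketch of what the widened call_type Literal buys: a type checker rejects call sites that pass a string outside the allowed set. The dispatch function here is illustrative only, not the proxy's actual pre_call_hook.

from typing import Literal

CallType = Literal[
    "completion",
    "embeddings",
    "image_generation",
    "moderation",
    "audio_transcription",
]


def route_hook(call_type: CallType) -> str:
    # A type checker flags any string outside the Literal set passed by callers.
    if call_type == "audio_transcription":
        return "audio pre-call checks"
    return "default pre-call checks"


print(route_hook("audio_transcription"))
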
@ -142,6 +147,30 @@ class ProxyLogging:
|
||||||
raise e
|
raise e
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
async def response_taking_too_long_callback(
|
||||||
|
self,
|
||||||
|
kwargs, # kwargs to completion
|
||||||
|
completion_response, # response from completion
|
||||||
|
start_time,
|
||||||
|
end_time, # start/end time
|
||||||
|
):
|
||||||
|
if self.alerting is None:
|
||||||
|
return
|
||||||
|
time_difference = end_time - start_time
|
||||||
|
# Convert the timedelta to float (in seconds)
|
||||||
|
time_difference_float = time_difference.total_seconds()
|
||||||
|
litellm_params = kwargs.get("litellm_params", {})
|
||||||
|
api_base = litellm_params.get("api_base", "")
|
||||||
|
model = kwargs.get("model", "")
|
||||||
|
messages = kwargs.get("messages", "")
|
||||||
|
request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
|
||||||
|
slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
|
||||||
|
if time_difference_float > self.alerting_threshold:
|
||||||
|
await self.alerting_handler(
|
||||||
|
message=slow_message + request_info,
|
||||||
|
level="Low",
|
||||||
|
)
|
||||||
|
|
||||||
async def response_taking_too_long(
|
async def response_taking_too_long(
|
||||||
self,
|
self,
|
||||||
start_time: Optional[float] = None,
|
start_time: Optional[float] = None,
|
||||||
|
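A self-contained sketch of the elapsed-time check the new response_taking_too_long_callback performs, with a hard-coded threshold and print() standing in for the proxy's alerting handler (both are assumptions for illustration):

from datetime import datetime

ALERTING_THRESHOLD_S = 300.0  # assumed threshold, mirrors self.alerting_threshold


def check_slow_response(start_time: datetime, end_time: datetime) -> None:
    # timedelta -> float seconds, the same conversion the callback performs
    elapsed = (end_time - start_time).total_seconds()
    if elapsed > ALERTING_THRESHOLD_S:
        print(f"Responses are slow - {round(elapsed, 2)}s response time "
              f"> Alerting threshold: {ALERTING_THRESHOLD_S}s")


check_slow_response(datetime(2024, 3, 1, 12, 0, 0), datetime(2024, 3, 1, 12, 6, 0))
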
@ -163,11 +192,11 @@ class ProxyLogging:
|
||||||
# try casting messages to str and get the first 100 characters, else mark as None
|
# try casting messages to str and get the first 100 characters, else mark as None
|
||||||
try:
|
try:
|
||||||
messages = str(messages)
|
messages = str(messages)
|
||||||
messages = messages[:10000]
|
messages = messages[:100]
|
||||||
except:
|
except:
|
||||||
messages = None
|
messages = None
|
||||||
|
|
||||||
request_info = f"\nRequest Model: {model}\nMessages: {messages}"
|
request_info = f"\nRequest Model: `{model}`\nMessages: `{messages}`"
|
||||||
else:
|
else:
|
||||||
request_info = ""
|
request_info = ""
|
||||||
|
|
||||||
|
@ -182,23 +211,13 @@ class ProxyLogging:
|
||||||
):
|
):
|
||||||
# only alert hanging responses if they have not been marked as success
|
# only alert hanging responses if they have not been marked as success
|
||||||
alerting_message = (
|
alerting_message = (
|
||||||
f"Requests are hanging - {self.alerting_threshold}s+ request time"
|
f"`Requests are hanging - {self.alerting_threshold}s+ request time`"
|
||||||
)
|
)
|
||||||
await self.alerting_handler(
|
await self.alerting_handler(
|
||||||
message=alerting_message + request_info,
|
message=alerting_message + request_info,
|
||||||
level="Medium",
|
level="Medium",
|
||||||
)
|
)
|
||||||
|
|
||||||
elif (
|
|
||||||
type == "slow_response" and start_time is not None and end_time is not None
|
|
||||||
):
|
|
||||||
slow_message = f"Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s"
|
|
||||||
if end_time - start_time > self.alerting_threshold:
|
|
||||||
await self.alerting_handler(
|
|
||||||
message=slow_message + request_info,
|
|
||||||
level="Low",
|
|
||||||
)
|
|
||||||
|
|
||||||
async def budget_alerts(
|
async def budget_alerts(
|
||||||
self,
|
self,
|
||||||
type: Literal[
|
type: Literal[
|
||||||
|
@ -207,6 +226,7 @@ class ProxyLogging:
|
||||||
"user_and_proxy_budget",
|
"user_and_proxy_budget",
|
||||||
"failed_budgets",
|
"failed_budgets",
|
||||||
"failed_tracking",
|
"failed_tracking",
|
||||||
|
"projected_limit_exceeded",
|
||||||
],
|
],
|
||||||
user_max_budget: float,
|
user_max_budget: float,
|
||||||
user_current_spend: float,
|
user_current_spend: float,
|
||||||
|
@ -240,6 +260,23 @@ class ProxyLogging:
|
||||||
level="High",
|
level="High",
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
elif type == "projected_limit_exceeded" and user_info is not None:
|
||||||
|
"""
|
||||||
|
Input variables:
|
||||||
|
user_info = {
|
||||||
|
"key_alias": key_alias,
|
||||||
|
"projected_spend": projected_spend,
|
||||||
|
"projected_exceeded_date": projected_exceeded_date,
|
||||||
|
}
|
||||||
|
user_max_budget=soft_limit,
|
||||||
|
user_current_spend=new_spend
|
||||||
|
"""
|
||||||
|
message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` {user_info["key_alias"]} \n`Expected Day of Error`: {user_info["projected_exceeded_date"]} \n`Current Spend`: {user_current_spend} \n`Projected Spend at end of month`: {user_info["projected_spend"]} \n`Soft Limit`: {user_max_budget}"""
|
||||||
|
await self.alerting_handler(
|
||||||
|
message=message,
|
||||||
|
level="High",
|
||||||
|
)
|
||||||
|
return
|
||||||
else:
|
else:
|
||||||
user_info = str(user_info)
|
user_info = str(user_info)
|
||||||
# percent of max_budget left to spend
|
# percent of max_budget left to spend
|
||||||
|
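To make the new projected_limit_exceeded branch concrete, here is how its alert text renders for one invented user_info payload (the key alias, dates, and dollar amounts are made up):

user_info = {  # example values only
    "key_alias": "prod-key-1",
    "projected_spend": 96.70,
    "projected_exceeded_date": "2024-03-21",
}
user_max_budget = 50.0   # soft limit
user_current_spend = 30.0

message = (
    f"\n🚨 `ProjectedLimitExceededError` 💸\n\n"
    f"`Key Alias:` {user_info['key_alias']} \n"
    f"`Expected Day of Error`: {user_info['projected_exceeded_date']} \n"
    f"`Current Spend`: {user_current_spend} \n"
    f"`Projected Spend at end of month`: {user_info['projected_spend']} \n"
    f"`Soft Limit`: {user_max_budget}"
)
print(message)
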
@ -303,7 +340,7 @@ class ProxyLogging:
|
||||||
# Get the current timestamp
|
# Get the current timestamp
|
||||||
current_time = datetime.now().strftime("%H:%M:%S")
|
current_time = datetime.now().strftime("%H:%M:%S")
|
||||||
formatted_message = (
|
formatted_message = (
|
||||||
f"Level: {level}\nTimestamp: {current_time}\n\nMessage: {message}"
|
f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}"
|
||||||
)
|
)
|
||||||
if self.alerting is None:
|
if self.alerting is None:
|
||||||
return
|
return
|
||||||
|
@ -329,7 +366,7 @@ class ProxyLogging:
|
||||||
else:
|
else:
|
||||||
raise Exception("Missing SENTRY_DSN from environment")
|
raise Exception("Missing SENTRY_DSN from environment")
|
||||||
|
|
||||||
async def failure_handler(self, original_exception):
|
async def failure_handler(self, original_exception, traceback_str=""):
|
||||||
"""
|
"""
|
||||||
Log failed db read/writes
|
Log failed db read/writes
|
||||||
|
|
||||||
|
@ -340,6 +377,8 @@ class ProxyLogging:
|
||||||
error_message = original_exception.detail
|
error_message = original_exception.detail
|
||||||
else:
|
else:
|
||||||
error_message = str(original_exception)
|
error_message = str(original_exception)
|
||||||
|
if isinstance(traceback_str, str):
|
||||||
|
error_message += traceback_str[:1000]
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self.alerting_handler(
|
self.alerting_handler(
|
||||||
message=f"DB read/write call failed: {error_message}",
|
message=f"DB read/write call failed: {error_message}",
|
||||||
|
@ -477,7 +516,11 @@ class PrismaClient:
|
||||||
|
|
||||||
for k, v in db_data.items():
|
for k, v in db_data.items():
|
||||||
if isinstance(v, dict):
|
if isinstance(v, dict):
|
||||||
|
try:
|
||||||
db_data[k] = json.dumps(v)
|
db_data[k] = json.dumps(v)
|
||||||
|
except:
|
||||||
|
# This avoids Prisma retrying this 5 times, and making 5 clients
|
||||||
|
db_data[k] = "failed-to-serialize-json"
|
||||||
return db_data
|
return db_data
|
||||||
|
|
||||||
@backoff.on_exception(
|
@backoff.on_exception(
|
||||||
|
@ -654,6 +697,9 @@ class PrismaClient:
|
||||||
"""
|
"""
|
||||||
Generic implementation of get data
|
Generic implementation of get data
|
||||||
"""
|
"""
|
||||||
|
verbose_proxy_logger.debug(
|
||||||
|
f"PrismaClient: get_generic_data: {key}, table_name: {table_name}"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
if table_name == "users":
|
if table_name == "users":
|
||||||
response = await self.db.litellm_usertable.find_first(
|
response = await self.db.litellm_usertable.find_first(
|
||||||
|
@ -673,8 +719,15 @@ class PrismaClient:
|
||||||
)
|
)
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
error_msg = f"LiteLLM Prisma Client Exception get_generic_data: {str(e)}"
|
||||||
|
print_verbose(error_msg)
|
||||||
|
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self.proxy_logging_obj.failure_handler(original_exception=e)
|
self.proxy_logging_obj.failure_handler(
|
||||||
|
original_exception=e, traceback_str=error_traceback
|
||||||
|
)
|
||||||
)
|
)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
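The exception paths added throughout this file share one pattern: build a short message, append traceback.format_exc(), hand both to failure_handler as a fire-and-forget task, then re-raise. A minimal sketch of that pattern, with a stub in place of ProxyLogging.failure_handler:

import asyncio
import traceback


async def failure_handler(original_exception: Exception, traceback_str: str = "") -> None:
    # Stand-in for ProxyLogging.failure_handler: just log what it would alert on.
    print(f"DB read/write call failed: {original_exception} {traceback_str[:200]}")


async def get_generic_data_like() -> None:
    try:
        raise RuntimeError("simulated Prisma error")
    except Exception as e:
        error_msg = f"LiteLLM Prisma Client Exception get_generic_data: {e}"
        error_traceback = error_msg + "\n" + traceback.format_exc()
        # Fire-and-forget so the alert does not delay re-raising to the caller.
        asyncio.create_task(
            failure_handler(original_exception=e, traceback_str=error_traceback)
        )
        raise


async def main() -> None:
    try:
        await get_generic_data_like()
    except RuntimeError:
        pass
    await asyncio.sleep(0)  # let the fire-and-forget task run


asyncio.run(main())
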
@ -712,6 +765,10 @@ class PrismaClient:
|
||||||
int
|
int
|
||||||
] = None, # pagination, number of rows to fetch when find_all==True
|
] = None, # pagination, number of rows to fetch when find_all==True
|
||||||
):
|
):
|
||||||
|
args_passed_in = locals()
|
||||||
|
verbose_proxy_logger.debug(
|
||||||
|
f"PrismaClient: get_data: token={token}, table_name: {table_name}, query_type: {query_type}, user_id: {user_id}, user_id_list: {user_id_list}, team_id: {team_id}, team_id_list: {team_id_list}, key_val: {key_val}"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
response: Any = None
|
response: Any = None
|
||||||
if (token is not None and table_name is None) or (
|
if (token is not None and table_name is None) or (
|
||||||
|
@ -733,7 +790,8 @@ class PrismaClient:
|
||||||
detail={"error": f"No token passed in. Token={token}"},
|
detail={"error": f"No token passed in. Token={token}"},
|
||||||
)
|
)
|
||||||
response = await self.db.litellm_verificationtoken.find_unique(
|
response = await self.db.litellm_verificationtoken.find_unique(
|
||||||
where={"token": hashed_token}
|
where={"token": hashed_token},
|
||||||
|
include={"litellm_budget_table": True},
|
||||||
)
|
)
|
||||||
if response is not None:
|
if response is not None:
|
||||||
# for prisma we need to cast the expires time to str
|
# for prisma we need to cast the expires time to str
|
||||||
|
@ -741,9 +799,16 @@ class PrismaClient:
|
||||||
response.expires, datetime
|
response.expires, datetime
|
||||||
):
|
):
|
||||||
response.expires = response.expires.isoformat()
|
response.expires = response.expires.isoformat()
|
||||||
|
else:
|
||||||
|
# Token does not exist.
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||||
|
detail=f"Authentication Error: invalid user key - user key does not exist in db. User Key={token}",
|
||||||
|
)
|
||||||
elif query_type == "find_all" and user_id is not None:
|
elif query_type == "find_all" and user_id is not None:
|
||||||
response = await self.db.litellm_verificationtoken.find_many(
|
response = await self.db.litellm_verificationtoken.find_many(
|
||||||
where={"user_id": user_id}
|
where={"user_id": user_id},
|
||||||
|
include={"litellm_budget_table": True},
|
||||||
)
|
)
|
||||||
if response is not None and len(response) > 0:
|
if response is not None and len(response) > 0:
|
||||||
for r in response:
|
for r in response:
|
||||||
|
@ -751,7 +816,8 @@ class PrismaClient:
|
||||||
r.expires = r.expires.isoformat()
|
r.expires = r.expires.isoformat()
|
||||||
elif query_type == "find_all" and team_id is not None:
|
elif query_type == "find_all" and team_id is not None:
|
||||||
response = await self.db.litellm_verificationtoken.find_many(
|
response = await self.db.litellm_verificationtoken.find_many(
|
||||||
where={"team_id": team_id}
|
where={"team_id": team_id},
|
||||||
|
include={"litellm_budget_table": True},
|
||||||
)
|
)
|
||||||
if response is not None and len(response) > 0:
|
if response is not None and len(response) > 0:
|
||||||
for r in response:
|
for r in response:
|
||||||
|
@ -794,7 +860,9 @@ class PrismaClient:
|
||||||
hashed_tokens.append(t)
|
hashed_tokens.append(t)
|
||||||
where_filter["token"]["in"] = hashed_tokens
|
where_filter["token"]["in"] = hashed_tokens
|
||||||
response = await self.db.litellm_verificationtoken.find_many(
|
response = await self.db.litellm_verificationtoken.find_many(
|
||||||
order={"spend": "desc"}, where=where_filter # type: ignore
|
order={"spend": "desc"},
|
||||||
|
where=where_filter, # type: ignore
|
||||||
|
include={"litellm_budget_table": True},
|
||||||
)
|
)
|
||||||
if response is not None:
|
if response is not None:
|
||||||
return response
|
return response
|
||||||
|
@ -914,12 +982,21 @@ class PrismaClient:
|
||||||
)
|
)
|
||||||
|
|
||||||
sql_query = f"""
|
sql_query = f"""
|
||||||
SELECT *
|
SELECT
|
||||||
FROM "LiteLLM_VerificationTokenView"
|
v.*,
|
||||||
WHERE token = '{token}'
|
t.spend AS team_spend,
|
||||||
|
t.max_budget AS team_max_budget,
|
||||||
|
t.tpm_limit AS team_tpm_limit,
|
||||||
|
t.rpm_limit AS team_rpm_limit,
|
||||||
|
m.aliases as team_model_aliases
|
||||||
|
FROM "LiteLLM_VerificationToken" AS v
|
||||||
|
LEFT JOIN "LiteLLM_TeamTable" AS t ON v.team_id = t.team_id
|
||||||
|
LEFT JOIN "LiteLLM_ModelTable" m ON t.model_id = m.id
|
||||||
|
WHERE v.token = '{token}'
|
||||||
"""
|
"""
|
||||||
|
|
||||||
response = await self.db.query_first(query=sql_query)
|
response = await self.db.query_first(query=sql_query)
|
||||||
|
|
||||||
if response is not None:
|
if response is not None:
|
||||||
response = LiteLLM_VerificationTokenView(**response)
|
response = LiteLLM_VerificationTokenView(**response)
|
||||||
# for prisma we need to cast the expires time to str
|
# for prisma we need to cast the expires time to str
|
||||||
|
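The rewritten query returns the key's columns alongside team-level columns (team_spend, team_max_budget, team_tpm_limit, team_rpm_limit, team_model_aliases) in a single row. Purely as an illustration of why that is useful, a hypothetical helper that falls back to team limits when the key has none; the field names mirror the SQL aliases, but the fallback logic itself is not taken from this diff:

from typing import Any, Dict, Optional


def effective_limits(row: Dict[str, Any]) -> Dict[str, Optional[float]]:
    # Key-level values win; team-level aliases from the JOIN fill the gaps.
    return {
        "max_budget": row.get("max_budget") if row.get("max_budget") is not None else row.get("team_max_budget"),
        "tpm_limit": row.get("tpm_limit") if row.get("tpm_limit") is not None else row.get("team_tpm_limit"),
        "rpm_limit": row.get("rpm_limit") if row.get("rpm_limit") is not None else row.get("team_rpm_limit"),
    }


row = {  # shape of a joined result row; values invented
    "token": "hashed-token",
    "max_budget": None,
    "tpm_limit": 1000,
    "rpm_limit": None,
    "team_max_budget": 200.0,
    "team_tpm_limit": 5000,
    "team_rpm_limit": 60,
}
print(effective_limits(row))
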
@ -929,12 +1006,17 @@ class PrismaClient:
|
||||||
response.expires = response.expires.isoformat()
|
response.expires = response.expires.isoformat()
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print_verbose(f"LiteLLM Prisma Client Exception: {e}")
|
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
traceback.print_exc()
|
prisma_query_info = f"LiteLLM Prisma Client Exception: Error with `get_data`. Args passed in: {args_passed_in}"
|
||||||
|
error_msg = prisma_query_info + str(e)
|
||||||
|
print_verbose(error_msg)
|
||||||
|
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||||
|
verbose_proxy_logger.debug(error_traceback)
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self.proxy_logging_obj.failure_handler(original_exception=e)
|
self.proxy_logging_obj.failure_handler(
|
||||||
|
original_exception=e, traceback_str=error_traceback
|
||||||
|
)
|
||||||
)
|
)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
@ -957,6 +1039,7 @@ class PrismaClient:
|
||||||
Add a key to the database. If it already exists, do nothing.
|
Add a key to the database. If it already exists, do nothing.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
verbose_proxy_logger.debug(f"PrismaClient: insert_data: {data}")
|
||||||
if table_name == "key":
|
if table_name == "key":
|
||||||
token = data["token"]
|
token = data["token"]
|
||||||
hashed_token = self.hash_token(token=token)
|
hashed_token = self.hash_token(token=token)
|
||||||
|
@ -1054,9 +1137,15 @@ class PrismaClient:
|
||||||
return new_user_notification_row
|
return new_user_notification_row
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print_verbose(f"LiteLLM Prisma Client Exception: {e}")
|
import traceback
|
||||||
|
|
||||||
|
error_msg = f"LiteLLM Prisma Client Exception in insert_data: {str(e)}"
|
||||||
|
print_verbose(error_msg)
|
||||||
|
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self.proxy_logging_obj.failure_handler(original_exception=e)
|
self.proxy_logging_obj.failure_handler(
|
||||||
|
original_exception=e, traceback_str=error_traceback
|
||||||
|
)
|
||||||
)
|
)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
@ -1083,6 +1172,9 @@ class PrismaClient:
|
||||||
"""
|
"""
|
||||||
Update existing data
|
Update existing data
|
||||||
"""
|
"""
|
||||||
|
verbose_proxy_logger.debug(
|
||||||
|
f"PrismaClient: update_data, table_name: {table_name}"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
db_data = self.jsonify_object(data=data)
|
db_data = self.jsonify_object(data=data)
|
||||||
if update_key_values is not None:
|
if update_key_values is not None:
|
||||||
|
@ -1102,7 +1194,13 @@ class PrismaClient:
|
||||||
+ f"DB Token Table update succeeded {response}"
|
+ f"DB Token Table update succeeded {response}"
|
||||||
+ "\033[0m"
|
+ "\033[0m"
|
||||||
)
|
)
|
||||||
return {"token": token, "data": db_data}
|
_data: dict = {}
|
||||||
|
if response is not None:
|
||||||
|
try:
|
||||||
|
_data = response.model_dump() # type: ignore
|
||||||
|
except Exception as e:
|
||||||
|
_data = response.dict()
|
||||||
|
return {"token": token, "data": _data}
|
||||||
elif (
|
elif (
|
||||||
user_id is not None
|
user_id is not None
|
||||||
or (table_name is not None and table_name == "user")
|
or (table_name is not None and table_name == "user")
|
||||||
|
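The try/except around model_dump() above is the usual pydantic v2-first, v1-fallback pattern. A small sketch of the same idea on a toy model; the proxy catches a broad Exception, the sketch narrows it to AttributeError for clarity:

from pydantic import BaseModel


class TokenRow(BaseModel):
    token: str
    spend: float = 0.0


def to_dict(obj: BaseModel) -> dict:
    # pydantic v2 exposes model_dump(); v1 only has dict().
    try:
        return obj.model_dump()
    except AttributeError:
        return obj.dict()


print(to_dict(TokenRow(token="hashed-token", spend=1.5)))
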
@ -1190,9 +1288,11 @@ class PrismaClient:
|
||||||
if t.token.startswith("sk-"): # type: ignore
|
if t.token.startswith("sk-"): # type: ignore
|
||||||
t.token = self.hash_token(token=t.token) # type: ignore
|
t.token = self.hash_token(token=t.token) # type: ignore
|
||||||
try:
|
try:
|
||||||
data_json = self.jsonify_object(data=t.model_dump())
|
data_json = self.jsonify_object(
|
||||||
|
data=t.model_dump(exclude_none=True)
|
||||||
|
)
|
||||||
except:
|
except:
|
||||||
data_json = self.jsonify_object(data=t.dict())
|
data_json = self.jsonify_object(data=t.dict(exclude_none=True))
|
||||||
batcher.litellm_verificationtoken.update(
|
batcher.litellm_verificationtoken.update(
|
||||||
where={"token": t.token}, # type: ignore
|
where={"token": t.token}, # type: ignore
|
||||||
data={**data_json}, # type: ignore
|
data={**data_json}, # type: ignore
|
||||||
|
@ -1231,10 +1331,16 @@ class PrismaClient:
|
||||||
"\033[91m" + f"DB User Table Batch update succeeded" + "\033[0m"
|
"\033[91m" + f"DB User Table Batch update succeeded" + "\033[0m"
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
error_msg = f"LiteLLM Prisma Client Exception - update_data: {str(e)}"
|
||||||
|
print_verbose(error_msg)
|
||||||
|
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self.proxy_logging_obj.failure_handler(original_exception=e)
|
self.proxy_logging_obj.failure_handler(
|
||||||
|
original_exception=e, traceback_str=error_traceback
|
||||||
|
)
|
||||||
)
|
)
|
||||||
print_verbose("\033[91m" + f"DB write failed: {e}" + "\033[0m")
|
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
# Define a retrying strategy with exponential backoff
|
# Define a retrying strategy with exponential backoff
|
||||||
|
@ -1285,8 +1391,15 @@ class PrismaClient:
|
||||||
where={"team_id": {"in": team_id_list}}
|
where={"team_id": {"in": team_id_list}}
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
error_msg = f"LiteLLM Prisma Client Exception - delete_data: {str(e)}"
|
||||||
|
print_verbose(error_msg)
|
||||||
|
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self.proxy_logging_obj.failure_handler(original_exception=e)
|
self.proxy_logging_obj.failure_handler(
|
||||||
|
original_exception=e, traceback_str=error_traceback
|
||||||
|
)
|
||||||
)
|
)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
@ -1309,8 +1422,15 @@ class PrismaClient:
|
||||||
)
|
)
|
||||||
await self.db.connect()
|
await self.db.connect()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
error_msg = f"LiteLLM Prisma Client Exception connect(): {str(e)}"
|
||||||
|
print_verbose(error_msg)
|
||||||
|
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self.proxy_logging_obj.failure_handler(original_exception=e)
|
self.proxy_logging_obj.failure_handler(
|
||||||
|
original_exception=e, traceback_str=error_traceback
|
||||||
|
)
|
||||||
)
|
)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
@ -1326,8 +1446,15 @@ class PrismaClient:
|
||||||
try:
|
try:
|
||||||
await self.db.disconnect()
|
await self.db.disconnect()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
error_msg = f"LiteLLM Prisma Client Exception disconnect(): {str(e)}"
|
||||||
|
print_verbose(error_msg)
|
||||||
|
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self.proxy_logging_obj.failure_handler(original_exception=e)
|
self.proxy_logging_obj.failure_handler(
|
||||||
|
original_exception=e, traceback_str=error_traceback
|
||||||
|
)
|
||||||
)
|
)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
@ -1550,10 +1677,28 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
|
||||||
if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
|
if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
|
||||||
# hash the api_key
|
# hash the api_key
|
||||||
api_key = hash_token(api_key)
|
api_key = hash_token(api_key)
|
||||||
if "headers" in metadata and "authorization" in metadata["headers"]:
|
|
||||||
metadata["headers"].pop(
|
# clean up litellm metadata
|
||||||
"authorization"
|
if isinstance(metadata, dict):
|
||||||
) # do not store the original `sk-..` api key in the db
|
clean_metadata = {}
|
||||||
|
verbose_proxy_logger.debug(
|
||||||
|
f"getting payload for SpendLogs, available keys in metadata: "
|
||||||
|
+ str(list(metadata.keys()))
|
||||||
|
)
|
||||||
|
for key in metadata:
|
||||||
|
if key in [
|
||||||
|
"headers",
|
||||||
|
"endpoint",
|
||||||
|
"model_group",
|
||||||
|
"deployment",
|
||||||
|
"model_info",
|
||||||
|
"caching_groups",
|
||||||
|
"previous_models",
|
||||||
|
]:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
clean_metadata[key] = metadata[key]
|
||||||
|
|
||||||
if litellm.cache is not None:
|
if litellm.cache is not None:
|
||||||
cache_key = litellm.cache.get_cache_key(**kwargs)
|
cache_key = litellm.cache.get_cache_key(**kwargs)
|
||||||
else:
|
else:
|
||||||
|
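A compact sketch of the metadata scrub introduced above: keep everything except the internal or noisy keys before the payload is written to SpendLogs. The denylist is taken from the diff; the sample metadata dict is invented:

DENYLIST = {
    "headers",
    "endpoint",
    "model_group",
    "deployment",
    "model_info",
    "caching_groups",
    "previous_models",
}


def clean_metadata(metadata: dict) -> dict:
    # Keep only keys that are safe and useful to persist in SpendLogs.
    return {k: v for k, v in metadata.items() if k not in DENYLIST}


sample = {
    "headers": {"authorization": "sk-..."},  # never stored
    "user_api_key_team_id": "team-123",
    "tags": ["batch-a"],
    "model_info": {"id": "abc"},
}
print(clean_metadata(sample))
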
@ -1577,7 +1722,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
|
||||||
"team_id": kwargs.get("litellm_params", {})
|
"team_id": kwargs.get("litellm_params", {})
|
||||||
.get("metadata", {})
|
.get("metadata", {})
|
||||||
.get("user_api_key_team_id", ""),
|
.get("user_api_key_team_id", ""),
|
||||||
"metadata": metadata,
|
"metadata": clean_metadata,
|
||||||
"cache_key": cache_key,
|
"cache_key": cache_key,
|
||||||
"spend": kwargs.get("response_cost", 0),
|
"spend": kwargs.get("response_cost", 0),
|
||||||
"total_tokens": usage.get("total_tokens", 0),
|
"total_tokens": usage.get("total_tokens", 0),
|
||||||
|
@ -1585,6 +1730,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
|
||||||
"completion_tokens": usage.get("completion_tokens", 0),
|
"completion_tokens": usage.get("completion_tokens", 0),
|
||||||
"request_tags": metadata.get("tags", []),
|
"request_tags": metadata.get("tags", []),
|
||||||
"end_user": kwargs.get("user", ""),
|
"end_user": kwargs.get("user", ""),
|
||||||
|
"api_base": litellm_params.get("api_base", ""),
|
||||||
}
|
}
|
||||||
|
|
||||||
verbose_proxy_logger.debug(f"SpendTable: created payload - payload: {payload}\n\n")
|
verbose_proxy_logger.debug(f"SpendTable: created payload - payload: {payload}\n\n")
|
||||||
|
@ -1712,6 +1858,69 @@ async def _read_request_body(request):
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _is_projected_spend_over_limit(
|
||||||
|
current_spend: float, soft_budget_limit: Optional[float]
|
||||||
|
):
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
if soft_budget_limit is None:
|
||||||
|
# If there's no limit, we can't exceed it.
|
||||||
|
return False
|
||||||
|
|
||||||
|
today = date.today()
|
||||||
|
|
||||||
|
# Finding the first day of the next month, then subtracting one day to get the end of the current month.
|
||||||
|
if today.month == 12: # December edge case
|
||||||
|
end_month = date(today.year + 1, 1, 1) - timedelta(days=1)
|
||||||
|
else:
|
||||||
|
end_month = date(today.year, today.month + 1, 1) - timedelta(days=1)
|
||||||
|
|
||||||
|
remaining_days = (end_month - today).days
|
||||||
|
|
||||||
|
# Check for the start of the month to avoid division by zero
|
||||||
|
if today.day == 1:
|
||||||
|
daily_spend_estimate = current_spend
|
||||||
|
else:
|
||||||
|
daily_spend_estimate = current_spend / (today.day - 1)
|
||||||
|
|
||||||
|
# Total projected spend for the month
|
||||||
|
projected_spend = current_spend + (daily_spend_estimate * remaining_days)
|
||||||
|
|
||||||
|
if projected_spend > soft_budget_limit:
|
||||||
|
print_verbose("Projected spend exceeds soft budget limit!")
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _get_projected_spend_over_limit(
|
||||||
|
current_spend: float, soft_budget_limit: Optional[float]
|
||||||
|
) -> Optional[tuple]:
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
if soft_budget_limit is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
today = datetime.date.today()
|
||||||
|
end_month = datetime.date(today.year, today.month + 1, 1) - datetime.timedelta(
|
||||||
|
days=1
|
||||||
|
)
|
||||||
|
remaining_days = (end_month - today).days
|
||||||
|
|
||||||
|
daily_spend = current_spend / (
|
||||||
|
today.day - 1
|
||||||
|
) # assuming the current spend till today (not including today)
|
||||||
|
projected_spend = daily_spend * remaining_days
|
||||||
|
|
||||||
|
if projected_spend > soft_budget_limit:
|
||||||
|
approx_days = soft_budget_limit / daily_spend
|
||||||
|
limit_exceed_date = today + datetime.timedelta(days=approx_days)
|
||||||
|
|
||||||
|
# return the projected spend and the date it will be exceeded
|
||||||
|
return projected_spend, limit_exceed_date
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
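A worked example of the projection used by _is_projected_spend_over_limit, written as a self-contained sketch with a fixed "today" so the arithmetic is reproducible (the spend figures and the $50 soft limit are invented):

from datetime import date, timedelta


def projected_spend_for_month(current_spend: float, today: date) -> float:
    # End of the current month (handles the December wrap-around).
    if today.month == 12:
        end_month = date(today.year + 1, 1, 1) - timedelta(days=1)
    else:
        end_month = date(today.year, today.month + 1, 1) - timedelta(days=1)
    remaining_days = (end_month - today).days

    # Spend per elapsed day; on the 1st there is no history yet, so use the day's spend.
    daily = current_spend if today.day == 1 else current_spend / (today.day - 1)
    return current_spend + daily * remaining_days


# 10 March, $30 spent over the first 9 days -> ~$3.33/day, 21 days left in March.
today = date(2024, 3, 10)
projection = projected_spend_for_month(30.0, today)
print(round(projection, 2))   # ~100.0
print(projection > 50.0)      # exceeds an assumed $50 soft limit -> alert
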
def _is_valid_team_configs(team_id=None, team_config=None, request_data=None):
|
def _is_valid_team_configs(team_id=None, team_config=None, request_data=None):
|
||||||
if team_id is None or team_config is None or request_data is None:
|
if team_id is None or team_config is None or request_data is None:
|
||||||
return
|
return
|
||||||
Some files were not shown because too many files have changed in this diff.