Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 11:14:04 +00:00)

Commit 78303b79ee: Merge branch 'main' into main

124 changed files with 6716 additions and 1078 deletions
@@ -40,7 +40,7 @@ jobs:
pip install "aioboto3==12.3.0"
pip install langchain
pip install lunary==0.2.5
-pip install "langfuse==2.7.3"
+pip install "langfuse==2.27.1"
pip install numpydoc
pip install traceloop-sdk==0.0.69
pip install openai
.gitignore (vendored): 1 line changed

@@ -51,3 +51,4 @@ loadtest_kub.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_super_secret_config.yaml
+litellm/proxy/_super_secret_config.yaml
@@ -7,7 +7,7 @@ repos:
    rev: 7.0.0  # The version of flake8 to use
    hooks:
    -   id: flake8
-       exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/integrations/|^litellm/proxy/tests/
+       exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/proxy/tests/
        additional_dependencies: [flake8-print]
        files: litellm/.*\.py
-   repo: local
@@ -227,6 +227,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ |
| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ |
+| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ |
| [xinference [Xorbits Inference]](https://docs.litellm.ai/docs/providers/xinference) | | | | | ✅ |
cookbook/liteLLM_IBM_Watsonx.ipynb (vendored, new file): 300 lines
File diff suppressed because one or more lines are too long
@@ -23,6 +23,14 @@ response = completion(model="gpt-3.5-turbo", messages=messages)
response = completion("command-nightly", messages)
```

+## JSON Logs
+
+If you need to store the logs as JSON, just set `litellm.json_logs = True`.
+
+We currently just log the raw POST request from litellm as JSON - [**See Code**].
+
+[Share feedback here](https://github.com/BerriAI/litellm/issues)
+
## Logger Function
But sometimes all you care about is seeing exactly what's getting sent to your api call and what's being returned - e.g. if the api call is failing, why is that happening? what are the exact params being set?
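For reference, a minimal sketch of how the `json_logs` flag documented in the hunk above would be used in the SDK (illustrative only; the model name and message are placeholders):

```python
import litellm

# Illustrative sketch: emit litellm's debug logs as JSON instead of plain text
litellm.json_logs = True

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```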
@@ -213,3 +213,349 @@ asyncio.run(loadtest_fn())

```

## Multi-Instance TPM/RPM Load Test (Router)

Test if your defined tpm/rpm limits are respected across multiple instances of the Router object.

In our test:
- Max RPM per deployment is = 100 requests per minute
- Max Throughput / min on router = 200 requests per minute (2 deployments)
- Load we'll send through router = 600 requests per minute

:::info

If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)

:::

### Code

Let's hit the router with 600 requests per minute.

Copy this script 👇. Save it as `test_loadtest_router.py` AND run it with `python3 test_loadtest_router.py`

```python
from litellm import Router
import litellm
litellm.suppress_debug_info = True
litellm.set_verbose = False
import logging
logging.basicConfig(level=logging.CRITICAL)
import os, random, uuid, time, asyncio

# Model list for OpenAI and Anthropic models
model_list = [
    {
        "model_name": "fake-openai-endpoint",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": "my-fake-key",
            "api_base": "http://0.0.0.0:8080",
            "rpm": 100
        },
    },
    {
        "model_name": "fake-openai-endpoint",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": "my-fake-key",
            "api_base": "http://0.0.0.0:8081",
            "rpm": 100
        },
    },
]

router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))


async def router_completion_non_streaming():
    try:
        client: Router = random.sample([router_1, router_2], 1)[0]  # randomly pick b/w clients
        # print(f"client={client}")
        response = await client.acompletion(
            model="fake-openai-endpoint",  # [CHANGE THIS] (if you call it something else on your proxy)
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
        )
        return response
    except Exception as e:
        # print(e)
        return None


async def loadtest_fn():
    start = time.time()
    n = 600  # Number of concurrent tasks
    tasks = [router_completion_non_streaming() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    print(n, time.time() - start, len(successful_completions))


def get_utc_datetime():
    import datetime as dt
    from datetime import datetime

    if hasattr(dt, "UTC"):
        return datetime.now(dt.UTC)  # type: ignore
    else:
        return datetime.utcnow()  # type: ignore


# Run the event loop to execute the async function
async def parent_fn():
    for _ in range(10):
        dt = get_utc_datetime()
        current_minute = dt.strftime("%H-%M")
        print(f"triggered new batch - {current_minute}")
        await loadtest_fn()
        await asyncio.sleep(10)

asyncio.run(parent_fn())
```

## Multi-Instance TPM/RPM Load Test (Proxy)

Test if your defined tpm/rpm limits are respected across multiple instances.

The quickest way to do this is by testing the [proxy](./proxy/quick_start.md). The proxy uses the [router](./routing.md) under the hood, so if you're using either of them, this test should work for you.

In our test:
- Max RPM per deployment is = 100 requests per minute
- Max Throughput / min on proxy = 200 requests per minute (2 deployments)
- Load we'll send to proxy = 600 requests per minute

So we'll send 600 requests per minute, but expect only 200 requests per minute to succeed.

:::info

If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)

:::

### 1. Setup config

```yaml
model_list:
- litellm_params:
    api_base: http://0.0.0.0:8080
    api_key: my-fake-key
    model: openai/my-fake-model
    rpm: 100
  model_name: fake-openai-endpoint
- litellm_params:
    api_base: http://0.0.0.0:8081
    api_key: my-fake-key
    model: openai/my-fake-model-2
    rpm: 100
  model_name: fake-openai-endpoint
router_settings:
  num_retries: 0
  enable_pre_call_checks: true
  redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
  redis_password: os.environ/REDIS_PASSWORD
  redis_port: os.environ/REDIS_PORT
  routing_strategy: usage-based-routing-v2
```

### 2. Start proxy 2 instances

**Instance 1**
```bash
litellm --config /path/to/config.yaml --port 4000

## RUNNING on http://0.0.0.0:4000
```

**Instance 2**
```bash
litellm --config /path/to/config.yaml --port 4001

## RUNNING on http://0.0.0.0:4001
```

### 3. Run Test

Let's hit the proxy with 600 requests per minute.

Copy this script 👇. Save it as `test_loadtest_proxy.py` AND run it with `python3 test_loadtest_proxy.py`

```python
from openai import AsyncOpenAI, AsyncAzureOpenAI
import random, uuid
import time, asyncio, litellm
# import logging
# logging.basicConfig(level=logging.DEBUG)
#### LITELLM PROXY ####
litellm_client = AsyncOpenAI(
    api_key="sk-1234",  # [CHANGE THIS]
    base_url="http://0.0.0.0:4000"
)
litellm_client_2 = AsyncOpenAI(
    api_key="sk-1234",  # [CHANGE THIS]
    base_url="http://0.0.0.0:4001"
)


async def proxy_completion_non_streaming():
    try:
        client = random.sample([litellm_client, litellm_client_2], 1)[0]  # randomly pick b/w clients
        # print(f"client={client}")
        response = await client.chat.completions.create(
            model="fake-openai-endpoint",  # [CHANGE THIS] (if you call it something else on your proxy)
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
        )
        return response
    except Exception as e:
        # print(e)
        return None


async def loadtest_fn():
    start = time.time()
    n = 600  # Number of concurrent tasks
    tasks = [proxy_completion_non_streaming() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    print(n, time.time() - start, len(successful_completions))


def get_utc_datetime():
    import datetime as dt
    from datetime import datetime

    if hasattr(dt, "UTC"):
        return datetime.now(dt.UTC)  # type: ignore
    else:
        return datetime.utcnow()  # type: ignore


# Run the event loop to execute the async function
async def parent_fn():
    for _ in range(10):
        dt = get_utc_datetime()
        current_minute = dt.strftime("%H-%M")
        print(f"triggered new batch - {current_minute}")
        await loadtest_fn()
        await asyncio.sleep(10)

asyncio.run(parent_fn())
```


### Extra - Setup Fake OpenAI Server

Let's setup a fake openai server with a RPM limit of 100.

Let's call our file `fake_openai_server.py`.

```python
# import sys, os
# sys.path.insert(
#     0, os.path.abspath("../")
# )  # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi import FastAPI, Request, HTTPException, UploadFile, File
import httpx, os, json
from openai import AsyncOpenAI
from typing import Optional
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse


class ProxyException(Exception):
    # NOTE: DO NOT MODIFY THIS
    # This is used to map exactly to OPENAI Exceptions
    def __init__(
        self,
        message: str,
        type: str,
        param: Optional[str],
        code: Optional[int],
    ):
        self.message = message
        self.type = type
        self.param = param
        self.code = code

    def to_dict(self) -> dict:
        """Converts the ProxyException instance to a dictionary."""
        return {
            "message": self.message,
            "type": self.type,
            "param": self.param,
            "code": self.code,
        }


limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
app.state.limiter = limiter


@app.exception_handler(RateLimitExceeded)
async def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
    return JSONResponse(status_code=429,
                        content={"detail": "Rate Limited!"})

app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
@limiter.limit("100/minute")
async def completion(request: Request):
    # raise HTTPException(status_code=429, detail="Rate Limited!")
    return {
        "id": "chatcmpl-123",
        "object": "chat.completion",
        "created": 1677652288,
        "model": None,
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "\n\nHello there, how may I assist you today?",
            },
            "logprobs": None,
            "finish_reason": "stop"
        }],
        "usage": {
            "prompt_tokens": 9,
            "completion_tokens": 12,
            "total_tokens": 21
        }
    }


if __name__ == "__main__":
    import socket
    import uvicorn
    port = 8080
    while True:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        result = sock.connect_ex(('0.0.0.0', port))
        if result != 0:
            print(f"Port {port} is available, starting server...")
            break
        else:
            port += 1

    uvicorn.run(app, host="0.0.0.0", port=port)
```

```bash
python3 fake_openai_server.py
```
@@ -331,49 +331,25 @@ response = litellm.completion(model="gpt-3.5-turbo", messages=messages, metadata
## Examples

### Custom Callback to track costs for Streaming + Non-Streaming
+By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async)
```python

+# Step 1. Write your custom callback function
def track_cost_callback(
    kwargs,                 # kwargs to completion
    completion_response,    # response from completion
    start_time, end_time    # start/end time
):
    try:
-        # init logging config
-        logging.basicConfig(
-                filename='cost.log',
-                level=logging.INFO,
-                format='%(asctime)s - %(message)s',
-                datefmt='%Y-%m-%d %H:%M:%S'
-        )
-
-        # check if it has collected an entire stream response
-        if "complete_streaming_response" in kwargs:
-            # for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
-            completion_response = kwargs["complete_streaming_response"]
-            input_text = kwargs["messages"]
-            output_text = completion_response["choices"][0]["message"]["content"]
-            response_cost = litellm.completion_cost(
-                model = kwargs["model"],
-                messages = input_text,
-                completion = output_text
-            )
-            print("streaming response_cost", response_cost)
-            logging.info(f"Model {kwargs['model']} Cost: ${response_cost:.8f}")
-
-        # for non streaming responses
-        else:
-            # we pass the completion_response obj
-            if kwargs["stream"] != True:
-                response_cost = litellm.completion_cost(completion_response=completion_response)
+        response_cost = kwargs["response_cost"] # litellm calculates response cost for you
        print("regular response_cost", response_cost)
-                logging.info(f"Model {completion_response.model} Cost: ${response_cost:.8f}")
    except:
        pass

-# Assign the custom callback function
+# Step 2. Assign the custom callback function
litellm.success_callback = [track_cost_callback]

+# Step 3. Make litellm.completion call
response = completion(
    model="gpt-3.5-turbo",
    messages=[
@@ -121,10 +121,12 @@ response = completion(
    metadata={
        "generation_name": "ishaan-test-generation", # set langfuse Generation Name
        "generation_id": "gen-id22", # set langfuse Generation ID
-        "trace_id": "trace-id22", # set langfuse Trace ID
        "trace_user_id": "user-id2", # set langfuse Trace User ID
        "session_id": "session-1", # set langfuse Session ID
        "tags": ["tag1", "tag2"] # set langfuse Tags
+        "trace_id": "trace-id22", # set langfuse Trace ID
+        ### OR ###
+        "existing_trace_id": "trace-id22", # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
    },
)
@@ -167,6 +169,9 @@ messages = [
chat(messages)
```

+## Redacting Messages, Response Content from Langfuse Logging
+
+Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged.
+
## Troubleshooting & Errors
### Data not getting logged to Langfuse?
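For reference, a minimal sketch of the redaction flag introduced in the hunk above, used alongside the langfuse callback (illustrative only; assumes the flag behaves as described):

```python
import litellm

# Illustrative sketch: keep sending traces to langfuse, but redact message/response content
litellm.success_callback = ["langfuse"]
litellm.turn_off_message_logging = True  # request metadata is still logged

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "sensitive text that should stay out of the trace"}],
)
```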
docs/my-website/docs/observability/openmeter.md (new file): 97 lines

@@ -0,0 +1,97 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# OpenMeter - Usage-Based Billing

[OpenMeter](https://openmeter.io/) is an Open Source Usage-Based Billing solution for AI/Cloud applications. It integrates with Stripe for easy billing.

<Image img={require('../../img/openmeter.png')} />

:::info
We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
join our [discord](https://discord.gg/wuPM9dRgDw)
:::


## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with OpenMeter

Get your OpenMeter API Key from https://openmeter.cloud/meters

```python
litellm.success_callback = ["openmeter"] # logs cost + usage of successful calls to openmeter
```


<Tabs>
<TabItem value="sdk" label="SDK">

```python
# pip install litellm
import litellm
import os

# from https://openmeter.cloud
os.environ["OPENMETER_API_ENDPOINT"] = ""
os.environ["OPENMETER_API_KEY"] = ""

# LLM API Keys
os.environ['OPENAI_API_KEY']=""

# set openmeter as a callback, litellm will send the data to openmeter
litellm.success_callback = ["openmeter"]

# openai call
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Hi 👋 - i'm openai"}
    ]
)
```

</TabItem>
<TabItem value="proxy" label="PROXY">

1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
    api_key: my-fake-key
    model: openai/my-fake-model
  model_name: fake-openai-endpoint

litellm_settings:
  success_callback: ["openmeter"] # 👈 KEY CHANGE
```

2. Start Proxy

```
litellm --config /path/to/config.yaml
```

3. Test it!

```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
    "model": "fake-openai-endpoint",
    "messages": [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ]
}
'
```

</TabItem>
</Tabs>


<Image img={require('../../img/openmeter_img_2.png')} />
@@ -40,5 +40,9 @@ response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content
print(response)
```

+## Redacting Messages, Response Content from Sentry Logging
+
+Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to sentry, but request metadata will still be logged.
+
[Let us know](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+) if you need any additional options from Sentry.
@@ -53,6 +53,50 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported.
| open-mixtral-8x22b | `completion(model="mistral/open-mixtral-8x22b", messages)` |


## Function Calling

```python
import os
from litellm import completion

# set env
os.environ["MISTRAL_API_KEY"] = "your-api-key"

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        },
    }
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]

response = completion(
    model="mistral/mistral-large-latest",
    messages=messages,
    tools=tools,
    tool_choice="auto",
)
# Add any assertions here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
    response.choices[0].message.tool_calls[0].function.arguments, str
)
```

## Sample Usage - Embedding
```python
from litellm import embedding
@@ -4,6 +4,13 @@ LiteLLM supports all models on VLLM.
🚀[Code Tutorial](https://github.com/BerriAI/litellm/blob/main/cookbook/VLLM_Model_Testing.ipynb)


+:::info
+
+To call a HOSTED VLLM Endpoint use [these docs](./openai_compatible.md)
+
+:::
+
### Quick Start
```
pip install litellm vllm
docs/my-website/docs/providers/watsonx.md (new file): 284 lines

@@ -0,0 +1,284 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# IBM watsonx.ai

LiteLLM supports all IBM [watsonx.ai](https://watsonx.ai/) foundational models and embeddings.

## Environment Variables
```python
os.environ["WATSONX_URL"] = ""  # (required) Base URL of your WatsonX instance
# (required) either one of the following:
os.environ["WATSONX_APIKEY"] = ""  # IBM cloud API key
os.environ["WATSONX_TOKEN"] = ""  # IAM auth token
# optional - can also be passed as params to completion() or embedding()
os.environ["WATSONX_PROJECT_ID"] = ""  # Project ID of your WatsonX instance
os.environ["WATSONX_DEPLOYMENT_SPACE_ID"] = ""  # ID of your deployment space to use deployed models
```

See [here](https://cloud.ibm.com/apidocs/watsonx-ai#api-authentication) for more information on how to get an access token to authenticate to watsonx.ai.

## Usage

<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_IBM_Watsonx.ipynb">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

```python
import os
from litellm import completion

os.environ["WATSONX_URL"] = ""
os.environ["WATSONX_APIKEY"] = ""

response = completion(
    model="watsonx/ibm/granite-13b-chat-v2",
    messages=[{ "content": "what is your favorite colour?","role": "user"}],
    project_id="<my-project-id>"  # or pass with os.environ["WATSONX_PROJECT_ID"]
)

response = completion(
    model="watsonx/meta-llama/llama-3-8b-instruct",
    messages=[{ "content": "what is your favorite colour?","role": "user"}],
    project_id="<my-project-id>"
)
```

## Usage - Streaming
```python
import os
from litellm import completion

os.environ["WATSONX_URL"] = ""
os.environ["WATSONX_APIKEY"] = ""
os.environ["WATSONX_PROJECT_ID"] = ""

response = completion(
    model="watsonx/ibm/granite-13b-chat-v2",
    messages=[{ "content": "what is your favorite colour?","role": "user"}],
    stream=True
)
for chunk in response:
    print(chunk)
```

#### Example Streaming Output Chunk
```json
{
    "choices": [
        {
            "finish_reason": null,
            "index": 0,
            "delta": {
                "content": "I don't have a favorite color, but I do like the color blue. What's your favorite color?"
            }
        }
    ],
    "created": null,
    "model": "watsonx/ibm/granite-13b-chat-v2",
    "usage": {
        "prompt_tokens": null,
        "completion_tokens": null,
        "total_tokens": null
    }
}
```

## Usage - Models in deployment spaces

Models that have been deployed to a deployment space (e.g.: tuned models) can be called using the `deployment/<deployment_id>` format (where `<deployment_id>` is the ID of the deployed model in your deployment space).

The ID of your deployment space must also be set in the environment variable `WATSONX_DEPLOYMENT_SPACE_ID` or passed to the function as `space_id=<deployment_space_id>`.

```python
import litellm
response = litellm.completion(
    model="watsonx/deployment/<deployment_id>",
    messages=[{"content": "Hello, how are you?", "role": "user"}],
    space_id="<deployment_space_id>"
)
```

## Usage - Embeddings

LiteLLM also supports making requests to IBM watsonx.ai embedding models. The credential needed for this is the same as for completion.

```python
from litellm import embedding

response = embedding(
    model="watsonx/ibm/slate-30m-english-rtrvr",
    input=["What is the capital of France?"],
    project_id="<my-project-id>"
)
print(response)
# EmbeddingResponse(model='ibm/slate-30m-english-rtrvr', data=[{'object': 'embedding', 'index': 0, 'embedding': [-0.037463713, -0.02141933, -0.02851813, 0.015519324, ..., -0.0021367231, -0.01704561, -0.001425816, 0.0035238306]}], object='list', usage=Usage(prompt_tokens=8, total_tokens=8))
```

## OpenAI Proxy Usage

Here's how to call IBM watsonx.ai with the LiteLLM Proxy Server

### 1. Save keys in your environment

```bash
export WATSONX_URL=""
export WATSONX_APIKEY=""
export WATSONX_PROJECT_ID=""
```

### 2. Start the proxy

<Tabs>
<TabItem value="cli" label="CLI">

```bash
$ litellm --model watsonx/meta-llama/llama-3-8b-instruct

# Server running on http://0.0.0.0:4000
```

</TabItem>
<TabItem value="config" label="config.yaml">

```yaml
model_list:
  - model_name: llama-3-8b
    litellm_params:
      # all params accepted by litellm.completion()
      model: watsonx/meta-llama/llama-3-8b-instruct
      api_key: "os.environ/WATSONX_API_KEY" # does os.getenv("WATSONX_API_KEY")
```
</TabItem>
</Tabs>

### 3. Test it

<Tabs>
<TabItem value="Curl" label="Curl Request">

```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
    "model": "llama-3-8b",
    "messages": [
        {
            "role": "user",
            "content": "what is your favorite colour?"
        }
    ]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">

```python
import openai
client = openai.OpenAI(
    api_key="anything",
    base_url="http://0.0.0.0:4000"
)

# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="llama-3-8b", messages=[
    {
        "role": "user",
        "content": "what is your favorite colour?"
    }
])

print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">

```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage

chat = ChatOpenAI(
    openai_api_base="http://0.0.0.0:4000",  # set openai_api_base to the LiteLLM Proxy
    model="llama-3-8b",
    temperature=0.1
)

messages = [
    SystemMessage(
        content="You are a helpful assistant that im using to make a test request to."
    ),
    HumanMessage(
        content="test from litellm. tell me why it's amazing in 1 sentence"
    ),
]
response = chat(messages)

print(response)
```
</TabItem>
</Tabs>


## Authentication

### Passing credentials as parameters

You can also pass the credentials as parameters to the completion and embedding functions.

```python
import os
from litellm import completion

response = completion(
    model="watsonx/ibm/granite-13b-chat-v2",
    messages=[{ "content": "What is your favorite color?","role": "user"}],
    url="",
    api_key="",
    project_id=""
)
```


## Supported IBM watsonx.ai Models

Here are some examples of models available in IBM watsonx.ai that you can use with LiteLLM:

| Model Name | Command |
| ---------- | --------- |
| Flan T5 XXL | `completion(model=watsonx/google/flan-t5-xxl, messages=messages)` |
| Flan Ul2 | `completion(model=watsonx/google/flan-ul2, messages=messages)` |
| Mt0 XXL | `completion(model=watsonx/bigscience/mt0-xxl, messages=messages)` |
| Gpt Neox | `completion(model=watsonx/eleutherai/gpt-neox-20b, messages=messages)` |
| Mpt 7B Instruct2 | `completion(model=watsonx/ibm/mpt-7b-instruct2, messages=messages)` |
| Starcoder | `completion(model=watsonx/bigcode/starcoder, messages=messages)` |
| Llama 2 70B Chat | `completion(model=watsonx/meta-llama/llama-2-70b-chat, messages=messages)` |
| Llama 2 13B Chat | `completion(model=watsonx/meta-llama/llama-2-13b-chat, messages=messages)` |
| Granite 13B Instruct | `completion(model=watsonx/ibm/granite-13b-instruct-v1, messages=messages)` |
| Granite 13B Chat | `completion(model=watsonx/ibm/granite-13b-chat-v1, messages=messages)` |
| Flan T5 XL | `completion(model=watsonx/google/flan-t5-xl, messages=messages)` |
| Granite 13B Chat V2 | `completion(model=watsonx/ibm/granite-13b-chat-v2, messages=messages)` |
| Granite 13B Instruct V2 | `completion(model=watsonx/ibm/granite-13b-instruct-v2, messages=messages)` |
| Elyza Japanese Llama 2 7B Instruct | `completion(model=watsonx/elyza/elyza-japanese-llama-2-7b-instruct, messages=messages)` |
| Mixtral 8X7B Instruct V01 Q | `completion(model=watsonx/ibm-mistralai/mixtral-8x7b-instruct-v01-q, messages=messages)` |


For a list of all available models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx&locale=en&audience=wdp).


## Supported IBM watsonx.ai Embedding Models

| Model Name | Function Call |
|----------------------|---------------------------------------------|
| Slate 30m | `embedding(model="watsonx/ibm/slate-30m-english-rtrvr", input=input)` |
| Slate 125m | `embedding(model="watsonx/ibm/slate-125m-english-rtrvr", input=input)` |


For a list of all available embedding models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx).
@@ -1,13 +1,13 @@
-# Slack Alerting
+# 🚨 Alerting

Get alerts for:
-- hanging LLM api calls
-- failed LLM api calls
-- slow LLM api calls
-- budget Tracking per key/user:
+- Hanging LLM api calls
+- Failed LLM api calls
+- Slow LLM api calls
+- Budget Tracking per key/user:
    - When a User/Key crosses their Budget
    - When a User/Key is 15% away from crossing their Budget
-- failed db read/writes
+- Failed db read/writes

## Quick Start
@@ -62,9 +62,11 @@ model_list:

litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
  drop_params: True
+  success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM Logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env

general_settings:
  master_key: sk-1234 # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
+  alerting: ["slack"] # [OPTIONAL] If you want Slack Alerts for Hanging LLM requests, Slow llm responses, Budget Alerts. Make sure to set `SLACK_WEBHOOK_URL` in your env
```
:::info
@@ -11,40 +11,37 @@ You can find the Dockerfile to build litellm proxy [here](https://github.com/Ber

<TabItem value="basic" label="Basic">

-**Step 1. Create a file called `litellm_config.yaml`**
+### Step 1. CREATE config.yaml

-Example `litellm_config.yaml` (the `os.environ/` prefix means litellm will read `AZURE_API_BASE` from the env)
+Example `litellm_config.yaml`
+
```yaml
model_list:
  - model_name: azure-gpt-3.5
    litellm_params:
      model: azure/<your-azure-model-deployment>
-      api_base: os.environ/AZURE_API_BASE
-      api_key: os.environ/AZURE_API_KEY
+      api_base: os.environ/AZURE_API_BASE # runs os.getenv("AZURE_API_BASE")
+      api_key: os.environ/AZURE_API_KEY # runs os.getenv("AZURE_API_KEY")
      api_version: "2023-07-01-preview"
```

-**Step 2. Run litellm docker image**
-
-See the latest available ghcr docker image here:
-https://github.com/berriai/litellm/pkgs/container/litellm
-
-Your litellm config.yaml should be called `litellm_config.yaml` in the directory you run this command.
-The `-v` command will mount that file
-
-Pass `AZURE_API_KEY` and `AZURE_API_BASE` since we set them in step 1
+### Step 2. RUN Docker Image

```shell
docker run \
    -v $(pwd)/litellm_config.yaml:/app/config.yaml \
    -e AZURE_API_KEY=d6*********** \
    -e AZURE_API_BASE=https://openai-***********/ \
    -p 4000:4000 \
    ghcr.io/berriai/litellm:main-latest \
    --config /app/config.yaml --detailed_debug
```

-**Step 3. Send a Test Request**
+Get Latest Image 👉 [here](https://github.com/berriai/litellm/pkgs/container/litellm)
+
+### Step 3. TEST Request

Pass `model=azure-gpt-3.5` this was set on step 1
@@ -231,13 +228,16 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
| Docs | When to Use |
| --- | --- |
| [Quick Start](#quick-start) | call 100+ LLMs + Load Balancing |
-| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend |
+| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend (Note: When deploying with a database, providing a `DATABASE_URL` and `LITELLM_MASTER_KEY` are required in your env) |
| [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
| [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |

## Deploy with Database
### Docker, Kubernetes, Helm Chart

+Requirements:
+- Need a postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc). Set `DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>` in your env
+- Set a `LITELLM_MASTER_KEY`, this is your Proxy Admin key - you can use this to create other keys (🚨 must start with `sk-`)
+
<Tabs>
@@ -252,6 +252,8 @@ docker pull ghcr.io/berriai/litellm-database:main-latest
```shell
docker run \
    -v $(pwd)/litellm_config.yaml:/app/config.yaml \
+    -e LITELLM_MASTER_KEY=sk-1234 \
+    -e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
    -e AZURE_API_KEY=d6*********** \
    -e AZURE_API_BASE=https://openai-***********/ \
    -p 4000:4000 \
@@ -267,12 +269,12 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
#### Step 1. Create deployment.yaml

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: litellm-deployment
spec:
-  replicas: 1
+  replicas: 3
  selector:
    matchLabels:
      app: litellm
@@ -283,10 +285,47 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
    spec:
      containers:
        - name: litellm-container
-          image: ghcr.io/berriai/litellm-database:main-latest
+          image: ghcr.io/berriai/litellm:main-latest
+          imagePullPolicy: Always
          env:
+            - name: AZURE_API_KEY
+              value: "d6******"
+            - name: AZURE_API_BASE
+              value: "https://ope******"
+            - name: LITELLM_MASTER_KEY
+              value: "sk-1234"
            - name: DATABASE_URL
-              value: postgresql://<user>:<password>@<host>:<port>/<dbname>
+              value: "po**********"
+          args:
+            - "--config"
+            - "/app/proxy_config.yaml"  # Update the path to mount the config file
+          volumeMounts:                 # Define volume mount for proxy_config.yaml
+            - name: config-volume
+              mountPath: /app
+              readOnly: true
+          livenessProbe:
+            httpGet:
+              path: /health/liveliness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health/readiness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+      volumes:                          # Define volume to mount proxy_config.yaml
+        - name: config-volume
+          configMap:
+            name: litellm-config
+
```

```bash
@@ -10,6 +10,7 @@ Log Proxy Input, Output, Exceptions using Custom Callbacks, Langfuse, OpenTeleme
- [Async Custom Callbacks](#custom-callback-class-async)
- [Async Custom Callback APIs](#custom-callback-apis-async)
- [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
+- [Logging to OpenMeter](#logging-proxy-inputoutput---langfuse)
- [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
- [Logging to DynamoDB](#logging-proxy-inputoutput---dynamodb)
@@ -401,7 +402,7 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API

## Logging Proxy Input/Output - Langfuse
-We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successful LLM calls to langfuse
+We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successful LLM calls to langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment

**Step 1** Install langfuse
@@ -419,7 +420,13 @@ litellm_settings:
  success_callback: ["langfuse"]
```

-**Step 3**: Start the proxy, make a test request
+**Step 3**: Set required env variables for logging to langfuse
+```shell
+export LANGFUSE_PUBLIC_KEY="pk_kk"
+export LANGFUSE_SECRET_KEY="sk_ss"
+```
+
+**Step 4**: Start the proxy, make a test request

Start proxy
```shell
@@ -569,6 +576,75 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \

All requests made with these keys will log data to their team-specific logging.

### Redacting Messages, Response Content from Langfuse Logging

Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged.

```yaml
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
litellm_settings:
  success_callback: ["langfuse"]
  turn_off_message_logging: True
```


## Logging Proxy Cost + Usage - OpenMeter

Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md)

**Required Env Variables**

```bash
# from https://openmeter.cloud
export OPENMETER_API_ENDPOINT="" # defaults to https://openmeter.cloud
export OPENMETER_API_KEY=""
```

### Quick Start

1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
    api_key: my-fake-key
    model: openai/my-fake-model
  model_name: fake-openai-endpoint

litellm_settings:
  success_callback: ["openmeter"] # 👈 KEY CHANGE
```

2. Start Proxy

```
litellm --config /path/to/config.yaml
```

3. Test it!

```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
    "model": "fake-openai-endpoint",
    "messages": [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ]
}
'
```

<Image img={require('../../img/openmeter_img_2.png')} />

## Logging Proxy Input/Output - DataDog
We will use the `--config` to set `litellm.success_callback = ["datadog"]` this will log all successful LLM calls to DataDog
@@ -95,7 +95,7 @@ print(response)
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
- `router.aimage_generation()` - async image generation calls

-### Advanced - Routing Strategies
+## Advanced - Routing Strategies
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based

Router provides 4 strategies for routing your calls across multiple deployments:
@@ -278,6 +278,36 @@ router_settings:
  routing_strategy_args: {"ttl": 10}
```

### Set Lowest Latency Buffer

Set a buffer within which deployments are still treated as candidates for a call.

E.g.

if you have 5 deployments

```
https://litellm-prod-1.openai.azure.com/: 0.07s
https://litellm-prod-2.openai.azure.com/: 0.1s
https://litellm-prod-3.openai.azure.com/: 0.1s
https://litellm-prod-4.openai.azure.com/: 0.1s
https://litellm-prod-5.openai.azure.com/: 4.66s
```

To prevent initially overloading `prod-1` with all requests, we can set a buffer of 50% so that deployments `prod-2, prod-3, prod-4` are also considered.

**In Router**
```python
router = Router(..., routing_strategy_args={"lowest_latency_buffer": 0.5})
```

**In Proxy**

```yaml
router_settings:
  routing_strategy_args: {"lowest_latency_buffer": 0.5}
```

</TabItem>
<TabItem value="simple-shuffle" label="(Default) Weighted Pick (Async)">
|
||||||
|
|
||||||
## Basic Reliability
|
## Basic Reliability
|
||||||
|
|
||||||
|
### Max Parallel Requests (ASYNC)
|
||||||
|
|
||||||
|
Used in semaphore for async requests on router. Limit the max concurrent calls made to a deployment. Useful in high-traffic scenarios.
|
||||||
|
|
||||||
|
If tpm/rpm is set, and no max parallel request limit given, we use the RPM or calculated RPM (tpm/1000/6) as the max parallel request limit.
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
from litellm import Router
|
||||||
|
|
||||||
|
model_list = [{
|
||||||
|
"model_name": "gpt-4",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "azure/gpt-4",
|
||||||
|
...
|
||||||
|
"max_parallel_requests": 10 # 👈 SET PER DEPLOYMENT
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
|
||||||
|
### OR ###
|
||||||
|
|
||||||
|
router = Router(model_list=model_list, default_max_parallel_requests=20) # 👈 SET DEFAULT MAX PARALLEL REQUESTS
|
||||||
|
|
||||||
|
|
||||||
|
# deployment max parallel requests > default max parallel requests
|
||||||
|
```
|
||||||
|
|
||||||
|
[**See Code**](https://github.com/BerriAI/litellm/blob/a978f2d8813c04dad34802cb95e0a0e35a3324bc/litellm/utils.py#L5605)
|
||||||
|
|
||||||
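To see why this helps, the underlying idea is a per-deployment semaphore: only N requests may be in flight against a deployment at once, and everything else waits for a free slot. A minimal sketch (illustrative only, not litellm's internal implementation):

```python
import asyncio

MAX_PARALLEL_REQUESTS = 10  # mirrors "max_parallel_requests": 10 above
semaphore = asyncio.Semaphore(MAX_PARALLEL_REQUESTS)

async def call_deployment(i: int) -> str:
    # at most MAX_PARALLEL_REQUESTS coroutines run this block concurrently
    async with semaphore:
        await asyncio.sleep(0.1)  # stand-in for the actual LLM call
        return f"response {i}"

async def main():
    responses = await asyncio.gather(*(call_deployment(i) for i in range(50)))
    print(len(responses))

asyncio.run(main())
```
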
### Timeouts

The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.

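For example (a minimal sketch; the exact `litellm_params` fields depend on your deployment):

```python
from litellm import Router

model_list = [{
    "model_name": "gpt-4",
    "litellm_params": {"model": "azure/gpt-4", "api_key": "...", "api_base": "..."},
}]

# timeout is in seconds; it bounds the whole call and is passed down to completion()
router = Router(model_list=model_list, timeout=30)
```
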
@@ -5,6 +5,9 @@ LiteLLM allows you to specify the following:
* API Base
* API Version
* API Type
* Project
* Location
* Token

Useful Helper functions:
* [`check_valid_key()`](#check_valid_key)

@@ -43,6 +46,24 @@ os.environ['AZURE_API_TYPE'] = "azure" # [OPTIONAL]
os.environ['OPENAI_API_BASE'] = "https://openai-gpt-4-test2-v-12.openai.azure.com/"
```

### Setting Project, Location, Token

For cloud providers:
- Azure
- Bedrock
- GCP
- Watson AI

you might need to set additional parameters. LiteLLM provides a common set of params that we map across all providers.

|      | LiteLLM param | Watson | Vertex AI | Azure | Bedrock |
|------|--------------|--------------|--------------|--------------|--------------|
| Project | project | watsonx_project | vertex_project | n/a | n/a |
| Region | region_name | watsonx_region_name | vertex_location | n/a | aws_region_name |
| Token | token | watsonx_token or token | n/a | azure_ad_token | n/a |

If you want, you can call them by their provider-specific params as well.

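For example, a sketch of passing the common params straight to `completion()`, per the mapping table above (assumes valid provider credentials are already set in your environment):

```python
from litellm import completion

# region_name is a common param; for Bedrock it is mapped to aws_region_name
response = completion(
    model="bedrock/anthropic.claude-instant-v1",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    region_name="us-east-1",
)
```
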
## litellm variables

### litellm.api_key

BIN docs/my-website/img/openmeter.png (new binary file, 1.5 MiB, not shown)
BIN docs/my-website/img/openmeter_img_2.png (new binary file, 533 KiB, not shown)
@ -43,6 +43,12 @@ const sidebars = {
|
||||||
"proxy/user_keys",
|
"proxy/user_keys",
|
||||||
"proxy/enterprise",
|
"proxy/enterprise",
|
||||||
"proxy/virtual_keys",
|
"proxy/virtual_keys",
|
||||||
|
"proxy/alerting",
|
||||||
|
{
|
||||||
|
type: "category",
|
||||||
|
label: "Logging",
|
||||||
|
items: ["proxy/logging", "proxy/streaming_logging"],
|
||||||
|
},
|
||||||
"proxy/team_based_routing",
|
"proxy/team_based_routing",
|
||||||
"proxy/ui",
|
"proxy/ui",
|
||||||
"proxy/cost_tracking",
|
"proxy/cost_tracking",
|
||||||
|
@ -58,11 +64,6 @@ const sidebars = {
|
||||||
"proxy/pii_masking",
|
"proxy/pii_masking",
|
||||||
"proxy/prompt_injection",
|
"proxy/prompt_injection",
|
||||||
"proxy/caching",
|
"proxy/caching",
|
||||||
{
|
|
||||||
type: "category",
|
|
||||||
label: "Logging, Alerting",
|
|
||||||
items: ["proxy/logging", "proxy/alerting", "proxy/streaming_logging"],
|
|
||||||
},
|
|
||||||
"proxy/prometheus",
|
"proxy/prometheus",
|
||||||
"proxy/call_hooks",
|
"proxy/call_hooks",
|
||||||
"proxy/rules",
|
"proxy/rules",
|
||||||
|
@ -148,6 +149,7 @@ const sidebars = {
|
||||||
"providers/openrouter",
|
"providers/openrouter",
|
||||||
"providers/custom_openai_proxy",
|
"providers/custom_openai_proxy",
|
||||||
"providers/petals",
|
"providers/petals",
|
||||||
|
"providers/watsonx",
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
"proxy/custom_pricing",
|
"proxy/custom_pricing",
|
||||||
|
@ -168,6 +170,7 @@ const sidebars = {
|
||||||
"observability/custom_callback",
|
"observability/custom_callback",
|
||||||
"observability/langfuse_integration",
|
"observability/langfuse_integration",
|
||||||
"observability/sentry",
|
"observability/sentry",
|
||||||
|
"observability/openmeter",
|
||||||
"observability/promptlayer_integration",
|
"observability/promptlayer_integration",
|
||||||
"observability/wandb_integration",
|
"observability/wandb_integration",
|
||||||
"observability/langsmith_integration",
|
"observability/langsmith_integration",
|
||||||
|
@ -175,7 +178,6 @@ const sidebars = {
|
||||||
"observability/traceloop_integration",
|
"observability/traceloop_integration",
|
||||||
"observability/athina_integration",
|
"observability/athina_integration",
|
||||||
"observability/lunary_integration",
|
"observability/lunary_integration",
|
||||||
"observability/athina_integration",
|
|
||||||
"observability/helicone_integration",
|
"observability/helicone_integration",
|
||||||
"observability/supabase_integration",
|
"observability/supabase_integration",
|
||||||
`observability/telemetry`,
|
`observability/telemetry`,
|
||||||
|
|
8
litellm-js/spend-logs/package-lock.json
generated
8
litellm-js/spend-logs/package-lock.json
generated
|
@ -6,7 +6,7 @@
|
||||||
"": {
|
"": {
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@hono/node-server": "^1.9.0",
|
"@hono/node-server": "^1.9.0",
|
||||||
"hono": "^4.1.5"
|
"hono": "^4.2.7"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/node": "^20.11.17",
|
"@types/node": "^20.11.17",
|
||||||
|
@ -463,9 +463,9 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/hono": {
|
"node_modules/hono": {
|
||||||
"version": "4.1.5",
|
"version": "4.2.7",
|
||||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.1.5.tgz",
|
"resolved": "https://registry.npmjs.org/hono/-/hono-4.2.7.tgz",
|
||||||
"integrity": "sha512-3ChJiIoeCxvkt6vnkxJagplrt1YZg3NyNob7ssVeK2PUqEINp4q1F94HzFnvY9QE8asVmbW5kkTDlyWylfg2vg==",
|
"integrity": "sha512-k1xHi86tJnRIVvqhFMBDGFKJ8r5O+bEsT4P59ZK59r0F300Xd910/r237inVfuT/VmE86RQQffX4OYNda6dLXw==",
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=16.0.0"
|
"node": ">=16.0.0"
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@hono/node-server": "^1.9.0",
|
"@hono/node-server": "^1.9.0",
|
||||||
"hono": "^4.1.5"
|
"hono": "^4.2.7"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/node": "^20.11.17",
|
"@types/node": "^20.11.17",
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
import threading, requests, os
|
import threading, requests, os
|
||||||
from typing import Callable, List, Optional, Dict, Union, Any, Literal
|
from typing import Callable, List, Optional, Dict, Union, Any, Literal
|
||||||
from litellm.caching import Cache
|
from litellm.caching import Cache
|
||||||
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
|
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger, json_logs
|
||||||
from litellm.proxy._types import (
|
from litellm.proxy._types import (
|
||||||
KeyManagementSystem,
|
KeyManagementSystem,
|
||||||
KeyManagementSettings,
|
KeyManagementSettings,
|
||||||
|
@ -22,6 +22,7 @@ success_callback: List[Union[str, Callable]] = []
|
||||||
failure_callback: List[Union[str, Callable]] = []
|
failure_callback: List[Union[str, Callable]] = []
|
||||||
service_callback: List[Union[str, Callable]] = []
|
service_callback: List[Union[str, Callable]] = []
|
||||||
callbacks: List[Callable] = []
|
callbacks: List[Callable] = []
|
||||||
|
_custom_logger_compatible_callbacks: list = ["openmeter"]
|
||||||
_langfuse_default_tags: Optional[
|
_langfuse_default_tags: Optional[
|
||||||
List[
|
List[
|
||||||
Literal[
|
Literal[
|
||||||
|
@ -45,6 +46,7 @@ _async_failure_callback: List[Callable] = (
|
||||||
) # internal variable - async custom callbacks are routed here.
|
) # internal variable - async custom callbacks are routed here.
|
||||||
pre_call_rules: List[Callable] = []
|
pre_call_rules: List[Callable] = []
|
||||||
post_call_rules: List[Callable] = []
|
post_call_rules: List[Callable] = []
|
||||||
|
turn_off_message_logging: Optional[bool] = False
|
||||||
## end of callbacks #############
|
## end of callbacks #############
|
||||||
|
|
||||||
email: Optional[str] = (
|
email: Optional[str] = (
|
||||||
|
@ -58,6 +60,7 @@ max_tokens = 256 # OpenAI Defaults
|
||||||
drop_params = False
|
drop_params = False
|
||||||
modify_params = False
|
modify_params = False
|
||||||
retry = True
|
retry = True
|
||||||
|
### AUTH ###
|
||||||
api_key: Optional[str] = None
|
api_key: Optional[str] = None
|
||||||
openai_key: Optional[str] = None
|
openai_key: Optional[str] = None
|
||||||
azure_key: Optional[str] = None
|
azure_key: Optional[str] = None
|
||||||
|
@ -76,7 +79,12 @@ cloudflare_api_key: Optional[str] = None
|
||||||
baseten_key: Optional[str] = None
|
baseten_key: Optional[str] = None
|
||||||
aleph_alpha_key: Optional[str] = None
|
aleph_alpha_key: Optional[str] = None
|
||||||
nlp_cloud_key: Optional[str] = None
|
nlp_cloud_key: Optional[str] = None
|
||||||
|
common_cloud_provider_auth_params: dict = {
|
||||||
|
"params": ["project", "region_name", "token"],
|
||||||
|
"providers": ["vertex_ai", "bedrock", "watsonx", "azure"],
|
||||||
|
}
|
||||||
use_client: bool = False
|
use_client: bool = False
|
||||||
|
ssl_verify: bool = True
|
||||||
disable_streaming_logging: bool = False
|
disable_streaming_logging: bool = False
|
||||||
### GUARDRAILS ###
|
### GUARDRAILS ###
|
||||||
llamaguard_model_name: Optional[str] = None
|
llamaguard_model_name: Optional[str] = None
|
||||||
|
@ -298,6 +306,7 @@ aleph_alpha_models: List = []
|
||||||
bedrock_models: List = []
|
bedrock_models: List = []
|
||||||
deepinfra_models: List = []
|
deepinfra_models: List = []
|
||||||
perplexity_models: List = []
|
perplexity_models: List = []
|
||||||
|
watsonx_models: List = []
|
||||||
for key, value in model_cost.items():
|
for key, value in model_cost.items():
|
||||||
if value.get("litellm_provider") == "openai":
|
if value.get("litellm_provider") == "openai":
|
||||||
open_ai_chat_completion_models.append(key)
|
open_ai_chat_completion_models.append(key)
|
||||||
|
@ -342,6 +351,8 @@ for key, value in model_cost.items():
|
||||||
deepinfra_models.append(key)
|
deepinfra_models.append(key)
|
||||||
elif value.get("litellm_provider") == "perplexity":
|
elif value.get("litellm_provider") == "perplexity":
|
||||||
perplexity_models.append(key)
|
perplexity_models.append(key)
|
||||||
|
elif value.get("litellm_provider") == "watsonx":
|
||||||
|
watsonx_models.append(key)
|
||||||
|
|
||||||
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
|
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
|
||||||
openai_compatible_endpoints: List = [
|
openai_compatible_endpoints: List = [
|
||||||
|
@ -478,6 +489,7 @@ model_list = (
|
||||||
+ perplexity_models
|
+ perplexity_models
|
||||||
+ maritalk_models
|
+ maritalk_models
|
||||||
+ vertex_language_models
|
+ vertex_language_models
|
||||||
|
+ watsonx_models
|
||||||
)
|
)
|
||||||
|
|
||||||
provider_list: List = [
|
provider_list: List = [
|
||||||
|
@ -516,6 +528,7 @@ provider_list: List = [
|
||||||
"cloudflare",
|
"cloudflare",
|
||||||
"xinference",
|
"xinference",
|
||||||
"fireworks_ai",
|
"fireworks_ai",
|
||||||
|
"watsonx",
|
||||||
"custom", # custom apis
|
"custom", # custom apis
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -537,6 +550,7 @@ models_by_provider: dict = {
|
||||||
"deepinfra": deepinfra_models,
|
"deepinfra": deepinfra_models,
|
||||||
"perplexity": perplexity_models,
|
"perplexity": perplexity_models,
|
||||||
"maritalk": maritalk_models,
|
"maritalk": maritalk_models,
|
||||||
|
"watsonx": watsonx_models,
|
||||||
}
|
}
|
||||||
|
|
||||||
# mapping for those models which have larger equivalents
|
# mapping for those models which have larger equivalents
|
||||||
|
@ -647,9 +661,11 @@ from .llms.bedrock import (
|
||||||
AmazonLlamaConfig,
|
AmazonLlamaConfig,
|
||||||
AmazonStabilityConfig,
|
AmazonStabilityConfig,
|
||||||
AmazonMistralConfig,
|
AmazonMistralConfig,
|
||||||
|
AmazonBedrockGlobalConfig,
|
||||||
)
|
)
|
||||||
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
|
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
|
||||||
from .llms.azure import AzureOpenAIConfig, AzureOpenAIError
|
from .llms.azure import AzureOpenAIConfig, AzureOpenAIError
|
||||||
|
from .llms.watsonx import IBMWatsonXAIConfig
|
||||||
from .main import * # type: ignore
|
from .main import * # type: ignore
|
||||||
from .integrations import *
|
from .integrations import *
|
||||||
from .exceptions import (
|
from .exceptions import (
|
||||||
|
|
|
@@ -1,7 +1,7 @@
import logging

set_verbose = False
json_logs = False
# Create a handler for the logger (you may need to adapt this based on your needs)
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)

@ -12,9 +12,12 @@ import litellm
|
||||||
|
|
||||||
class LangFuseLogger:
|
class LangFuseLogger:
|
||||||
# Class variables or attributes
|
# Class variables or attributes
|
||||||
def __init__(self, langfuse_public_key=None, langfuse_secret=None):
|
def __init__(
|
||||||
|
self, langfuse_public_key=None, langfuse_secret=None, flush_interval=1
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
from langfuse import Langfuse
|
from langfuse import Langfuse
|
||||||
|
import langfuse
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\n{traceback.format_exc()}\033[0m"
|
f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\n{traceback.format_exc()}\033[0m"
|
||||||
|
@ -25,14 +28,20 @@ class LangFuseLogger:
|
||||||
self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
|
self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
|
||||||
self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
|
self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
|
||||||
self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
|
self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
|
||||||
self.Langfuse = Langfuse(
|
|
||||||
public_key=self.public_key,
|
parameters = {
|
||||||
secret_key=self.secret_key,
|
"public_key": self.public_key,
|
||||||
host=self.langfuse_host,
|
"secret_key": self.secret_key,
|
||||||
release=self.langfuse_release,
|
"host": self.langfuse_host,
|
||||||
debug=self.langfuse_debug,
|
"release": self.langfuse_release,
|
||||||
flush_interval=1, # flush interval in seconds
|
"debug": self.langfuse_debug,
|
||||||
)
|
"flush_interval": flush_interval, # flush interval in seconds
|
||||||
|
}
|
||||||
|
|
||||||
|
if Version(langfuse.version.__version__) >= Version("2.6.0"):
|
||||||
|
parameters["sdk_integration"] = "litellm"
|
||||||
|
|
||||||
|
self.Langfuse = Langfuse(**parameters)
|
||||||
|
|
||||||
# set the current langfuse project id in the environ
|
# set the current langfuse project id in the environ
|
||||||
# this is used by Alerting to link to the correct project
|
# this is used by Alerting to link to the correct project
|
||||||
|
@ -77,13 +86,14 @@ class LangFuseLogger:
|
||||||
print_verbose,
|
print_verbose,
|
||||||
level="DEFAULT",
|
level="DEFAULT",
|
||||||
status_message=None,
|
status_message=None,
|
||||||
):
|
) -> dict:
|
||||||
# Method definition
|
# Method definition
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print_verbose(
|
print_verbose(
|
||||||
f"Langfuse Logging - Enters logging function for model {kwargs}"
|
f"Langfuse Logging - Enters logging function for model {kwargs}"
|
||||||
)
|
)
|
||||||
|
|
||||||
litellm_params = kwargs.get("litellm_params", {})
|
litellm_params = kwargs.get("litellm_params", {})
|
||||||
metadata = (
|
metadata = (
|
||||||
litellm_params.get("metadata", {}) or {}
|
litellm_params.get("metadata", {}) or {}
|
||||||
|
@ -137,8 +147,10 @@ class LangFuseLogger:
|
||||||
input = prompt
|
input = prompt
|
||||||
output = response_obj["data"]
|
output = response_obj["data"]
|
||||||
print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
|
print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
|
||||||
|
trace_id = None
|
||||||
|
generation_id = None
|
||||||
if self._is_langfuse_v2():
|
if self._is_langfuse_v2():
|
||||||
self._log_langfuse_v2(
|
trace_id, generation_id = self._log_langfuse_v2(
|
||||||
user_id,
|
user_id,
|
||||||
metadata,
|
metadata,
|
||||||
litellm_params,
|
litellm_params,
|
||||||
|
@ -168,10 +180,12 @@ class LangFuseLogger:
|
||||||
f"Langfuse Layer Logging - final response object: {response_obj}"
|
f"Langfuse Layer Logging - final response object: {response_obj}"
|
||||||
)
|
)
|
||||||
verbose_logger.info(f"Langfuse Layer Logging - logging success")
|
verbose_logger.info(f"Langfuse Layer Logging - logging success")
|
||||||
|
|
||||||
|
return {"trace_id": trace_id, "generation_id": generation_id}
|
||||||
except:
|
except:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
|
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||||
pass
|
return {"trace_id": None, "generation_id": None}
|
||||||
|
|
||||||
async def _async_log_event(
|
async def _async_log_event(
|
||||||
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
|
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
|
||||||
|
@ -243,7 +257,7 @@ class LangFuseLogger:
|
||||||
response_obj,
|
response_obj,
|
||||||
level,
|
level,
|
||||||
print_verbose,
|
print_verbose,
|
||||||
):
|
) -> tuple:
|
||||||
import langfuse
|
import langfuse
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -262,15 +276,21 @@ class LangFuseLogger:
|
||||||
tags = metadata_tags
|
tags = metadata_tags
|
||||||
|
|
||||||
trace_name = metadata.get("trace_name", None)
|
trace_name = metadata.get("trace_name", None)
|
||||||
if trace_name is None:
|
trace_id = metadata.get("trace_id", None)
|
||||||
|
existing_trace_id = metadata.get("existing_trace_id", None)
|
||||||
|
if trace_name is None and existing_trace_id is None:
|
||||||
# just log `litellm-{call_type}` as the trace name
|
# just log `litellm-{call_type}` as the trace name
|
||||||
|
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
|
||||||
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||||
|
|
||||||
|
if existing_trace_id is not None:
|
||||||
|
trace_params = {"id": existing_trace_id}
|
||||||
|
else: # don't overwrite an existing trace
|
||||||
trace_params = {
|
trace_params = {
|
||||||
"name": trace_name,
|
"name": trace_name,
|
||||||
"input": input,
|
"input": input,
|
||||||
"user_id": metadata.get("trace_user_id", user_id),
|
"user_id": metadata.get("trace_user_id", user_id),
|
||||||
"id": metadata.get("trace_id", None),
|
"id": trace_id,
|
||||||
"session_id": metadata.get("session_id", None),
|
"session_id": metadata.get("session_id", None),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
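# Usage sketch (illustrative, not part of this diff): a caller can attach the logged generation
# to an already-existing Langfuse trace by passing the id through litellm metadata, e.g.
#
#   import litellm
#   litellm.success_callback = ["langfuse"]
#   litellm.completion(
#       model="gpt-3.5-turbo",
#       messages=[{"role": "user", "content": "hi"}],
#       metadata={"existing_trace_id": "my-trace-id"},
#   )
#
# With "trace_id", a trace with that id is created/updated; with "existing_trace_id", the code
# above only sets {"id": existing_trace_id} so fields on the past trace are not overwritten.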
@ -335,6 +355,7 @@ class LangFuseLogger:
|
||||||
kwargs["cache_hit"] = False
|
kwargs["cache_hit"] = False
|
||||||
tags.append(f"cache_hit:{kwargs['cache_hit']}")
|
tags.append(f"cache_hit:{kwargs['cache_hit']}")
|
||||||
clean_metadata["cache_hit"] = kwargs["cache_hit"]
|
clean_metadata["cache_hit"] = kwargs["cache_hit"]
|
||||||
|
if existing_trace_id is None:
|
||||||
trace_params.update({"tags": tags})
|
trace_params.update({"tags": tags})
|
||||||
|
|
||||||
proxy_server_request = litellm_params.get("proxy_server_request", None)
|
proxy_server_request = litellm_params.get("proxy_server_request", None)
|
||||||
|
@ -355,8 +376,6 @@ class LangFuseLogger:
|
||||||
"headers": clean_headers,
|
"headers": clean_headers,
|
||||||
}
|
}
|
||||||
|
|
||||||
print_verbose(f"trace_params: {trace_params}")
|
|
||||||
|
|
||||||
trace = self.Langfuse.trace(**trace_params)
|
trace = self.Langfuse.trace(**trace_params)
|
||||||
|
|
||||||
generation_id = None
|
generation_id = None
|
||||||
|
@ -373,7 +392,11 @@ class LangFuseLogger:
|
||||||
# just log `litellm-{call_type}` as the generation name
|
# just log `litellm-{call_type}` as the generation name
|
||||||
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||||
|
|
||||||
|
if response_obj is not None and "system_fingerprint" in response_obj:
|
||||||
system_fingerprint = response_obj.get("system_fingerprint", None)
|
system_fingerprint = response_obj.get("system_fingerprint", None)
|
||||||
|
else:
|
||||||
|
system_fingerprint = None
|
||||||
|
|
||||||
if system_fingerprint is not None:
|
if system_fingerprint is not None:
|
||||||
optional_params["system_fingerprint"] = system_fingerprint
|
optional_params["system_fingerprint"] = system_fingerprint
|
||||||
|
|
||||||
|
@ -402,8 +425,9 @@ class LangFuseLogger:
|
||||||
"completion_start_time", None
|
"completion_start_time", None
|
||||||
)
|
)
|
||||||
|
|
||||||
print_verbose(f"generation_params: {generation_params}")
|
generation_client = trace.generation(**generation_params)
|
||||||
|
|
||||||
trace.generation(**generation_params)
|
return generation_client.trace_id, generation_id
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
|
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||||
|
return None, None
|
||||||
|
|
|
@ -73,10 +73,6 @@ class LangsmithLogger:
|
||||||
elif type(value) != dict and is_serializable(value=value):
|
elif type(value) != dict and is_serializable(value=value):
|
||||||
new_kwargs[key] = value
|
new_kwargs[key] = value
|
||||||
|
|
||||||
print(f"type of response: {type(response_obj)}")
|
|
||||||
for k, v in new_kwargs.items():
|
|
||||||
print(f"key={k}, type of arg: {type(v)}, value={v}")
|
|
||||||
|
|
||||||
if isinstance(response_obj, BaseModel):
|
if isinstance(response_obj, BaseModel):
|
||||||
try:
|
try:
|
||||||
response_obj = response_obj.model_dump()
|
response_obj = response_obj.model_dump()
|
||||||
|
|
litellm/integrations/openmeter.py (new file, 123 lines)
@ -0,0 +1,123 @@
|
||||||
|
# What is this?
|
||||||
|
## On Success events log cost to OpenMeter - https://github.com/BerriAI/litellm/issues/1268
|
||||||
|
|
||||||
|
import dotenv, os, json
|
||||||
|
import requests
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||||
|
import traceback
|
||||||
|
from litellm.integrations.custom_logger import CustomLogger
|
||||||
|
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
|
||||||
|
def get_utc_datetime():
|
||||||
|
import datetime as dt
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
if hasattr(dt, "UTC"):
|
||||||
|
return datetime.now(dt.UTC) # type: ignore
|
||||||
|
else:
|
||||||
|
return datetime.utcnow() # type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
class OpenMeterLogger(CustomLogger):
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self.validate_environment()
|
||||||
|
self.async_http_handler = AsyncHTTPHandler()
|
||||||
|
self.sync_http_handler = HTTPHandler()
|
||||||
|
|
||||||
|
def validate_environment(self):
|
||||||
|
"""
|
||||||
|
Expects
|
||||||
|
OPENMETER_API_ENDPOINT,
|
||||||
|
OPENMETER_API_KEY,
|
||||||
|
|
||||||
|
in the environment
|
||||||
|
"""
|
||||||
|
missing_keys = []
|
||||||
|
if litellm.get_secret("OPENMETER_API_KEY", None) is None:
|
||||||
|
missing_keys.append("OPENMETER_API_KEY")
|
||||||
|
|
||||||
|
if len(missing_keys) > 0:
|
||||||
|
raise Exception("Missing keys={} in environment.".format(missing_keys))
|
||||||
|
|
||||||
|
def _common_logic(self, kwargs: dict, response_obj):
|
||||||
|
call_id = response_obj.get("id", kwargs.get("litellm_call_id"))
|
||||||
|
dt = get_utc_datetime().isoformat()
|
||||||
|
cost = kwargs.get("response_cost", None)
|
||||||
|
model = kwargs.get("model")
|
||||||
|
usage = {}
|
||||||
|
if (
|
||||||
|
isinstance(response_obj, litellm.ModelResponse)
|
||||||
|
or isinstance(response_obj, litellm.EmbeddingResponse)
|
||||||
|
) and hasattr(response_obj, "usage"):
|
||||||
|
usage = {
|
||||||
|
"prompt_tokens": response_obj["usage"].get("prompt_tokens", 0),
|
||||||
|
"completion_tokens": response_obj["usage"].get("completion_tokens", 0),
|
||||||
|
"total_tokens": response_obj["usage"].get("total_tokens"),
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"specversion": "1.0",
|
||||||
|
"type": os.getenv("OPENMETER_EVENT_TYPE", "litellm_tokens"),
|
||||||
|
"id": call_id,
|
||||||
|
"time": dt,
|
||||||
|
"subject": kwargs.get("user", ""), # end-user passed in via 'user' param
|
||||||
|
"source": "litellm-proxy",
|
||||||
|
"data": {"model": model, "cost": cost, **usage},
|
||||||
|
}
|
||||||
|
|
||||||
|
def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||||
|
_url = litellm.get_secret(
|
||||||
|
"OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud"
|
||||||
|
)
|
||||||
|
if _url.endswith("/"):
|
||||||
|
_url += "api/v1/events"
|
||||||
|
else:
|
||||||
|
_url += "/api/v1/events"
|
||||||
|
|
||||||
|
api_key = litellm.get_secret("OPENMETER_API_KEY")
|
||||||
|
|
||||||
|
_data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
|
||||||
|
self.sync_http_handler.post(
|
||||||
|
url=_url,
|
||||||
|
data=_data,
|
||||||
|
headers={
|
||||||
|
"Content-Type": "application/cloudevents+json",
|
||||||
|
"Authorization": "Bearer {}".format(api_key),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||||
|
_url = litellm.get_secret(
|
||||||
|
"OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud"
|
||||||
|
)
|
||||||
|
if _url.endswith("/"):
|
||||||
|
_url += "api/v1/events"
|
||||||
|
else:
|
||||||
|
_url += "/api/v1/events"
|
||||||
|
|
||||||
|
api_key = litellm.get_secret("OPENMETER_API_KEY")
|
||||||
|
|
||||||
|
_data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
|
||||||
|
_headers = {
|
||||||
|
"Content-Type": "application/cloudevents+json",
|
||||||
|
"Authorization": "Bearer {}".format(api_key),
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = await self.async_http_handler.post(
|
||||||
|
url=_url,
|
||||||
|
data=json.dumps(_data),
|
||||||
|
headers=_headers,
|
||||||
|
)
|
||||||
|
|
||||||
|
response.raise_for_status()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"\nAn Exception Occurred - {str(e)}")
|
||||||
|
if hasattr(response, "text"):
|
||||||
|
print(f"\nError Message: {response.text}")
|
||||||
|
raise e
|
|
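# Usage sketch (assumption, not part of openmeter.py above): with OPENMETER_API_KEY (and
# optionally OPENMETER_API_ENDPOINT) set in the environment, the logger is enabled through
# litellm's callback list - "openmeter" is registered in _custom_logger_compatible_callbacks:
#
#   import litellm
#   litellm.callbacks = ["openmeter"]
#   litellm.completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "hi"}])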
@ -7,11 +7,12 @@ import copy
|
||||||
import traceback
|
import traceback
|
||||||
from litellm._logging import verbose_logger, verbose_proxy_logger
|
from litellm._logging import verbose_logger, verbose_proxy_logger
|
||||||
import litellm
|
import litellm
|
||||||
from typing import List, Literal, Any, Union, Optional
|
from typing import List, Literal, Any, Union, Optional, Dict
|
||||||
from litellm.caching import DualCache
|
from litellm.caching import DualCache
|
||||||
import asyncio
|
import asyncio
|
||||||
import aiohttp
|
import aiohttp
|
||||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
|
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
|
||||||
class SlackAlerting:
|
class SlackAlerting:
|
||||||
|
@ -37,12 +38,28 @@ class SlackAlerting:
|
||||||
"budget_alerts",
|
"budget_alerts",
|
||||||
"db_exceptions",
|
"db_exceptions",
|
||||||
],
|
],
|
||||||
|
alert_to_webhook_url: Optional[
|
||||||
|
Dict
|
||||||
|
] = None, # if user wants to separate alerts to diff channels
|
||||||
):
|
):
|
||||||
self.alerting_threshold = alerting_threshold
|
self.alerting_threshold = alerting_threshold
|
||||||
self.alerting = alerting
|
self.alerting = alerting
|
||||||
self.alert_types = alert_types
|
self.alert_types = alert_types
|
||||||
self.internal_usage_cache = DualCache()
|
self.internal_usage_cache = DualCache()
|
||||||
self.async_http_handler = AsyncHTTPHandler()
|
self.async_http_handler = AsyncHTTPHandler()
|
||||||
|
self.alert_to_webhook_url = alert_to_webhook_url
|
||||||
|
self.langfuse_logger = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
from litellm.integrations.langfuse import LangFuseLogger
|
||||||
|
|
||||||
|
self.langfuse_logger = LangFuseLogger(
|
||||||
|
os.getenv("LANGFUSE_PUBLIC_KEY"),
|
||||||
|
os.getenv("LANGFUSE_SECRET_KEY"),
|
||||||
|
flush_interval=1,
|
||||||
|
)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -51,6 +68,7 @@ class SlackAlerting:
|
||||||
alerting: Optional[List] = None,
|
alerting: Optional[List] = None,
|
||||||
alerting_threshold: Optional[float] = None,
|
alerting_threshold: Optional[float] = None,
|
||||||
alert_types: Optional[List] = None,
|
alert_types: Optional[List] = None,
|
||||||
|
alert_to_webhook_url: Optional[Dict] = None,
|
||||||
):
|
):
|
||||||
if alerting is not None:
|
if alerting is not None:
|
||||||
self.alerting = alerting
|
self.alerting = alerting
|
||||||
|
@ -59,6 +77,13 @@ class SlackAlerting:
|
||||||
if alert_types is not None:
|
if alert_types is not None:
|
||||||
self.alert_types = alert_types
|
self.alert_types = alert_types
|
||||||
|
|
||||||
|
if alert_to_webhook_url is not None:
|
||||||
|
# update the dict
|
||||||
|
if self.alert_to_webhook_url is None:
|
||||||
|
self.alert_to_webhook_url = alert_to_webhook_url
|
||||||
|
else:
|
||||||
|
self.alert_to_webhook_url.update(alert_to_webhook_url)
|
||||||
|
|
||||||
async def deployment_in_cooldown(self):
|
async def deployment_in_cooldown(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -81,39 +106,68 @@ class SlackAlerting:
|
||||||
request_info: str,
|
request_info: str,
|
||||||
request_data: Optional[dict] = None,
|
request_data: Optional[dict] = None,
|
||||||
kwargs: Optional[dict] = None,
|
kwargs: Optional[dict] = None,
|
||||||
|
type: Literal["hanging_request", "slow_response"] = "hanging_request",
|
||||||
|
start_time: Optional[datetime.datetime] = None,
|
||||||
|
end_time: Optional[datetime.datetime] = None,
|
||||||
):
|
):
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
# For now: do nothing as we're debugging why this is not working as expected
|
# For now: do nothing as we're debugging why this is not working as expected
|
||||||
|
if request_data is not None:
|
||||||
|
trace_id = request_data.get("metadata", {}).get(
|
||||||
|
"trace_id", None
|
||||||
|
) # get langfuse trace id
|
||||||
|
if trace_id is None:
|
||||||
|
trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
|
||||||
|
request_data["metadata"]["trace_id"] = trace_id
|
||||||
|
elif kwargs is not None:
|
||||||
|
_litellm_params = kwargs.get("litellm_params", {})
|
||||||
|
trace_id = _litellm_params.get("metadata", {}).get(
|
||||||
|
"trace_id", None
|
||||||
|
) # get langfuse trace id
|
||||||
|
if trace_id is None:
|
||||||
|
trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
|
||||||
|
_litellm_params["metadata"]["trace_id"] = trace_id
|
||||||
|
|
||||||
|
# Log hanging request as an error on langfuse
|
||||||
|
if type == "hanging_request":
|
||||||
|
if self.langfuse_logger is not None:
|
||||||
|
_logging_kwargs = copy.deepcopy(request_data)
|
||||||
|
if _logging_kwargs is None:
|
||||||
|
_logging_kwargs = {}
|
||||||
|
_logging_kwargs["litellm_params"] = {}
|
||||||
|
request_data = request_data or {}
|
||||||
|
_logging_kwargs["litellm_params"]["metadata"] = request_data.get(
|
||||||
|
"metadata", {}
|
||||||
|
)
|
||||||
|
# log to langfuse in a separate thread
|
||||||
|
import threading
|
||||||
|
|
||||||
|
threading.Thread(
|
||||||
|
target=self.langfuse_logger.log_event,
|
||||||
|
args=(
|
||||||
|
_logging_kwargs,
|
||||||
|
None,
|
||||||
|
start_time,
|
||||||
|
end_time,
|
||||||
|
None,
|
||||||
|
print,
|
||||||
|
"ERROR",
|
||||||
|
"Requests is hanging",
|
||||||
|
),
|
||||||
|
).start()
|
||||||
|
|
||||||
|
_langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com")
|
||||||
|
_langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID")
|
||||||
|
|
||||||
|
# langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************
|
||||||
|
|
||||||
|
_langfuse_url = (
|
||||||
|
f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}"
|
||||||
|
)
|
||||||
|
request_info += f"\n🪢 Langfuse Trace: {_langfuse_url}"
|
||||||
return request_info
|
return request_info
|
||||||
|
|
||||||
# if request_data is not None:
|
|
||||||
# trace_id = request_data.get("metadata", {}).get(
|
|
||||||
# "trace_id", None
|
|
||||||
# ) # get langfuse trace id
|
|
||||||
# if trace_id is None:
|
|
||||||
# trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
|
|
||||||
# request_data["metadata"]["trace_id"] = trace_id
|
|
||||||
# elif kwargs is not None:
|
|
||||||
# _litellm_params = kwargs.get("litellm_params", {})
|
|
||||||
# trace_id = _litellm_params.get("metadata", {}).get(
|
|
||||||
# "trace_id", None
|
|
||||||
# ) # get langfuse trace id
|
|
||||||
# if trace_id is None:
|
|
||||||
# trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
|
|
||||||
# _litellm_params["metadata"]["trace_id"] = trace_id
|
|
||||||
|
|
||||||
# _langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com")
|
|
||||||
# _langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID")
|
|
||||||
|
|
||||||
# # langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************
|
|
||||||
|
|
||||||
# _langfuse_url = (
|
|
||||||
# f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}"
|
|
||||||
# )
|
|
||||||
# request_info += f"\n🪢 Langfuse Trace: {_langfuse_url}"
|
|
||||||
# return request_info
|
|
||||||
|
|
||||||
def _response_taking_too_long_callback(
|
def _response_taking_too_long_callback(
|
||||||
self,
|
self,
|
||||||
kwargs, # kwargs to completion
|
kwargs, # kwargs to completion
|
||||||
|
@ -140,7 +194,6 @@ class SlackAlerting:
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
def _get_deployment_latencies_to_alert(self, metadata=None):
|
def _get_deployment_latencies_to_alert(self, metadata=None):
|
||||||
|
|
||||||
if metadata is None:
|
if metadata is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -156,6 +209,14 @@ class SlackAlerting:
|
||||||
_deployment_latencies = metadata["_latency_per_deployment"]
|
_deployment_latencies = metadata["_latency_per_deployment"]
|
||||||
if len(_deployment_latencies) == 0:
|
if len(_deployment_latencies) == 0:
|
||||||
return None
|
return None
|
||||||
|
try:
|
||||||
|
# try sorting deployments by latency
|
||||||
|
_deployment_latencies = sorted(
|
||||||
|
_deployment_latencies.items(), key=lambda x: x[1]
|
||||||
|
)
|
||||||
|
_deployment_latencies = dict(_deployment_latencies)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
for api_base, latency in _deployment_latencies.items():
|
for api_base, latency in _deployment_latencies.items():
|
||||||
_message_to_send += f"\n{api_base}: {round(latency,2)}s"
|
_message_to_send += f"\n{api_base}: {round(latency,2)}s"
|
||||||
_message_to_send = "```" + _message_to_send + "```"
|
_message_to_send = "```" + _message_to_send + "```"
|
||||||
|
@ -171,8 +232,6 @@ class SlackAlerting:
|
||||||
if self.alerting is None or self.alert_types is None:
|
if self.alerting is None or self.alert_types is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
if "llm_too_slow" not in self.alert_types:
|
|
||||||
return
|
|
||||||
time_difference_float, model, api_base, messages = (
|
time_difference_float, model, api_base, messages = (
|
||||||
self._response_taking_too_long_callback(
|
self._response_taking_too_long_callback(
|
||||||
kwargs=kwargs,
|
kwargs=kwargs,
|
||||||
|
@ -185,7 +244,7 @@ class SlackAlerting:
|
||||||
if time_difference_float > self.alerting_threshold:
|
if time_difference_float > self.alerting_threshold:
|
||||||
if "langfuse" in litellm.success_callback:
|
if "langfuse" in litellm.success_callback:
|
||||||
request_info = self._add_langfuse_trace_id_to_alert(
|
request_info = self._add_langfuse_trace_id_to_alert(
|
||||||
request_info=request_info, kwargs=kwargs
|
request_info=request_info, kwargs=kwargs, type="slow_response"
|
||||||
)
|
)
|
||||||
# add deployment latencies to alert
|
# add deployment latencies to alert
|
||||||
if (
|
if (
|
||||||
|
@ -205,6 +264,7 @@ class SlackAlerting:
|
||||||
await self.send_alert(
|
await self.send_alert(
|
||||||
message=slow_message + request_info,
|
message=slow_message + request_info,
|
||||||
level="Low",
|
level="Low",
|
||||||
|
alert_type="llm_too_slow",
|
||||||
)
|
)
|
||||||
|
|
||||||
async def log_failure_event(self, original_exception: Exception):
|
async def log_failure_event(self, original_exception: Exception):
|
||||||
|
@ -212,8 +272,8 @@ class SlackAlerting:
|
||||||
|
|
||||||
async def response_taking_too_long(
|
async def response_taking_too_long(
|
||||||
self,
|
self,
|
||||||
start_time: Optional[float] = None,
|
start_time: Optional[datetime.datetime] = None,
|
||||||
end_time: Optional[float] = None,
|
end_time: Optional[datetime.datetime] = None,
|
||||||
type: Literal["hanging_request", "slow_response"] = "hanging_request",
|
type: Literal["hanging_request", "slow_response"] = "hanging_request",
|
||||||
request_data: Optional[dict] = None,
|
request_data: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
|
@ -233,17 +293,10 @@ class SlackAlerting:
|
||||||
except:
|
except:
|
||||||
messages = ""
|
messages = ""
|
||||||
request_info = f"\nRequest Model: `{model}`\nMessages: `{messages}`"
|
request_info = f"\nRequest Model: `{model}`\nMessages: `{messages}`"
|
||||||
if "langfuse" in litellm.success_callback:
|
|
||||||
request_info = self._add_langfuse_trace_id_to_alert(
|
|
||||||
request_info=request_info, request_data=request_data
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
request_info = ""
|
request_info = ""
|
||||||
|
|
||||||
if type == "hanging_request":
|
if type == "hanging_request":
|
||||||
# Simulate a long-running operation that could take more than 5 minutes
|
|
||||||
if "llm_requests_hanging" not in self.alert_types:
|
|
||||||
return
|
|
||||||
await asyncio.sleep(
|
await asyncio.sleep(
|
||||||
self.alerting_threshold
|
self.alerting_threshold
|
||||||
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
|
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
|
||||||
|
@ -281,6 +334,15 @@ class SlackAlerting:
|
||||||
f"`Requests are hanging - {self.alerting_threshold}s+ request time`"
|
f"`Requests are hanging - {self.alerting_threshold}s+ request time`"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if "langfuse" in litellm.success_callback:
|
||||||
|
request_info = self._add_langfuse_trace_id_to_alert(
|
||||||
|
request_info=request_info,
|
||||||
|
request_data=request_data,
|
||||||
|
type="hanging_request",
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
)
|
||||||
|
|
||||||
# add deployment latencies to alert
|
# add deployment latencies to alert
|
||||||
_deployment_latency_map = self._get_deployment_latencies_to_alert(
|
_deployment_latency_map = self._get_deployment_latencies_to_alert(
|
||||||
metadata=request_data.get("metadata", {})
|
metadata=request_data.get("metadata", {})
|
||||||
|
@ -291,6 +353,7 @@ class SlackAlerting:
|
||||||
await self.send_alert(
|
await self.send_alert(
|
||||||
message=alerting_message + request_info,
|
message=alerting_message + request_info,
|
||||||
level="Medium",
|
level="Medium",
|
||||||
|
alert_type="llm_requests_hanging",
|
||||||
)
|
)
|
||||||
|
|
||||||
async def budget_alerts(
|
async def budget_alerts(
|
||||||
|
@ -336,8 +399,7 @@ class SlackAlerting:
|
||||||
user_info = f"\nUser ID: {user_id}\n Error {error_message}"
|
user_info = f"\nUser ID: {user_id}\n Error {error_message}"
|
||||||
message = "Failed Tracking Cost for" + user_info
|
message = "Failed Tracking Cost for" + user_info
|
||||||
await self.send_alert(
|
await self.send_alert(
|
||||||
message=message,
|
message=message, level="High", alert_type="budget_alerts"
|
||||||
level="High",
|
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
elif type == "projected_limit_exceeded" and user_info is not None:
|
elif type == "projected_limit_exceeded" and user_info is not None:
|
||||||
|
@ -353,8 +415,7 @@ class SlackAlerting:
|
||||||
"""
|
"""
|
||||||
message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` {user_info["key_alias"]} \n`Expected Day of Error`: {user_info["projected_exceeded_date"]} \n`Current Spend`: {user_current_spend} \n`Projected Spend at end of month`: {user_info["projected_spend"]} \n`Soft Limit`: {user_max_budget}"""
|
message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` {user_info["key_alias"]} \n`Expected Day of Error`: {user_info["projected_exceeded_date"]} \n`Current Spend`: {user_current_spend} \n`Projected Spend at end of month`: {user_info["projected_spend"]} \n`Soft Limit`: {user_max_budget}"""
|
||||||
await self.send_alert(
|
await self.send_alert(
|
||||||
message=message,
|
message=message, level="High", alert_type="budget_alerts"
|
||||||
level="High",
|
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
|
@ -382,8 +443,7 @@ class SlackAlerting:
|
||||||
result = await _cache.async_get_cache(key=message)
|
result = await _cache.async_get_cache(key=message)
|
||||||
if result is None:
|
if result is None:
|
||||||
await self.send_alert(
|
await self.send_alert(
|
||||||
message=message,
|
message=message, level="High", alert_type="budget_alerts"
|
||||||
level="High",
|
|
||||||
)
|
)
|
||||||
await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
|
await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
|
||||||
return
|
return
|
||||||
|
@ -395,8 +455,7 @@ class SlackAlerting:
|
||||||
result = await _cache.async_get_cache(key=cache_key)
|
result = await _cache.async_get_cache(key=cache_key)
|
||||||
if result is None:
|
if result is None:
|
||||||
await self.send_alert(
|
await self.send_alert(
|
||||||
message=message,
|
message=message, level="Medium", alert_type="budget_alerts"
|
||||||
level="Medium",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
await _cache.async_set_cache(key=cache_key, value="SENT", ttl=2419200)
|
await _cache.async_set_cache(key=cache_key, value="SENT", ttl=2419200)
|
||||||
|
@ -409,15 +468,25 @@ class SlackAlerting:
|
||||||
result = await _cache.async_get_cache(key=message)
|
result = await _cache.async_get_cache(key=message)
|
||||||
if result is None:
|
if result is None:
|
||||||
await self.send_alert(
|
await self.send_alert(
|
||||||
message=message,
|
message=message, level="Low", alert_type="budget_alerts"
|
||||||
level="Low",
|
|
||||||
)
|
)
|
||||||
await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
|
await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
|
||||||
return
|
return
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
async def send_alert(self, message: str, level: Literal["Low", "Medium", "High"]):
|
async def send_alert(
|
||||||
|
self,
|
||||||
|
message: str,
|
||||||
|
level: Literal["Low", "Medium", "High"],
|
||||||
|
alert_type: Literal[
|
||||||
|
"llm_exceptions",
|
||||||
|
"llm_too_slow",
|
||||||
|
"llm_requests_hanging",
|
||||||
|
"budget_alerts",
|
||||||
|
"db_exceptions",
|
||||||
|
],
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
|
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
|
||||||
|
|
||||||
|
@ -432,12 +501,6 @@ class SlackAlerting:
|
||||||
level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'.
|
level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'.
|
||||||
message: str - what is the alert about
|
message: str - what is the alert about
|
||||||
"""
|
"""
|
||||||
print(
|
|
||||||
"inside send alert for slack, message: ",
|
|
||||||
message,
|
|
||||||
"self.alerting: ",
|
|
||||||
self.alerting,
|
|
||||||
)
|
|
||||||
if self.alerting is None:
|
if self.alerting is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -453,7 +516,15 @@ class SlackAlerting:
|
||||||
if _proxy_base_url is not None:
|
if _proxy_base_url is not None:
|
||||||
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
|
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
|
||||||
|
|
||||||
|
# check if we find the slack webhook url in self.alert_to_webhook_url
|
||||||
|
if (
|
||||||
|
self.alert_to_webhook_url is not None
|
||||||
|
and alert_type in self.alert_to_webhook_url
|
||||||
|
):
|
||||||
|
slack_webhook_url = self.alert_to_webhook_url[alert_type]
|
||||||
|
else:
|
||||||
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
|
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
|
||||||
|
|
||||||
if slack_webhook_url is None:
|
if slack_webhook_url is None:
|
||||||
raise Exception("Missing SLACK_WEBHOOK_URL from environment")
|
raise Exception("Missing SLACK_WEBHOOK_URL from environment")
|
||||||
payload = {"text": formatted_message}
|
payload = {"text": formatted_message}
|
||||||
|
|
|
@@ -96,6 +96,15 @@ class AzureOpenAIConfig(OpenAIConfig):
            top_p,
        )

    def get_mapped_special_auth_params(self) -> dict:
        return {"token": "azure_ad_token"}

    def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
        for param, value in non_default_params.items():
            if param == "token":
                optional_params["azure_ad_token"] = value
        return optional_params


def select_azure_base_url_or_endpoint(azure_client_params: dict):
    # azure_client_params = {
@@ -29,6 +29,24 @@ class BedrockError(Exception):
        )  # Call the base class constructor with the parameters it needs


class AmazonBedrockGlobalConfig:
    def __init__(self):
        pass

    def get_mapped_special_auth_params(self) -> dict:
        """
        Mapping of common auth params across bedrock/vertex/azure/watsonx
        """
        return {"region_name": "aws_region_name"}

    def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
        mapped_params = self.get_mapped_special_auth_params()
        for param, value in non_default_params.items():
            if param in mapped_params:
                optional_params[mapped_params[param]] = value
        return optional_params


class AmazonTitanConfig:
    """
    Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-text-express-v1

@ -666,6 +684,10 @@ def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
|
||||||
prompt = prompt_factory(
|
prompt = prompt_factory(
|
||||||
model=model, messages=messages, custom_llm_provider="bedrock"
|
model=model, messages=messages, custom_llm_provider="bedrock"
|
||||||
)
|
)
|
||||||
|
elif provider == "meta":
|
||||||
|
prompt = prompt_factory(
|
||||||
|
model=model, messages=messages, custom_llm_provider="bedrock"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
prompt = ""
|
prompt = ""
|
||||||
for message in messages:
|
for message in messages:
|
||||||
|
@ -945,7 +967,7 @@ def completion(
|
||||||
original_response=json.dumps(response_body),
|
original_response=json.dumps(response_body),
|
||||||
additional_args={"complete_input_dict": data},
|
additional_args={"complete_input_dict": data},
|
||||||
)
|
)
|
||||||
print_verbose(f"raw model_response: {response}")
|
print_verbose(f"raw model_response: {response_body}")
|
||||||
## RESPONSE OBJECT
|
## RESPONSE OBJECT
|
||||||
outputText = "default"
|
outputText = "default"
|
||||||
if provider == "ai21":
|
if provider == "ai21":
|
||||||
|
@ -1058,6 +1080,7 @@ def completion(
|
||||||
outputText = response_body.get("results")[0].get("outputText")
|
outputText = response_body.get("results")[0].get("outputText")
|
||||||
|
|
||||||
response_metadata = response.get("ResponseMetadata", {})
|
response_metadata = response.get("ResponseMetadata", {})
|
||||||
|
|
||||||
if response_metadata.get("HTTPStatusCode", 500) >= 400:
|
if response_metadata.get("HTTPStatusCode", 500) >= 400:
|
||||||
raise BedrockError(
|
raise BedrockError(
|
||||||
message=outputText,
|
message=outputText,
|
||||||
|
@ -1093,11 +1116,13 @@ def completion(
|
||||||
prompt_tokens = response_metadata.get(
|
prompt_tokens = response_metadata.get(
|
||||||
"x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
|
"x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
|
||||||
)
|
)
|
||||||
|
_text_response = model_response["choices"][0]["message"].get("content", "")
|
||||||
completion_tokens = response_metadata.get(
|
completion_tokens = response_metadata.get(
|
||||||
"x-amzn-bedrock-output-token-count",
|
"x-amzn-bedrock-output-token-count",
|
||||||
len(
|
len(
|
||||||
encoding.encode(
|
encoding.encode(
|
||||||
model_response["choices"][0]["message"].get("content", "")
|
_text_response,
|
||||||
|
disallowed_special=(),
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
|
@@ -213,12 +213,13 @@ def get_ollama_response(
    ## RESPONSE OBJECT
    model_response["choices"][0]["finish_reason"] = "stop"
    if optional_params.get("format", "") == "json":
        function_call = json.loads(response_json["response"])
        message = litellm.Message(
            content=None,
            tool_calls=[
                {
                    "id": f"call_{str(uuid.uuid4())}",
                    "function": {"arguments": response_json["response"], "name": ""},
                    "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
                    "type": "function",
                }
            ],
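# Illustrative sketch (not part of the diff): with format="json", Ollama's "response" field is
# itself a JSON-encoded function call, so it is parsed and re-emitted as an OpenAI-style tool
# call with a real function name and JSON-stringified arguments, e.g.
#
#   raw = '{"name": "get_current_weather", "arguments": {"city": "Paris"}}'  # hypothetical output
#   function_call = json.loads(raw)
#   name = function_call["name"]                          # "get_current_weather"
#   arguments = json.dumps(function_call["arguments"])    # '{"city": "Paris"}'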
@ -310,15 +311,13 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
|
||||||
## RESPONSE OBJECT
|
## RESPONSE OBJECT
|
||||||
model_response["choices"][0]["finish_reason"] = "stop"
|
model_response["choices"][0]["finish_reason"] = "stop"
|
||||||
if data.get("format", "") == "json":
|
if data.get("format", "") == "json":
|
||||||
|
function_call = json.loads(response_json["response"])
|
||||||
message = litellm.Message(
|
message = litellm.Message(
|
||||||
content=None,
|
content=None,
|
||||||
tool_calls=[
|
tool_calls=[
|
||||||
{
|
{
|
||||||
"id": f"call_{str(uuid.uuid4())}",
|
"id": f"call_{str(uuid.uuid4())}",
|
||||||
"function": {
|
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
|
||||||
"arguments": response_json["response"],
|
|
||||||
"name": "",
|
|
||||||
},
|
|
||||||
"type": "function",
|
"type": "function",
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
|
@ -285,15 +285,13 @@ def get_ollama_response(
|
||||||
## RESPONSE OBJECT
|
## RESPONSE OBJECT
|
||||||
model_response["choices"][0]["finish_reason"] = "stop"
|
model_response["choices"][0]["finish_reason"] = "stop"
|
||||||
if data.get("format", "") == "json":
|
if data.get("format", "") == "json":
|
||||||
|
function_call = json.loads(response_json["message"]["content"])
|
||||||
message = litellm.Message(
|
message = litellm.Message(
|
||||||
content=None,
|
content=None,
|
||||||
tool_calls=[
|
tool_calls=[
|
||||||
{
|
{
|
||||||
"id": f"call_{str(uuid.uuid4())}",
|
"id": f"call_{str(uuid.uuid4())}",
|
||||||
"function": {
|
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
|
||||||
"arguments": response_json["message"]["content"],
|
|
||||||
"name": "",
|
|
||||||
},
|
|
||||||
"type": "function",
|
"type": "function",
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@ -415,15 +413,13 @@ async def ollama_acompletion(
|
||||||
## RESPONSE OBJECT
|
## RESPONSE OBJECT
|
||||||
model_response["choices"][0]["finish_reason"] = "stop"
|
model_response["choices"][0]["finish_reason"] = "stop"
|
||||||
if data.get("format", "") == "json":
|
if data.get("format", "") == "json":
|
||||||
|
function_call = json.loads(response_json["message"]["content"])
|
||||||
message = litellm.Message(
|
message = litellm.Message(
|
||||||
content=None,
|
content=None,
|
||||||
tool_calls=[
|
tool_calls=[
|
||||||
{
|
{
|
||||||
"id": f"call_{str(uuid.uuid4())}",
|
"id": f"call_{str(uuid.uuid4())}",
|
||||||
"function": {
|
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
|
||||||
"arguments": response_json["message"]["content"],
|
|
||||||
"name": function_name or "",
|
|
||||||
},
|
|
||||||
"type": "function",
|
"type": "function",
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
|
@ -447,6 +447,7 @@ class OpenAIChatCompletion(BaseLLM):
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
openai_aclient = client
|
openai_aclient = client
|
||||||
|
|
||||||
## LOGGING
|
## LOGGING
|
||||||
logging_obj.pre_call(
|
logging_obj.pre_call(
|
||||||
input=data["messages"],
|
input=data["messages"],
|
||||||
|
|
|
@ -3,8 +3,14 @@ import requests, traceback
|
||||||
import json, re, xml.etree.ElementTree as ET
|
import json, re, xml.etree.ElementTree as ET
|
||||||
from jinja2 import Template, exceptions, meta, BaseLoader
|
from jinja2 import Template, exceptions, meta, BaseLoader
|
||||||
from jinja2.sandbox import ImmutableSandboxedEnvironment
|
from jinja2.sandbox import ImmutableSandboxedEnvironment
|
||||||
from typing import Optional, Any
|
from typing import (
|
||||||
from typing import List
|
Any,
|
||||||
|
List,
|
||||||
|
Mapping,
|
||||||
|
MutableMapping,
|
||||||
|
Optional,
|
||||||
|
Sequence,
|
||||||
|
)
|
||||||
import litellm
|
import litellm
|
||||||
|
|
||||||
|
|
||||||
|
@@ -431,6 +437,35 @@ def format_prompt_togetherai(messages, prompt_format, chat_template):
    return prompt


### IBM Granite


def ibm_granite_pt(messages: list):
    """
    IBM's Granite models use the template:
    <|system|> {system_message} <|user|> {user_message} <|assistant|> {assistant_message}

    See: https://www.ibm.com/docs/en/watsonx-as-a-service?topic=solutions-supported-foundation-models
    """
    return custom_prompt(
        messages=messages,
        role_dict={
            "system": {
                "pre_message": "<|system|>\n",
                "post_message": "\n",
            },
            "user": {
                "pre_message": "<|user|>\n",
                "post_message": "\n",
            },
            "assistant": {
                "pre_message": "<|assistant|>\n",
                "post_message": "\n",
            },
        },
    ).strip()

### ANTHROPIC ###
|
### ANTHROPIC ###
|
||||||
|
|
||||||
|
|
||||||
|
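As a quick illustration of the Granite template added above, here is a hedged sketch of the prompt string `ibm_granite_pt` should produce for a two-message conversation (assuming `custom_prompt` concatenates `pre_message + content + post_message` per turn, as the other templates in this file do):

```python
from litellm.llms.prompt_templates.factory import ibm_granite_pt

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "What is watsonx.ai?"},
]

print(ibm_granite_pt(messages))
# Expected (hedged) output - the trailing newline is removed by .strip():
# <|system|>
# You are a concise assistant.
# <|user|>
# What is watsonx.ai?
```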
@@ -1017,6 +1052,30 @@ def get_system_prompt(messages):
     return system_prompt, messages


+def convert_to_documents(
+    observations: Any,
+) -> List[MutableMapping]:
+    """Converts observations into a 'document' dict"""
+    documents: List[MutableMapping] = []
+    if isinstance(observations, str):
+        # strings are turned into a key/value pair and a key of 'output' is added.
+        observations = [{"output": observations}]
+    elif isinstance(observations, Mapping):
+        # single mappings are transformed into a list to simplify the rest of the code.
+        observations = [observations]
+    elif not isinstance(observations, Sequence):
+        # all other types are turned into a key/value pair within a list
+        observations = [{"output": observations}]
+
+    for doc in observations:
+        if not isinstance(doc, Mapping):
+            # types that aren't Mapping are turned into a key/value pair.
+            doc = {"output": doc}
+        documents.append(doc)
+
+    return documents
+
+
 def convert_openai_message_to_cohere_tool_result(message):
     """
     OpenAI message with a tool result looks like:
@@ -1058,7 +1117,7 @@ def convert_openai_message_to_cohere_tool_result(message):
             "parameters": {"location": "San Francisco, CA"},
             "generation_id": tool_call_id,
         },
-        "outputs": [content],
+        "outputs": convert_to_documents(content),
     }
     return cohere_tool_result

@@ -1071,7 +1130,7 @@ def cohere_message_pt(messages: list):
         if message["role"] == "tool":
             tool_result = convert_openai_message_to_cohere_tool_result(message)
             tool_results.append(tool_result)
-        else:
+        elif message.get("content"):
             prompt += message["content"] + "\n\n"
     prompt = prompt.rstrip()
     return prompt, tool_results
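A short sketch of how the new `convert_to_documents` helper normalizes the different shapes a Cohere tool result can take, derived directly from the branches above:

```python
from litellm.llms.prompt_templates.factory import convert_to_documents

# a plain string becomes a single {"output": ...} document
convert_to_documents("72F and sunny")
# -> [{"output": "72F and sunny"}]

# a single mapping is kept as-is, just wrapped in a list
convert_to_documents({"temperature": 72, "unit": "F"})
# -> [{"temperature": 72, "unit": "F"}]

# lists may mix mappings and scalars; non-mapping items are wrapped
convert_to_documents([{"city": "SF"}, 72])
# -> [{"city": "SF"}, {"output": 72}]
```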
@@ -1346,12 +1405,47 @@ def prompt_factory(
             return anthropic_pt(messages=messages)
         elif "mistral." in model:
             return mistral_instruct_pt(messages=messages)
+        elif "llama2" in model and "chat" in model:
+            return llama_2_chat_pt(messages=messages)
+        elif "llama3" in model and "instruct" in model:
+            return hf_chat_template(
+                model="meta-llama/Meta-Llama-3-8B-Instruct",
+                messages=messages,
+            )
     elif custom_llm_provider == "perplexity":
         for message in messages:
             message.pop("name", None)
         return messages
     elif custom_llm_provider == "azure_text":
         return azure_text_pt(messages=messages)
+    elif custom_llm_provider == "watsonx":
+        if "granite" in model and "chat" in model:
+            # granite-13b-chat-v1 and granite-13b-chat-v2 use a specific prompt template
+            return ibm_granite_pt(messages=messages)
+        elif "ibm-mistral" in model and "instruct" in model:
+            # models like ibm-mistral/mixtral-8x7b-instruct-v01-q use the mistral instruct prompt template
+            return mistral_instruct_pt(messages=messages)
+        elif "meta-llama/llama-3" in model and "instruct" in model:
+            # https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
+            return custom_prompt(
+                role_dict={
+                    "system": {
+                        "pre_message": "<|start_header_id|>system<|end_header_id|>\n",
+                        "post_message": "<|eot_id|>",
+                    },
+                    "user": {
+                        "pre_message": "<|start_header_id|>user<|end_header_id|>\n",
+                        "post_message": "<|eot_id|>",
+                    },
+                    "assistant": {
+                        "pre_message": "<|start_header_id|>assistant<|end_header_id|>\n",
+                        "post_message": "<|eot_id|>",
+                    },
+                },
+                messages=messages,
+                initial_prompt_value="<|begin_of_text|>",
+                final_prompt_value="<|start_header_id|>assistant<|end_header_id|>\n",
+            )
     try:
         if "meta-llama/llama-2" in model and "chat" in model:
             return llama_2_chat_pt(messages=messages)
@@ -1359,11 +1453,8 @@ def prompt_factory(
             "meta-llama/llama-3" in model or "meta-llama-3" in model
         ) and "instruct" in model:
             return hf_chat_template(
-                model=model,
+                model="meta-llama/Meta-Llama-3-8B-Instruct",
                 messages=messages,
-                chat_template=known_tokenizer_config[  # type: ignore
-                    "meta-llama/Meta-Llama-3-8B-Instruct"
-                ]["tokenizer"]["chat_template"],
             )
         elif (
             "tiiuae/falcon" in model
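For the new watsonx Llama-3 branch above, a hedged sketch of the rendered prompt (the model id is a placeholder; exact whitespace follows from `custom_prompt` concatenating `initial_prompt_value`, the per-role markers, and `final_prompt_value`):

```python
from litellm.llms.prompt_templates.factory import prompt_factory

messages = [
    {"role": "system", "content": "Be brief."},
    {"role": "user", "content": "Hi!"},
]
prompt = prompt_factory(
    model="meta-llama/llama-3-8b-instruct",
    messages=messages,
    custom_llm_provider="watsonx",
)
# Expected (hedged) shape:
# <|begin_of_text|><|start_header_id|>system<|end_header_id|>
# Be brief.<|eot_id|><|start_header_id|>user<|end_header_id|>
# Hi!<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```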
@@ -112,10 +112,16 @@ def start_prediction(
     }

     initial_prediction_data = {
-        "version": version_id,
         "input": input_data,
     }

+    if ":" in version_id and len(version_id) > 64:
+        model_parts = version_id.split(":")
+        if (
+            len(model_parts) > 1 and len(model_parts[1]) == 64
+        ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
+            initial_prediction_data["version"] = model_parts[1]
+
     ## LOGGING
     logging_obj.pre_call(
         input=input_data["prompt"],
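A small sketch of what the new Replicate version-id handling does (the long hash is the example from the code comment above; plain model names without a version hash no longer set a "version" field at all):

```python
version_id = (
    "meta/llama-2-70b-chat:"
    "02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
)
initial_prediction_data = {"input": {"prompt": "hi"}}

model_parts = version_id.split(":")
# the second part is a 64-character version hash, so only that hash is sent
# to Replicate as the prediction "version"
if len(model_parts) > 1 and len(model_parts[1]) == 64:
    initial_prediction_data["version"] = model_parts[1]
```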
@@ -143,7 +143,9 @@ class VertexAIConfig:
                 optional_params["temperature"] = value
             if param == "top_p":
                 optional_params["top_p"] = value
-            if param == "stream":
+            if (
+                param == "stream" and value == True
+            ):  # sending stream = False, can cause it to get passed unchecked and raise issues
                 optional_params["stream"] = value
             if param == "n":
                 optional_params["candidate_count"] = value
@@ -182,6 +184,20 @@ class VertexAIConfig:
             pass
         return optional_params

+    def get_mapped_special_auth_params(self) -> dict:
+        """
+        Common auth params across bedrock/vertex_ai/azure/watsonx
+        """
+        return {"project": "vertex_project", "region_name": "vertex_location"}
+
+    def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
+        mapped_params = self.get_mapped_special_auth_params()
+
+        for param, value in non_default_params.items():
+            if param in mapped_params:
+                optional_params[mapped_params[param]] = value
+        return optional_params
+

 import asyncio
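A minimal sketch of the new common-auth-param mapping (the project and region values are placeholders):

```python
from litellm.llms.vertex_ai import VertexAIConfig

optional_params: dict = {}
VertexAIConfig().map_special_auth_params(
    non_default_params={"project": "my-gcp-project", "region_name": "us-central1"},
    optional_params=optional_params,
)
# optional_params is now
# {"vertex_project": "my-gcp-project", "vertex_location": "us-central1"}
```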
@@ -527,6 +543,7 @@ def completion(
                 "instances": instances,
                 "vertex_location": vertex_location,
                 "vertex_project": vertex_project,
+                "safety_settings": safety_settings,
                 **optional_params,
             }
             if optional_params.get("stream", False) is True:
@@ -541,8 +558,9 @@ def completion(
             tools = optional_params.pop("tools", None)
             prompt, images = _gemini_vision_convert_messages(messages=messages)
             content = [prompt] + images
-            if "stream" in optional_params and optional_params["stream"] == True:
-                stream = optional_params.pop("stream")
+            stream = optional_params.pop("stream", False)
+            if stream == True:
+
                 request_str += f"response = llm_model.generate_content({content}, generation_config=GenerationConfig(**{optional_params}), safety_settings={safety_settings}, stream={stream})\n"
                 logging_obj.pre_call(
                     input=prompt,
@@ -810,6 +828,7 @@ async def async_completion(
     instances=None,
     vertex_project=None,
     vertex_location=None,
+    safety_settings=None,
     **optional_params,
 ):
     """
@@ -820,6 +839,7 @@ async def async_completion(
             print_verbose("\nMaking VertexAI Gemini Pro/Vision Call")
             print_verbose(f"\nProcessing input messages = {messages}")
             tools = optional_params.pop("tools", None)
+            stream = optional_params.pop("stream", False)

             prompt, images = _gemini_vision_convert_messages(messages=messages)
             content = [prompt] + images
@@ -840,6 +860,7 @@ async def async_completion(
                 response = await llm_model._generate_content_async(
                     contents=content,
                     generation_config=optional_params,
+                    safety_settings=safety_settings,
                     tools=tools,
                 )

@@ -1018,6 +1039,7 @@ async def async_streaming(
     instances=None,
     vertex_project=None,
     vertex_location=None,
+    safety_settings=None,
     **optional_params,
 ):
     """
@@ -1044,6 +1066,7 @@ async def async_streaming(
                 response = await llm_model._generate_content_streaming_async(
                     contents=content,
                     generation_config=optional_params,
+                    safety_settings=safety_settings,
                     tools=tools,
                 )

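With `safety_settings` now plumbed through the sync, async, and streaming Gemini paths, a hedged usage sketch (the model id and the category/threshold values are placeholders following Google's SafetySetting naming):

```python
import litellm

response = litellm.completion(
    model="vertex_ai/gemini-pro",
    messages=[{"role": "user", "content": "hello"}],
    # forwarded as-is into generate_content(..., safety_settings=...)
    safety_settings=[
        {
            "category": "HARM_CATEGORY_HARASSMENT",
            "threshold": "BLOCK_ONLY_HIGH",
        }
    ],
)
print(response.choices[0].message.content)
```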
litellm/llms/watsonx.py — 609 lines (new file)
@@ -0,0 +1,609 @@
||||||
|
from enum import Enum
|
||||||
|
import json, types, time # noqa: E401
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from typing import Callable, Dict, Optional, Any, Union, List
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import requests
|
||||||
|
import litellm
|
||||||
|
from litellm.utils import ModelResponse, get_secret, Usage
|
||||||
|
|
||||||
|
from .base import BaseLLM
|
||||||
|
from .prompt_templates import factory as ptf
|
||||||
|
|
||||||
|
|
||||||
|
class WatsonXAIError(Exception):
|
||||||
|
def __init__(self, status_code, message, url: Optional[str] = None):
|
||||||
|
self.status_code = status_code
|
||||||
|
self.message = message
|
||||||
|
url = url or "https://us-south.ml.cloud.ibm.com"
|
||||||
|
self.request = httpx.Request(method="POST", url=url)
|
||||||
|
self.response = httpx.Response(status_code=status_code, request=self.request)
|
||||||
|
super().__init__(
|
||||||
|
self.message
|
||||||
|
) # Call the base class constructor with the parameters it needs
|
||||||
|
|
||||||
|
|
||||||
|
class IBMWatsonXAIConfig:
|
||||||
|
"""
|
||||||
|
Reference: https://cloud.ibm.com/apidocs/watsonx-ai#text-generation
|
||||||
|
(See ibm_watsonx_ai.metanames.GenTextParamsMetaNames for a list of all available params)
|
||||||
|
|
||||||
|
Supported params for all available watsonx.ai foundational models.
|
||||||
|
|
||||||
|
- `decoding_method` (str): One of "greedy" or "sample"
|
||||||
|
|
||||||
|
- `temperature` (float): Sets the model temperature for sampling - not available when decoding_method='greedy'.
|
||||||
|
|
||||||
|
- `max_new_tokens` (integer): Maximum length of the generated tokens.
|
||||||
|
|
||||||
|
- `min_new_tokens` (integer): Minimum number of new tokens to be generated.
|
||||||
|
|
||||||
|
- `length_penalty` (dict): A dictionary with keys "decay_factor" and "start_index".
|
||||||
|
|
||||||
|
- `stop_sequences` (string[]): list of strings to use as stop sequences.
|
||||||
|
|
||||||
|
- `top_k` (integer): top k for sampling - not available when decoding_method='greedy'.
|
||||||
|
|
||||||
|
- `top_p` (float): top p for sampling - not available when decoding_method='greedy'.
|
||||||
|
|
||||||
|
- `repetition_penalty` (float): token repetition penalty during text generation.
|
||||||
|
|
||||||
|
- `truncate_input_tokens` (integer): Truncate input tokens to this length.
|
||||||
|
|
||||||
|
- `include_stop_sequences` (bool): If True, the stop sequence will be included at the end of the generated text in the case of a match.
|
||||||
|
|
||||||
|
- `return_options` (dict): A dictionary of options to return. Options include "input_text", "generated_tokens", "input_tokens", "token_ranks". Values are boolean.
|
||||||
|
|
||||||
|
- `random_seed` (integer): Random seed for text generation.
|
||||||
|
|
||||||
|
- `moderations` (dict): Dictionary of properties that control the moderations, for usages such as Hate and profanity (HAP) and PII filtering.
|
||||||
|
|
||||||
|
- `stream` (bool): If True, the model will return a stream of responses.
|
||||||
|
"""
|
||||||
|
|
||||||
|
decoding_method: Optional[str] = "sample"
|
||||||
|
temperature: Optional[float] = None
|
||||||
|
max_new_tokens: Optional[int] = None # litellm.max_tokens
|
||||||
|
min_new_tokens: Optional[int] = None
|
||||||
|
length_penalty: Optional[dict] = None # e.g {"decay_factor": 2.5, "start_index": 5}
|
||||||
|
stop_sequences: Optional[List[str]] = None # e.g ["}", ")", "."]
|
||||||
|
top_k: Optional[int] = None
|
||||||
|
top_p: Optional[float] = None
|
||||||
|
repetition_penalty: Optional[float] = None
|
||||||
|
truncate_input_tokens: Optional[int] = None
|
||||||
|
include_stop_sequences: Optional[bool] = False
|
||||||
|
return_options: Optional[Dict[str, bool]] = None
|
||||||
|
random_seed: Optional[int] = None # e.g 42
|
||||||
|
moderations: Optional[dict] = None
|
||||||
|
stream: Optional[bool] = False
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
decoding_method: Optional[str] = None,
|
||||||
|
temperature: Optional[float] = None,
|
||||||
|
max_new_tokens: Optional[int] = None,
|
||||||
|
min_new_tokens: Optional[int] = None,
|
||||||
|
length_penalty: Optional[dict] = None,
|
||||||
|
stop_sequences: Optional[List[str]] = None,
|
||||||
|
top_k: Optional[int] = None,
|
||||||
|
top_p: Optional[float] = None,
|
||||||
|
repetition_penalty: Optional[float] = None,
|
||||||
|
truncate_input_tokens: Optional[int] = None,
|
||||||
|
include_stop_sequences: Optional[bool] = None,
|
||||||
|
return_options: Optional[dict] = None,
|
||||||
|
random_seed: Optional[int] = None,
|
||||||
|
moderations: Optional[dict] = None,
|
||||||
|
stream: Optional[bool] = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> None:
|
||||||
|
locals_ = locals()
|
||||||
|
for key, value in locals_.items():
|
||||||
|
if key != "self" and value is not None:
|
||||||
|
setattr(self.__class__, key, value)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_config(cls):
|
||||||
|
return {
|
||||||
|
k: v
|
||||||
|
for k, v in cls.__dict__.items()
|
||||||
|
if not k.startswith("__")
|
||||||
|
and not isinstance(
|
||||||
|
v,
|
||||||
|
(
|
||||||
|
types.FunctionType,
|
||||||
|
types.BuiltinFunctionType,
|
||||||
|
classmethod,
|
||||||
|
staticmethod,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
and v is not None
|
||||||
|
}
|
||||||
|
|
||||||
|
def get_supported_openai_params(self):
|
||||||
|
return [
|
||||||
|
"temperature", # equivalent to temperature
|
||||||
|
"max_tokens", # equivalent to max_new_tokens
|
||||||
|
"top_p", # equivalent to top_p
|
||||||
|
"frequency_penalty", # equivalent to repetition_penalty
|
||||||
|
"stop", # equivalent to stop_sequences
|
||||||
|
"seed", # equivalent to random_seed
|
||||||
|
"stream", # equivalent to stream
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_mapped_special_auth_params(self) -> dict:
|
||||||
|
"""
|
||||||
|
Common auth params across bedrock/vertex_ai/azure/watsonx
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
"project": "watsonx_project",
|
||||||
|
"region_name": "watsonx_region_name",
|
||||||
|
"token": "watsonx_token",
|
||||||
|
}
|
||||||
|
|
||||||
|
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
|
||||||
|
mapped_params = self.get_mapped_special_auth_params()
|
||||||
|
|
||||||
|
for param, value in non_default_params.items():
|
||||||
|
if param in mapped_params:
|
||||||
|
optional_params[mapped_params[param]] = value
|
||||||
|
return optional_params
|
||||||
|
|
||||||
|
|
||||||
|
def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
|
||||||
|
# handle anthropic prompts and amazon titan prompts
|
||||||
|
if model in custom_prompt_dict:
|
||||||
|
# check if the model has a registered custom prompt
|
||||||
|
model_prompt_dict = custom_prompt_dict[model]
|
||||||
|
prompt = ptf.custom_prompt(
|
||||||
|
messages=messages,
|
||||||
|
role_dict=model_prompt_dict.get(
|
||||||
|
"role_dict", model_prompt_dict.get("roles")
|
||||||
|
),
|
||||||
|
initial_prompt_value=model_prompt_dict.get("initial_prompt_value", ""),
|
||||||
|
final_prompt_value=model_prompt_dict.get("final_prompt_value", ""),
|
||||||
|
bos_token=model_prompt_dict.get("bos_token", ""),
|
||||||
|
eos_token=model_prompt_dict.get("eos_token", ""),
|
||||||
|
)
|
||||||
|
return prompt
|
||||||
|
elif provider == "ibm":
|
||||||
|
prompt = ptf.prompt_factory(
|
||||||
|
model=model, messages=messages, custom_llm_provider="watsonx"
|
||||||
|
)
|
||||||
|
elif provider == "ibm-mistralai":
|
||||||
|
prompt = ptf.mistral_instruct_pt(messages=messages)
|
||||||
|
else:
|
||||||
|
prompt = ptf.prompt_factory(
|
||||||
|
model=model, messages=messages, custom_llm_provider="watsonx"
|
||||||
|
)
|
||||||
|
return prompt
|
||||||
|
|
||||||
|
|
||||||
|
class WatsonXAIEndpoint(str, Enum):
|
||||||
|
TEXT_GENERATION = "/ml/v1/text/generation"
|
||||||
|
TEXT_GENERATION_STREAM = "/ml/v1/text/generation_stream"
|
||||||
|
DEPLOYMENT_TEXT_GENERATION = "/ml/v1/deployments/{deployment_id}/text/generation"
|
||||||
|
DEPLOYMENT_TEXT_GENERATION_STREAM = (
|
||||||
|
"/ml/v1/deployments/{deployment_id}/text/generation_stream"
|
||||||
|
)
|
||||||
|
EMBEDDINGS = "/ml/v1/text/embeddings"
|
||||||
|
PROMPTS = "/ml/v1/prompts"
|
||||||
|
|
||||||
|
|
||||||
|
class IBMWatsonXAI(BaseLLM):
|
||||||
|
"""
|
||||||
|
Class to interface with IBM Watsonx.ai API for text generation and embeddings.
|
||||||
|
|
||||||
|
Reference: https://cloud.ibm.com/apidocs/watsonx-ai
|
||||||
|
"""
|
||||||
|
|
||||||
|
api_version = "2024-03-13"
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def _prepare_text_generation_req(
|
||||||
|
self,
|
||||||
|
model_id: str,
|
||||||
|
prompt: str,
|
||||||
|
stream: bool,
|
||||||
|
optional_params: dict,
|
||||||
|
print_verbose: Optional[Callable] = None,
|
||||||
|
) -> dict:
|
||||||
|
"""
|
||||||
|
Get the request parameters for text generation.
|
||||||
|
"""
|
||||||
|
api_params = self._get_api_params(optional_params, print_verbose=print_verbose)
|
||||||
|
# build auth headers
|
||||||
|
api_token = api_params.get("token")
|
||||||
|
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {api_token}",
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
"Accept": "application/json",
|
||||||
|
}
|
||||||
|
extra_body_params = optional_params.pop("extra_body", {})
|
||||||
|
optional_params.update(extra_body_params)
|
||||||
|
# init the payload to the text generation call
|
||||||
|
payload = {
|
||||||
|
"input": prompt,
|
||||||
|
"moderations": optional_params.pop("moderations", {}),
|
||||||
|
"parameters": optional_params,
|
||||||
|
}
|
||||||
|
request_params = dict(version=api_params["api_version"])
|
||||||
|
# text generation endpoint deployment or model / stream or not
|
||||||
|
if model_id.startswith("deployment/"):
|
||||||
|
# deployment models are passed in as 'deployment/<deployment_id>'
|
||||||
|
if api_params.get("space_id") is None:
|
||||||
|
raise WatsonXAIError(
|
||||||
|
status_code=401,
|
||||||
|
url=api_params["url"],
|
||||||
|
message="Error: space_id is required for models called using the 'deployment/' endpoint. Pass in the space_id as a parameter or set it in the WX_SPACE_ID environment variable.",
|
||||||
|
)
|
||||||
|
deployment_id = "/".join(model_id.split("/")[1:])
|
||||||
|
endpoint = (
|
||||||
|
WatsonXAIEndpoint.DEPLOYMENT_TEXT_GENERATION_STREAM.value
|
||||||
|
if stream
|
||||||
|
else WatsonXAIEndpoint.DEPLOYMENT_TEXT_GENERATION.value
|
||||||
|
)
|
||||||
|
endpoint = endpoint.format(deployment_id=deployment_id)
|
||||||
|
else:
|
||||||
|
payload["model_id"] = model_id
|
||||||
|
payload["project_id"] = api_params["project_id"]
|
||||||
|
endpoint = (
|
||||||
|
WatsonXAIEndpoint.TEXT_GENERATION_STREAM
|
||||||
|
if stream
|
||||||
|
else WatsonXAIEndpoint.TEXT_GENERATION
|
||||||
|
)
|
||||||
|
url = api_params["url"].rstrip("/") + endpoint
|
||||||
|
return dict(
|
||||||
|
method="POST", url=url, headers=headers, json=payload, params=request_params
|
||||||
|
)
|
||||||
|
|
||||||
|
def _get_api_params(
|
||||||
|
self, params: dict, print_verbose: Optional[Callable] = None
|
||||||
|
) -> dict:
|
||||||
|
"""
|
||||||
|
Find watsonx.ai credentials in the params or environment variables and return the headers for authentication.
|
||||||
|
"""
|
||||||
|
# Load auth variables from params
|
||||||
|
url = params.pop("url", params.pop("api_base", params.pop("base_url", None)))
|
||||||
|
api_key = params.pop("apikey", None)
|
||||||
|
token = params.pop("token", None)
|
||||||
|
project_id = params.pop(
|
||||||
|
"project_id", params.pop("watsonx_project", None)
|
||||||
|
) # watsonx.ai project_id - allow 'watsonx_project' to be consistent with how vertex project implementation works -> reduce provider-specific params
|
||||||
|
space_id = params.pop("space_id", None) # watsonx.ai deployment space_id
|
||||||
|
region_name = params.pop("region_name", params.pop("region", None))
|
||||||
|
if region_name is None:
|
||||||
|
region_name = params.pop(
|
||||||
|
"watsonx_region_name", params.pop("watsonx_region", None)
|
||||||
|
) # consistent with how vertex ai + aws regions are accepted
|
||||||
|
wx_credentials = params.pop(
|
||||||
|
"wx_credentials",
|
||||||
|
params.pop(
|
||||||
|
"watsonx_credentials", None
|
||||||
|
), # follow {provider}_credentials, same as vertex ai
|
||||||
|
)
|
||||||
|
api_version = params.pop("api_version", IBMWatsonXAI.api_version)
|
||||||
|
# Load auth variables from environment variables
|
||||||
|
if url is None:
|
||||||
|
url = (
|
||||||
|
get_secret("WATSONX_API_BASE") # consistent with 'AZURE_API_BASE'
|
||||||
|
or get_secret("WATSONX_URL")
|
||||||
|
or get_secret("WX_URL")
|
||||||
|
or get_secret("WML_URL")
|
||||||
|
)
|
||||||
|
if api_key is None:
|
||||||
|
api_key = (
|
||||||
|
get_secret("WATSONX_APIKEY")
|
||||||
|
or get_secret("WATSONX_API_KEY")
|
||||||
|
or get_secret("WX_API_KEY")
|
||||||
|
)
|
||||||
|
if token is None:
|
||||||
|
token = get_secret("WATSONX_TOKEN") or get_secret("WX_TOKEN")
|
||||||
|
if project_id is None:
|
||||||
|
project_id = (
|
||||||
|
get_secret("WATSONX_PROJECT_ID")
|
||||||
|
or get_secret("WX_PROJECT_ID")
|
||||||
|
or get_secret("PROJECT_ID")
|
||||||
|
)
|
||||||
|
if region_name is None:
|
||||||
|
region_name = (
|
||||||
|
get_secret("WATSONX_REGION")
|
||||||
|
or get_secret("WX_REGION")
|
||||||
|
or get_secret("REGION")
|
||||||
|
)
|
||||||
|
if space_id is None:
|
||||||
|
space_id = (
|
||||||
|
get_secret("WATSONX_DEPLOYMENT_SPACE_ID")
|
||||||
|
or get_secret("WATSONX_SPACE_ID")
|
||||||
|
or get_secret("WX_SPACE_ID")
|
||||||
|
or get_secret("SPACE_ID")
|
||||||
|
)
|
||||||
|
|
||||||
|
# credentials parsing
|
||||||
|
if wx_credentials is not None:
|
||||||
|
url = wx_credentials.get("url", url)
|
||||||
|
api_key = wx_credentials.get(
|
||||||
|
"apikey", wx_credentials.get("api_key", api_key)
|
||||||
|
)
|
||||||
|
token = wx_credentials.get(
|
||||||
|
"token",
|
||||||
|
wx_credentials.get(
|
||||||
|
"watsonx_token", token
|
||||||
|
), # follow format of {provider}_token, same as azure - e.g. 'azure_ad_token=..'
|
||||||
|
)
|
||||||
|
|
||||||
|
# verify that all required credentials are present
|
||||||
|
if url is None:
|
||||||
|
raise WatsonXAIError(
|
||||||
|
status_code=401,
|
||||||
|
message="Error: Watsonx URL not set. Set WX_URL in environment variables or pass in as a parameter.",
|
||||||
|
)
|
||||||
|
if token is None and api_key is not None:
|
||||||
|
# generate the auth token
|
||||||
|
if print_verbose:
|
||||||
|
print_verbose("Generating IAM token for Watsonx.ai")
|
||||||
|
token = self.generate_iam_token(api_key)
|
||||||
|
elif token is None and api_key is None:
|
||||||
|
raise WatsonXAIError(
|
||||||
|
status_code=401,
|
||||||
|
url=url,
|
||||||
|
message="Error: API key or token not found. Set WX_API_KEY or WX_TOKEN in environment variables or pass in as a parameter.",
|
||||||
|
)
|
||||||
|
if project_id is None:
|
||||||
|
raise WatsonXAIError(
|
||||||
|
status_code=401,
|
||||||
|
url=url,
|
||||||
|
message="Error: Watsonx project_id not set. Set WX_PROJECT_ID in environment variables or pass in as a parameter.",
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"url": url,
|
||||||
|
"api_key": api_key,
|
||||||
|
"token": token,
|
||||||
|
"project_id": project_id,
|
||||||
|
"space_id": space_id,
|
||||||
|
"region_name": region_name,
|
||||||
|
"api_version": api_version,
|
||||||
|
}
|
||||||
|
|
||||||
|
def completion(
|
||||||
|
self,
|
||||||
|
model: str,
|
||||||
|
messages: list,
|
||||||
|
custom_prompt_dict: dict,
|
||||||
|
model_response: ModelResponse,
|
||||||
|
print_verbose: Callable,
|
||||||
|
encoding,
|
||||||
|
logging_obj,
|
||||||
|
optional_params: dict,
|
||||||
|
litellm_params: Optional[dict] = None,
|
||||||
|
logger_fn=None,
|
||||||
|
timeout: Optional[float] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Send a text generation request to the IBM Watsonx.ai API.
|
||||||
|
Reference: https://cloud.ibm.com/apidocs/watsonx-ai#text-generation
|
||||||
|
"""
|
||||||
|
stream = optional_params.pop("stream", False)
|
||||||
|
|
||||||
|
# Load default configs
|
||||||
|
config = IBMWatsonXAIConfig.get_config()
|
||||||
|
for k, v in config.items():
|
||||||
|
if k not in optional_params:
|
||||||
|
optional_params[k] = v
|
||||||
|
|
||||||
|
# Make prompt to send to model
|
||||||
|
provider = model.split("/")[0]
|
||||||
|
# model_name = "/".join(model.split("/")[1:])
|
||||||
|
prompt = convert_messages_to_prompt(
|
||||||
|
model, messages, provider, custom_prompt_dict
|
||||||
|
)
|
||||||
|
|
||||||
|
def process_text_request(request_params: dict) -> ModelResponse:
|
||||||
|
with self._manage_response(
|
||||||
|
request_params, logging_obj=logging_obj, input=prompt, timeout=timeout
|
||||||
|
) as resp:
|
||||||
|
json_resp = resp.json()
|
||||||
|
|
||||||
|
generated_text = json_resp["results"][0]["generated_text"]
|
||||||
|
prompt_tokens = json_resp["results"][0]["input_token_count"]
|
||||||
|
completion_tokens = json_resp["results"][0]["generated_token_count"]
|
||||||
|
model_response["choices"][0]["message"]["content"] = generated_text
|
||||||
|
model_response["finish_reason"] = json_resp["results"][0]["stop_reason"]
|
||||||
|
model_response["created"] = int(time.time())
|
||||||
|
model_response["model"] = model
|
||||||
|
setattr(
|
||||||
|
model_response,
|
||||||
|
"usage",
|
||||||
|
Usage(
|
||||||
|
prompt_tokens=prompt_tokens,
|
||||||
|
completion_tokens=completion_tokens,
|
||||||
|
total_tokens=prompt_tokens + completion_tokens,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return model_response
|
||||||
|
|
||||||
|
def process_stream_request(
|
||||||
|
request_params: dict,
|
||||||
|
) -> litellm.CustomStreamWrapper:
|
||||||
|
# stream the response - generated chunks will be handled
|
||||||
|
# by litellm.utils.CustomStreamWrapper.handle_watsonx_stream
|
||||||
|
with self._manage_response(
|
||||||
|
request_params,
|
||||||
|
logging_obj=logging_obj,
|
||||||
|
stream=True,
|
||||||
|
input=prompt,
|
||||||
|
timeout=timeout,
|
||||||
|
) as resp:
|
||||||
|
response = litellm.CustomStreamWrapper(
|
||||||
|
resp.iter_lines(),
|
||||||
|
model=model,
|
||||||
|
custom_llm_provider="watsonx",
|
||||||
|
logging_obj=logging_obj,
|
||||||
|
)
|
||||||
|
return response
|
||||||
|
|
||||||
|
try:
|
||||||
|
## Get the response from the model
|
||||||
|
req_params = self._prepare_text_generation_req(
|
||||||
|
model_id=model,
|
||||||
|
prompt=prompt,
|
||||||
|
stream=stream,
|
||||||
|
optional_params=optional_params,
|
||||||
|
print_verbose=print_verbose,
|
||||||
|
)
|
||||||
|
if stream:
|
||||||
|
return process_stream_request(req_params)
|
||||||
|
else:
|
||||||
|
return process_text_request(req_params)
|
||||||
|
except WatsonXAIError as e:
|
||||||
|
raise e
|
||||||
|
except Exception as e:
|
||||||
|
raise WatsonXAIError(status_code=500, message=str(e))
|
||||||
|
|
||||||
|
def embedding(
|
||||||
|
self,
|
||||||
|
model: str,
|
||||||
|
input: Union[list, str],
|
||||||
|
api_key: Optional[str] = None,
|
||||||
|
logging_obj=None,
|
||||||
|
model_response=None,
|
||||||
|
optional_params=None,
|
||||||
|
encoding=None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Send a text embedding request to the IBM Watsonx.ai API.
|
||||||
|
"""
|
||||||
|
if optional_params is None:
|
||||||
|
optional_params = {}
|
||||||
|
# Load default configs
|
||||||
|
config = IBMWatsonXAIConfig.get_config()
|
||||||
|
for k, v in config.items():
|
||||||
|
if k not in optional_params:
|
||||||
|
optional_params[k] = v
|
||||||
|
|
||||||
|
# Load auth variables from environment variables
|
||||||
|
if isinstance(input, str):
|
||||||
|
input = [input]
|
||||||
|
if api_key is not None:
|
||||||
|
optional_params["api_key"] = api_key
|
||||||
|
api_params = self._get_api_params(optional_params)
|
||||||
|
# build auth headers
|
||||||
|
api_token = api_params.get("token")
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {api_token}",
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
"Accept": "application/json",
|
||||||
|
}
|
||||||
|
# init the payload to the text generation call
|
||||||
|
payload = {
|
||||||
|
"inputs": input,
|
||||||
|
"model_id": model,
|
||||||
|
"project_id": api_params["project_id"],
|
||||||
|
"parameters": optional_params,
|
||||||
|
}
|
||||||
|
request_params = dict(version=api_params["api_version"])
|
||||||
|
url = api_params["url"].rstrip("/") + WatsonXAIEndpoint.EMBEDDINGS
|
||||||
|
# request = httpx.Request(
|
||||||
|
# "POST", url, headers=headers, json=payload, params=request_params
|
||||||
|
# )
|
||||||
|
req_params = {
|
||||||
|
"method": "POST",
|
||||||
|
"url": url,
|
||||||
|
"headers": headers,
|
||||||
|
"json": payload,
|
||||||
|
"params": request_params,
|
||||||
|
}
|
||||||
|
with self._manage_response(
|
||||||
|
req_params, logging_obj=logging_obj, input=input
|
||||||
|
) as resp:
|
||||||
|
json_resp = resp.json()
|
||||||
|
|
||||||
|
results = json_resp.get("results", [])
|
||||||
|
embedding_response = []
|
||||||
|
for idx, result in enumerate(results):
|
||||||
|
embedding_response.append(
|
||||||
|
{"object": "embedding", "index": idx, "embedding": result["embedding"]}
|
||||||
|
)
|
||||||
|
model_response["object"] = "list"
|
||||||
|
model_response["data"] = embedding_response
|
||||||
|
model_response["model"] = model
|
||||||
|
input_tokens = json_resp.get("input_token_count", 0)
|
||||||
|
model_response.usage = Usage(
|
||||||
|
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
|
||||||
|
)
|
||||||
|
return model_response
|
||||||
|
|
||||||
|
def generate_iam_token(self, api_key=None, **params):
|
||||||
|
headers = {}
|
||||||
|
headers["Content-Type"] = "application/x-www-form-urlencoded"
|
||||||
|
if api_key is None:
|
||||||
|
api_key = get_secret("WX_API_KEY") or get_secret("WATSONX_API_KEY")
|
||||||
|
if api_key is None:
|
||||||
|
raise ValueError("API key is required")
|
||||||
|
headers["Accept"] = "application/json"
|
||||||
|
data = {
|
||||||
|
"grant_type": "urn:ibm:params:oauth:grant-type:apikey",
|
||||||
|
"apikey": api_key,
|
||||||
|
}
|
||||||
|
response = httpx.post(
|
||||||
|
"https://iam.cloud.ibm.com/identity/token", data=data, headers=headers
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
json_data = response.json()
|
||||||
|
iam_access_token = json_data["access_token"]
|
||||||
|
self.token = iam_access_token
|
||||||
|
return iam_access_token
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def _manage_response(
|
||||||
|
self,
|
||||||
|
request_params: dict,
|
||||||
|
logging_obj: Any,
|
||||||
|
stream: bool = False,
|
||||||
|
input: Optional[Any] = None,
|
||||||
|
timeout: Optional[float] = None,
|
||||||
|
):
|
||||||
|
request_str = (
|
||||||
|
f"response = {request_params['method']}(\n"
|
||||||
|
f"\turl={request_params['url']},\n"
|
||||||
|
f"\tjson={request_params['json']},\n"
|
||||||
|
f")"
|
||||||
|
)
|
||||||
|
logging_obj.pre_call(
|
||||||
|
input=input,
|
||||||
|
api_key=request_params["headers"].get("Authorization"),
|
||||||
|
additional_args={
|
||||||
|
"complete_input_dict": request_params["json"],
|
||||||
|
"request_str": request_str,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
if timeout:
|
||||||
|
request_params["timeout"] = timeout
|
||||||
|
try:
|
||||||
|
if stream:
|
||||||
|
resp = requests.request(
|
||||||
|
**request_params,
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
yield resp
|
||||||
|
else:
|
||||||
|
resp = requests.request(**request_params)
|
||||||
|
resp.raise_for_status()
|
||||||
|
yield resp
|
||||||
|
except Exception as e:
|
||||||
|
raise WatsonXAIError(status_code=500, message=str(e))
|
||||||
|
if not stream:
|
||||||
|
logging_obj.post_call(
|
||||||
|
input=input,
|
||||||
|
api_key=request_params["headers"].get("Authorization"),
|
||||||
|
original_response=json.dumps(resp.json()),
|
||||||
|
additional_args={
|
||||||
|
"status_code": resp.status_code,
|
||||||
|
"complete_input_dict": request_params["json"],
|
||||||
|
},
|
||||||
|
)
|
|
@@ -63,6 +63,7 @@ from .llms import (
     vertex_ai,
     vertex_ai_anthropic,
     maritalk,
+    watsonx,
 )
 from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
 from .llms.azure import AzureChatCompletion
@@ -360,7 +361,7 @@ def mock_completion(
     model: str,
     messages: List,
     stream: Optional[bool] = False,
-    mock_response: str = "This is a mock request",
+    mock_response: Union[str, Exception] = "This is a mock request",
     logging=None,
     **kwargs,
 ):
@@ -387,6 +388,20 @@ def mock_completion(
         - If 'stream' is True, it returns a response that mimics the behavior of a streaming completion.
     """
     try:
+        ## LOGGING
+        if logging is not None:
+            logging.pre_call(
+                input=messages,
+                api_key="mock-key",
+            )
+        if isinstance(mock_response, Exception):
+            raise litellm.APIError(
+                status_code=500,  # type: ignore
+                message=str(mock_response),
+                llm_provider="openai",  # type: ignore
+                model=model,  # type: ignore
+                request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
+            )
         model_response = ModelResponse(stream=stream)
         if stream is True:
             # don't try to access stream object,
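A hedged test sketch of the new behavior: with this change, passing an `Exception` as `mock_response` should make the mocked call raise a `litellm.APIError` instead of returning mock text, which is useful for exercising retry/fallback logic without hitting a provider:

```python
import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
        mock_response=Exception("simulated provider outage"),
    )
except litellm.APIError as e:
    print("caught mocked error:", e)
```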
@@ -1864,6 +1879,43 @@ def completion(

             ## RESPONSE OBJECT
             response = response
+        elif custom_llm_provider == "watsonx":
+            custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
+            response = watsonx.IBMWatsonXAI().completion(
+                model=model,
+                messages=messages,
+                custom_prompt_dict=custom_prompt_dict,
+                model_response=model_response,
+                print_verbose=print_verbose,
+                optional_params=optional_params,
+                litellm_params=litellm_params,  # type: ignore
+                logger_fn=logger_fn,
+                encoding=encoding,
+                logging_obj=logging,
+                timeout=timeout,
+            )
+            if (
+                "stream" in optional_params
+                and optional_params["stream"] == True
+                and not isinstance(response, CustomStreamWrapper)
+            ):
+                # don't try to access stream object,
+                response = CustomStreamWrapper(
+                    iter(response),
+                    model,
+                    custom_llm_provider="watsonx",
+                    logging_obj=logging,
+                )
+
+            if optional_params.get("stream", False):
+                ## LOGGING
+                logging.post_call(
+                    input=messages,
+                    api_key=None,
+                    original_response=response,
+                )
+            ## RESPONSE OBJECT
+            response = response
         elif custom_llm_provider == "vllm":
             custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
             model_response = vllm.completion(
@@ -2943,6 +2995,15 @@ def embedding(
             client=client,
             aembedding=aembedding,
         )
+    elif custom_llm_provider == "watsonx":
+        response = watsonx.IBMWatsonXAI().embedding(
+            model=model,
+            input=input,
+            encoding=encoding,
+            logging_obj=logging,
+            optional_params=optional_params,
+            model_response=EmbeddingResponse(),
+        )
     else:
         args = locals()
         raise ValueError(f"No valid embedding model args passed in - {args}")
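And a hedged sketch of the matching embedding path (the model id is a placeholder for any watsonx.ai embedding model; credentials are picked up from the same environment variables as above):

```python
import litellm

resp = litellm.embedding(
    model="watsonx/ibm/slate-30m-english-rtrvr",
    input=["good morning from litellm"],
)
print(len(resp.data[0]["embedding"]))
```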
@@ -1418,6 +1418,123 @@
         "litellm_provider": "replicate",
         "mode": "chat"
     },
+    "replicate/meta/llama-2-13b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-13b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-instruct-v0.2": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000003,
+        "output_cost_per_token": 0.000001,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
     "openrouter/openai/gpt-3.5-turbo": {
         "max_tokens": 4095,
         "input_cost_per_token": 0.0000015,
@@ -1455,6 +1572,17 @@
         "litellm_provider": "openrouter",
         "mode": "chat"
     },
+    "openrouter/anthropic/claude-3-opus": {
+        "max_tokens": 4096,
+        "max_input_tokens": 200000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.000015,
+        "output_cost_per_token": 0.000075,
+        "litellm_provider": "openrouter",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "tool_use_system_prompt_tokens": 395
+    },
     "openrouter/google/palm-2-chat-bison": {
         "max_tokens": 8000,
         "input_cost_per_token": 0.0000005,
@@ -2379,6 +2507,24 @@
         "litellm_provider": "bedrock",
         "mode": "chat"
     },
+    "meta.llama3-8b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.0000004,
+        "output_cost_per_token": 0.0000006,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "meta.llama3-70b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000265,
+        "output_cost_per_token": 0.0000035,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
         "max_tokens": 77,
         "max_input_tokens": 77,
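Since the new Bedrock Llama-3 entries above carry per-token prices, a hedged sketch of how they feed into cost tracking (assuming `litellm.cost_per_token`, which reads this model map):

```python
import litellm

prompt_cost, completion_cost = litellm.cost_per_token(
    model="meta.llama3-8b-instruct-v1:0",
    prompt_tokens=1000,
    completion_tokens=1000,
)
# with the prices above: 1000 * 0.0000004 = 0.0004 USD for the prompt
# and 1000 * 0.0000006 = 0.0006 USD for the completion
print(prompt_cost, completion_cost)
```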
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1 @@
|
||||||
|
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{93553:function(n,e,t){Promise.resolve().then(t.t.bind(t,63385,23)),Promise.resolve().then(t.t.bind(t,99646,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=93553)}),_N_E=n.O()}]);
|
|
@ -1 +0,0 @@
|
||||||
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1 +1 @@
|
||||||
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]);
|
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{70377:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(70377)}),_N_E=e.O()}]);
|
|
@ -1 +0,0 @@
|
||||||
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]);
|
|
|
@ -1 +1 @@
|
||||||
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/60d9f441227ccc7e.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var 
File diff suppressed because one or more lines are too long
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long
|
@ -1,51 +1,15 @@
|
||||||
environment_variables:
|
|
||||||
SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
|
|
||||||
general_settings:
|
|
||||||
alerting:
|
|
||||||
- slack
|
|
||||||
alerting_threshold: 300
|
|
||||||
database_connection_pool_limit: 100
|
|
||||||
database_connection_timeout: 60
|
|
||||||
health_check_interval: 300
|
|
||||||
proxy_batch_write_at: 10
|
|
||||||
ui_access_mode: all
|
|
||||||
litellm_settings:
|
|
||||||
allowed_fails: 3
|
|
||||||
failure_callback:
|
|
||||||
- prometheus
|
|
||||||
fallbacks:
|
|
||||||
- gpt-3.5-turbo:
|
|
||||||
- fake-openai-endpoint
|
|
||||||
- gpt-4
|
|
||||||
num_retries: 3
|
|
||||||
service_callback:
|
|
||||||
- prometheus_system
|
|
||||||
success_callback:
|
|
||||||
- prometheus
|
|
||||||
model_list:
|
model_list:
|
||||||
- litellm_params:
|
- litellm_params:
|
||||||
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
|
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
|
||||||
api_key: my-fake-key
|
api_key: my-fake-key
|
||||||
model: openai/my-fake-model
|
model: openai/my-fake-model
|
||||||
model_name: fake-openai-endpoint
|
model_name: fake-openai-endpoint
|
||||||
- litellm_params:
|
|
||||||
model: gpt-3.5-turbo
|
|
||||||
model_name: gpt-3.5-turbo
|
|
||||||
- model_name: llama-3
|
|
||||||
litellm_params:
|
|
||||||
model: replicate/meta/meta-llama-3-8b-instruct
|
|
||||||
router_settings:
|
router_settings:
|
||||||
allowed_fails: 3
|
num_retries: 0
|
||||||
context_window_fallbacks: null
|
enable_pre_call_checks: true
|
||||||
cooldown_time: 1
|
redis_host: os.environ/REDIS_HOST
|
||||||
fallbacks:
|
redis_password: os.environ/REDIS_PASSWORD
|
||||||
- gpt-3.5-turbo:
|
redis_port: os.environ/REDIS_PORT
|
||||||
- fake-openai-endpoint
|
|
||||||
- gpt-4
|
litellm_settings:
|
||||||
- gpt-3.5-turbo-3:
|
success_callback: ["openmeter"]
|
||||||
- fake-openai-endpoint
|
|
||||||
num_retries: 3
|
|
||||||
retry_after: 0
|
|
||||||
routing_strategy: simple-shuffle
|
|
||||||
routing_strategy_args: {}
|
|
||||||
timeout: 6000
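
The `router_settings` block above pulls its Redis credentials through the proxy's `os.environ/<VAR>` convention. Below is a minimal sketch of how such references can be resolved before the values reach the router; `resolve_env_refs` is an illustrative stand-in, not a litellm helper.

```python
# Minimal sketch: resolving "os.environ/<VAR>" references in a proxy-style
# settings dict. Illustrative only -- not litellm code.
import os
from typing import Any


def resolve_env_refs(value: Any) -> Any:
    """Recursively replace "os.environ/NAME" strings with os.environ["NAME"]."""
    if isinstance(value, str) and value.startswith("os.environ/"):
        return os.environ.get(value.split("/", 1)[1])
    if isinstance(value, dict):
        return {k: resolve_env_refs(v) for k, v in value.items()}
    if isinstance(value, list):
        return [resolve_env_refs(v) for v in value]
    return value


router_settings = {
    "num_retries": 0,
    "enable_pre_call_checks": True,
    "redis_host": "os.environ/REDIS_HOST",
    "redis_password": "os.environ/REDIS_PASSWORD",
    "redis_port": "os.environ/REDIS_PORT",
}

print(resolve_env_refs(router_settings))
```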
|
|
|
@ -422,6 +422,9 @@ class LiteLLM_ModelTable(LiteLLMBase):
|
||||||
created_by: str
|
created_by: str
|
||||||
updated_by: str
|
updated_by: str
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
protected_namespaces = ()
|
||||||
|
|
||||||
|
|
||||||
class NewUserRequest(GenerateKeyRequest):
|
class NewUserRequest(GenerateKeyRequest):
|
||||||
max_budget: Optional[float] = None
|
max_budget: Optional[float] = None
|
||||||
|
@ -485,6 +488,9 @@ class TeamBase(LiteLLMBase):
|
||||||
class NewTeamRequest(TeamBase):
|
class NewTeamRequest(TeamBase):
|
||||||
model_aliases: Optional[dict] = None
|
model_aliases: Optional[dict] = None
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
protected_namespaces = ()
|
||||||
|
|
||||||
|
|
||||||
class GlobalEndUsersSpend(LiteLLMBase):
|
class GlobalEndUsersSpend(LiteLLMBase):
|
||||||
api_key: Optional[str] = None
|
api_key: Optional[str] = None
|
||||||
|
@ -534,6 +540,9 @@ class LiteLLM_TeamTable(TeamBase):
|
||||||
budget_reset_at: Optional[datetime] = None
|
budget_reset_at: Optional[datetime] = None
|
||||||
model_id: Optional[int] = None
|
model_id: Optional[int] = None
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
protected_namespaces = ()
|
||||||
|
|
||||||
@root_validator(pre=True)
|
@root_validator(pre=True)
|
||||||
def set_model_info(cls, values):
|
def set_model_info(cls, values):
|
||||||
dict_fields = [
|
dict_fields = [
|
||||||
|
@ -570,6 +579,9 @@ class LiteLLM_BudgetTable(LiteLLMBase):
|
||||||
model_max_budget: Optional[dict] = None
|
model_max_budget: Optional[dict] = None
|
||||||
budget_duration: Optional[str] = None
|
budget_duration: Optional[str] = None
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
protected_namespaces = ()
|
||||||
|
|
||||||
|
|
||||||
class NewOrganizationRequest(LiteLLM_BudgetTable):
|
class NewOrganizationRequest(LiteLLM_BudgetTable):
|
||||||
organization_id: Optional[str] = None
|
organization_id: Optional[str] = None
|
||||||
|
@ -720,6 +732,10 @@ class ConfigGeneralSettings(LiteLLMBase):
|
||||||
None,
|
None,
|
||||||
description="List of alerting types. By default it is all alerts",
|
description="List of alerting types. By default it is all alerts",
|
||||||
)
|
)
|
||||||
|
alert_to_webhook_url: Optional[Dict] = Field(
|
||||||
|
None,
|
||||||
|
description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`",
|
||||||
|
)
|
||||||
|
|
||||||
alerting_threshold: Optional[int] = Field(
|
alerting_threshold: Optional[int] = Field(
|
||||||
None,
|
None,
|
||||||
|
@ -896,5 +912,19 @@ class LiteLLM_SpendLogs(LiteLLMBase):
|
||||||
request_tags: Optional[Json] = None
|
request_tags: Optional[Json] = None
|
||||||
|
|
||||||
|
|
||||||
|
class LiteLLM_ErrorLogs(LiteLLMBase):
|
||||||
|
request_id: Optional[str] = str(uuid.uuid4())
|
||||||
|
api_base: Optional[str] = ""
|
||||||
|
model_group: Optional[str] = ""
|
||||||
|
litellm_model_name: Optional[str] = ""
|
||||||
|
model_id: Optional[str] = ""
|
||||||
|
request_kwargs: Optional[dict] = {}
|
||||||
|
exception_type: Optional[str] = ""
|
||||||
|
status_code: Optional[str] = ""
|
||||||
|
exception_string: Optional[str] = ""
|
||||||
|
startTime: Union[str, datetime, None]
|
||||||
|
endTime: Union[str, datetime, None]
|
||||||
|
|
||||||
|
|
||||||
class LiteLLM_SpendLogs_ResponseObject(LiteLLMBase):
|
class LiteLLM_SpendLogs_ResponseObject(LiteLLMBase):
|
||||||
response: Optional[List[Union[LiteLLM_SpendLogs, Any]]] = None
|
response: Optional[List[Union[LiteLLM_SpendLogs, Any]]] = None
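
The new `LiteLLM_ErrorLogs` model above is the record the proxy's failure handler later writes to the database. A hedged sketch of that record shape and of the pydantic v1/v2-compatible dict conversion follows; the local `ErrorLogSketch` class only mirrors the fields for illustration and is not the litellm type.

```python
# Sketch of the error-log shape introduced above, using a local pydantic model
# so the example stays self-contained (importing litellm.proxy._types pulls in
# the full proxy dependencies).
import json
import uuid
from datetime import datetime
from typing import Optional

from pydantic import BaseModel


class ErrorLogSketch(BaseModel):
    request_id: str
    api_base: Optional[str] = ""
    model_group: Optional[str] = ""
    litellm_model_name: Optional[str] = ""
    model_id: Optional[str] = ""
    request_kwargs: Optional[dict] = {}
    exception_type: Optional[str] = ""
    status_code: Optional[str] = ""
    exception_string: Optional[str] = ""
    startTime: Optional[datetime] = None
    endTime: Optional[datetime] = None


log = ErrorLogSketch(
    request_id=str(uuid.uuid4()),
    model_group="gpt-3.5-turbo",
    litellm_model_name="openai/my-fake-model",
    request_kwargs={"temperature": "0.7"},
    exception_type="RateLimitError",
    status_code="429",
    exception_string="rate limited",
    startTime=datetime.now(),
    endTime=datetime.now(),
)

# works on pydantic v1 and v2 -- same idea as the proxy's compat helper
row = log.model_dump() if hasattr(log, "model_dump") else log.dict()
row["request_kwargs"] = json.dumps(row["request_kwargs"])  # stored as a Json column
print(row["request_id"], row["status_code"])
```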
|
||||||
|
|
|
@ -95,7 +95,15 @@ def common_checks(
|
||||||
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
|
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
|
||||||
)
|
)
|
||||||
# 7. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
|
# 7. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
|
||||||
if litellm.max_budget > 0 and global_proxy_spend is not None:
|
if (
|
||||||
|
litellm.max_budget > 0
|
||||||
|
and global_proxy_spend is not None
|
||||||
|
# only run global budget checks for OpenAI routes
|
||||||
|
# Reason - the Admin UI should continue working if the proxy crosses its global budget
|
||||||
|
and route in LiteLLMRoutes.openai_routes.value
|
||||||
|
and route != "/v1/models"
|
||||||
|
and route != "/models"
|
||||||
|
):
|
||||||
if global_proxy_spend > litellm.max_budget:
|
if global_proxy_spend > litellm.max_budget:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
f"ExceededBudget: LiteLLM Proxy has exceeded its budget. Current spend: {global_proxy_spend}; Max Budget: {litellm.max_budget}"
|
f"ExceededBudget: LiteLLM Proxy has exceeded its budget. Current spend: {global_proxy_spend}; Max Budget: {litellm.max_budget}"
|
||||||
|
|
|
@ -1059,8 +1059,18 @@ async def user_api_key_auth(
|
||||||
):
|
):
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
user_role = "unknown"
|
||||||
|
user_id = "unknown"
|
||||||
|
if user_id_information is not None and isinstance(
|
||||||
|
user_id_information, list
|
||||||
|
):
|
||||||
|
_user = user_id_information[0]
|
||||||
|
user_role = _user.get("user_role", {}).get(
|
||||||
|
"user_role", "unknown"
|
||||||
|
)
|
||||||
|
user_id = _user.get("user_id", "unknown")
|
||||||
raise Exception(
|
raise Exception(
|
||||||
f"Only master key can be used to generate, delete, update info for new keys/users/teams. Route={route}"
|
f"Only proxy admin can be used to generate, delete, update info for new keys/users/teams. Route={route}. Your role={user_role}. Your user_id={user_id}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# check if token is from litellm-ui, litellm ui makes keys to allow users to login with sso. These keys can only be used for LiteLLM UI functions
|
# check if token is from litellm-ui, litellm ui makes keys to allow users to login with sso. These keys can only be used for LiteLLM UI functions
|
||||||
|
@ -1207,6 +1217,68 @@ def cost_tracking():
|
||||||
litellm.success_callback.append(_PROXY_track_cost_callback) # type: ignore
|
litellm.success_callback.append(_PROXY_track_cost_callback) # type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
async def _PROXY_failure_handler(
|
||||||
|
kwargs, # kwargs to completion
|
||||||
|
completion_response: litellm.ModelResponse, # response from completion
|
||||||
|
start_time=None,
|
||||||
|
end_time=None, # start/end time for completion
|
||||||
|
):
|
||||||
|
global prisma_client
|
||||||
|
if prisma_client is not None:
|
||||||
|
verbose_proxy_logger.debug(
|
||||||
|
"inside _PROXY_failure_handler kwargs=", extra=kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
_exception = kwargs.get("exception")
|
||||||
|
_exception_type = _exception.__class__.__name__
|
||||||
|
_model = kwargs.get("model", None)
|
||||||
|
|
||||||
|
_optional_params = kwargs.get("optional_params", {})
|
||||||
|
_optional_params = copy.deepcopy(_optional_params)
|
||||||
|
|
||||||
|
for k, v in _optional_params.items():
|
||||||
|
v = str(v)
|
||||||
|
v = v[:100]
|
||||||
|
|
||||||
|
_status_code = "500"
|
||||||
|
try:
|
||||||
|
_status_code = str(_exception.status_code)
|
||||||
|
except:
|
||||||
|
# Don't let this block logging the exception to the DB
|
||||||
|
pass
|
||||||
|
|
||||||
|
_litellm_params = kwargs.get("litellm_params", {}) or {}
|
||||||
|
_metadata = _litellm_params.get("metadata", {}) or {}
|
||||||
|
_model_id = _metadata.get("model_info", {}).get("id", "")
|
||||||
|
_model_group = _metadata.get("model_group", "")
|
||||||
|
api_base = litellm.get_api_base(model=_model, optional_params=_litellm_params)
|
||||||
|
_exception_string = str(_exception)[:500]
|
||||||
|
|
||||||
|
error_log = LiteLLM_ErrorLogs(
|
||||||
|
request_id=str(uuid.uuid4()),
|
||||||
|
model_group=_model_group,
|
||||||
|
model_id=_model_id,
|
||||||
|
litellm_model_name=kwargs.get("model"),
|
||||||
|
request_kwargs=_optional_params,
|
||||||
|
api_base=api_base,
|
||||||
|
exception_type=_exception_type,
|
||||||
|
status_code=_status_code,
|
||||||
|
exception_string=_exception_string,
|
||||||
|
startTime=kwargs.get("start_time"),
|
||||||
|
endTime=kwargs.get("end_time"),
|
||||||
|
)
|
||||||
|
|
||||||
|
# helper function to convert to dict on pydantic v2 & v1
|
||||||
|
error_log_dict = _get_pydantic_json_dict(error_log)
|
||||||
|
error_log_dict["request_kwargs"] = json.dumps(error_log_dict["request_kwargs"])
|
||||||
|
|
||||||
|
await prisma_client.db.litellm_errorlogs.create(
|
||||||
|
data=error_log_dict # type: ignore
|
||||||
|
)
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
async def _PROXY_track_cost_callback(
|
async def _PROXY_track_cost_callback(
|
||||||
kwargs, # kwargs to completion
|
kwargs, # kwargs to completion
|
||||||
completion_response: litellm.ModelResponse, # response from completion
|
completion_response: litellm.ModelResponse, # response from completion
|
||||||
|
@ -1292,6 +1364,15 @@ async def _PROXY_track_cost_callback(
|
||||||
verbose_proxy_logger.debug("error in tracking cost callback - %s", e)
|
verbose_proxy_logger.debug("error in tracking cost callback - %s", e)
|
||||||
|
|
||||||
|
|
||||||
|
def error_tracking():
|
||||||
|
global prisma_client, custom_db_client
|
||||||
|
if prisma_client is not None or custom_db_client is not None:
|
||||||
|
if isinstance(litellm.failure_callback, list):
|
||||||
|
verbose_proxy_logger.debug("setting litellm failure callback to track cost")
|
||||||
|
if (_PROXY_failure_handler) not in litellm.failure_callback: # type: ignore
|
||||||
|
litellm.failure_callback.append(_PROXY_failure_handler) # type: ignore
|
||||||
|
|
||||||
|
|
||||||
def _set_spend_logs_payload(
|
def _set_spend_logs_payload(
|
||||||
payload: dict, prisma_client: PrismaClient, spend_logs_url: Optional[str] = None
|
payload: dict, prisma_client: PrismaClient, spend_logs_url: Optional[str] = None
|
||||||
):
|
):
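
`error_tracking()` above wires `_PROXY_failure_handler` into `litellm.failure_callback`. Below is a minimal sketch of the same registration pattern with a custom async handler; the handler body just prints a few fields instead of writing to Prisma.

```python
# Minimal sketch of the registration pattern used by error_tracking() above:
# an async failure callback appended to litellm.failure_callback.
import litellm


async def log_failure(kwargs, completion_response, start_time, end_time):
    # same fields the proxy handler reads from the callback kwargs
    exc = kwargs.get("exception")
    litellm_params = kwargs.get("litellm_params", {}) or {}
    metadata = litellm_params.get("metadata", {}) or {}
    print(
        "model:", kwargs.get("model"),
        "model_group:", metadata.get("model_group", ""),
        "error:", type(exc).__name__ if exc else None,
    )


# idempotent registration, same guard as the proxy uses
if log_failure not in litellm.failure_callback:
    litellm.failure_callback.append(log_failure)
```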
|
||||||
|
@ -2612,6 +2693,7 @@ class ProxyConfig:
|
||||||
environment_variables = config_data.get("environment_variables", {})
|
environment_variables = config_data.get("environment_variables", {})
|
||||||
for k, v in environment_variables.items():
|
for k, v in environment_variables.items():
|
||||||
try:
|
try:
|
||||||
|
if v is not None:
|
||||||
decoded_b64 = base64.b64decode(v)
|
decoded_b64 = base64.b64decode(v)
|
||||||
value = decrypt_value(value=decoded_b64, master_key=master_key) # type: ignore
|
value = decrypt_value(value=decoded_b64, master_key=master_key) # type: ignore
|
||||||
os.environ[k] = value
|
os.environ[k] = value
|
||||||
|
@ -2632,9 +2714,17 @@ class ProxyConfig:
|
||||||
if "alert_types" in _general_settings:
|
if "alert_types" in _general_settings:
|
||||||
general_settings["alert_types"] = _general_settings["alert_types"]
|
general_settings["alert_types"] = _general_settings["alert_types"]
|
||||||
proxy_logging_obj.alert_types = general_settings["alert_types"]
|
proxy_logging_obj.alert_types = general_settings["alert_types"]
|
||||||
proxy_logging_obj.slack_alerting_instance.alert_types = general_settings[
|
proxy_logging_obj.slack_alerting_instance.update_values(
|
||||||
"alert_types"
|
alert_types=general_settings["alert_types"]
|
||||||
|
)
|
||||||
|
|
||||||
|
if "alert_to_webhook_url" in _general_settings:
|
||||||
|
general_settings["alert_to_webhook_url"] = _general_settings[
|
||||||
|
"alert_to_webhook_url"
|
||||||
]
|
]
|
||||||
|
proxy_logging_obj.slack_alerting_instance.update_values(
|
||||||
|
alert_to_webhook_url=general_settings["alert_to_webhook_url"]
|
||||||
|
)
|
||||||
|
|
||||||
# router settings
|
# router settings
|
||||||
if llm_router is not None and prisma_client is not None:
|
if llm_router is not None and prisma_client is not None:
|
||||||
|
@ -3176,6 +3266,9 @@ async def startup_event():
|
||||||
## COST TRACKING ##
|
## COST TRACKING ##
|
||||||
cost_tracking()
|
cost_tracking()
|
||||||
|
|
||||||
|
## Error Tracking ##
|
||||||
|
error_tracking()
|
||||||
|
|
||||||
db_writer_client = HTTPHandler()
|
db_writer_client = HTTPHandler()
|
||||||
|
|
||||||
proxy_logging_obj._init_litellm_callbacks() # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made
|
proxy_logging_obj._init_litellm_callbacks() # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made
|
||||||
|
@ -3655,6 +3748,17 @@ async def chat_completion(
|
||||||
if data["model"] in litellm.model_alias_map:
|
if data["model"] in litellm.model_alias_map:
|
||||||
data["model"] = litellm.model_alias_map[data["model"]]
|
data["model"] = litellm.model_alias_map[data["model"]]
|
||||||
|
|
||||||
|
## LOGGING OBJECT ## - initialize logging object for logging success/failure events for call
|
||||||
|
data["litellm_call_id"] = str(uuid.uuid4())
|
||||||
|
logging_obj, data = litellm.utils.function_setup(
|
||||||
|
original_function="acompletion",
|
||||||
|
rules_obj=litellm.utils.Rules(),
|
||||||
|
start_time=datetime.now(),
|
||||||
|
**data,
|
||||||
|
)
|
||||||
|
|
||||||
|
data["litellm_logging_obj"] = logging_obj
|
||||||
|
|
||||||
### CALL HOOKS ### - modify incoming data before calling the model
|
### CALL HOOKS ### - modify incoming data before calling the model
|
||||||
data = await proxy_logging_obj.pre_call_hook(
|
data = await proxy_logging_obj.pre_call_hook(
|
||||||
user_api_key_dict=user_api_key_dict, data=data, call_type="completion"
|
user_api_key_dict=user_api_key_dict, data=data, call_type="completion"
|
||||||
|
@ -7421,9 +7525,9 @@ async def model_info_v2(
|
||||||
)
|
)
|
||||||
async def model_metrics(
|
async def model_metrics(
|
||||||
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
|
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
|
||||||
_selected_model_group: Optional[str] = None,
|
_selected_model_group: Optional[str] = "gpt-4-32k",
|
||||||
startTime: Optional[datetime] = datetime.now() - timedelta(days=30),
|
startTime: Optional[datetime] = None,
|
||||||
endTime: Optional[datetime] = datetime.now(),
|
endTime: Optional[datetime] = None,
|
||||||
):
|
):
|
||||||
global prisma_client, llm_router
|
global prisma_client, llm_router
|
||||||
if prisma_client is None:
|
if prisma_client is None:
|
||||||
|
@ -7433,65 +7537,214 @@ async def model_metrics(
|
||||||
param="None",
|
param="None",
|
||||||
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
)
|
)
|
||||||
if _selected_model_group and llm_router is not None:
|
startTime = startTime or datetime.now() - timedelta(days=30)
|
||||||
_model_list = llm_router.get_model_list()
|
endTime = endTime or datetime.now()
|
||||||
_relevant_api_bases = []
|
|
||||||
for model in _model_list:
|
|
||||||
if model["model_name"] == _selected_model_group:
|
|
||||||
_litellm_params = model["litellm_params"]
|
|
||||||
_api_base = _litellm_params.get("api_base", "")
|
|
||||||
_relevant_api_bases.append(_api_base)
|
|
||||||
_relevant_api_bases.append(_api_base + "/openai/")
|
|
||||||
|
|
||||||
sql_query = """
|
sql_query = """
|
||||||
SELECT
|
SELECT
|
||||||
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END AS combined_model_api_base,
|
api_base,
|
||||||
COUNT(*) AS num_requests,
|
model,
|
||||||
AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) AS avg_latency_seconds
|
DATE_TRUNC('day', "startTime")::DATE AS day,
|
||||||
FROM "LiteLLM_SpendLogs"
|
AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) / SUM(total_tokens) AS avg_latency_per_token
|
||||||
WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
|
FROM
|
||||||
AND api_base = ANY($3)
|
"LiteLLM_SpendLogs"
|
||||||
GROUP BY CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END
|
WHERE
|
||||||
ORDER BY num_requests DESC
|
"startTime" >= NOW() - INTERVAL '30 days'
|
||||||
LIMIT 50;
|
AND "model" = $1 AND "cache_hit" != 'True'
|
||||||
|
GROUP BY
|
||||||
|
api_base,
|
||||||
|
model,
|
||||||
|
day
|
||||||
|
HAVING
|
||||||
|
SUM(total_tokens) > 0
|
||||||
|
ORDER BY
|
||||||
|
avg_latency_per_token DESC;
|
||||||
|
"""
|
||||||
|
_all_api_bases = set()
|
||||||
|
db_response = await prisma_client.db.query_raw(
|
||||||
|
sql_query, _selected_model_group, startTime, endTime
|
||||||
|
)
|
||||||
|
_daily_entries: dict = {} # {"Jun 23": {"model1": 0.002, "model2": 0.003}}
|
||||||
|
if db_response is not None:
|
||||||
|
for model_data in db_response:
|
||||||
|
_api_base = model_data["api_base"]
|
||||||
|
_model = model_data["model"]
|
||||||
|
_day = model_data["day"]
|
||||||
|
_avg_latency_per_token = model_data["avg_latency_per_token"]
|
||||||
|
if _day not in _daily_entries:
|
||||||
|
_daily_entries[_day] = {}
|
||||||
|
_combined_model_name = str(_model)
|
||||||
|
if "https://" in _api_base:
|
||||||
|
_combined_model_name = str(_api_base)
|
||||||
|
if "/openai/" in _combined_model_name:
|
||||||
|
_combined_model_name = _combined_model_name.split("/openai/")[0]
|
||||||
|
|
||||||
|
_all_api_bases.add(_combined_model_name)
|
||||||
|
_daily_entries[_day][_combined_model_name] = _avg_latency_per_token
|
||||||
|
|
||||||
|
"""
|
||||||
|
each entry needs to be like this:
|
||||||
|
{
|
||||||
|
date: 'Jun 23',
|
||||||
|
'gpt-4-https://api.openai.com/v1/': 0.002,
|
||||||
|
'gpt-43-https://api.openai.com-12/v1/': 0.002,
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
# convert daily entries to list of dicts
|
||||||
|
|
||||||
|
response: List[dict] = []
|
||||||
|
|
||||||
|
# sort daily entries by date
|
||||||
|
_daily_entries = dict(sorted(_daily_entries.items(), key=lambda item: item[0]))
|
||||||
|
for day in _daily_entries:
|
||||||
|
entry = {"date": str(day)}
|
||||||
|
for model_key, latency in _daily_entries[day].items():
|
||||||
|
entry[model_key] = latency
|
||||||
|
response.append(entry)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"data": response,
|
||||||
|
"all_api_bases": list(_all_api_bases),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/model/metrics/slow_responses",
|
||||||
|
description="View number of hanging requests per model_group",
|
||||||
|
tags=["model management"],
|
||||||
|
include_in_schema=False,
|
||||||
|
dependencies=[Depends(user_api_key_auth)],
|
||||||
|
)
|
||||||
|
async def model_metrics_slow_responses(
|
||||||
|
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
|
||||||
|
_selected_model_group: Optional[str] = "gpt-4-32k",
|
||||||
|
startTime: Optional[datetime] = None,
|
||||||
|
endTime: Optional[datetime] = None,
|
||||||
|
):
|
||||||
|
global prisma_client, llm_router, proxy_logging_obj
|
||||||
|
if prisma_client is None:
|
||||||
|
raise ProxyException(
|
||||||
|
message="Prisma Client is not initialized",
|
||||||
|
type="internal_error",
|
||||||
|
param="None",
|
||||||
|
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
|
)
|
||||||
|
startTime = startTime or datetime.now() - timedelta(days=30)
|
||||||
|
endTime = endTime or datetime.now()
|
||||||
|
|
||||||
|
alerting_threshold = (
|
||||||
|
proxy_logging_obj.slack_alerting_instance.alerting_threshold or 300
|
||||||
|
)
|
||||||
|
alerting_threshold = int(alerting_threshold)
|
||||||
|
|
||||||
|
sql_query = """
|
||||||
|
SELECT
|
||||||
|
api_base,
|
||||||
|
COUNT(*) AS total_count,
|
||||||
|
SUM(CASE
|
||||||
|
WHEN ("endTime" - "startTime") >= (INTERVAL '1 SECOND' * CAST($1 AS INTEGER)) THEN 1
|
||||||
|
ELSE 0
|
||||||
|
END) AS slow_count
|
||||||
|
FROM
|
||||||
|
"LiteLLM_SpendLogs"
|
||||||
|
WHERE
|
||||||
|
"model" = $2
|
||||||
|
AND "cache_hit" != 'True'
|
||||||
|
GROUP BY
|
||||||
|
api_base
|
||||||
|
ORDER BY
|
||||||
|
slow_count DESC;
|
||||||
"""
|
"""
|
||||||
|
|
||||||
db_response = await prisma_client.db.query_raw(
|
db_response = await prisma_client.db.query_raw(
|
||||||
sql_query, startTime, endTime, _relevant_api_bases
|
sql_query, alerting_threshold, _selected_model_group
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
|
|
||||||
sql_query = """
|
if db_response is not None:
|
||||||
SELECT
|
for row in db_response:
|
||||||
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END AS combined_model_api_base,
|
_api_base = row.get("api_base") or ""
|
||||||
COUNT(*) AS num_requests,
|
if "/openai/" in _api_base:
|
||||||
AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) AS avg_latency_seconds
|
_api_base = _api_base.split("/openai/")[0]
|
||||||
FROM
|
row["api_base"] = _api_base
|
||||||
"LiteLLM_SpendLogs"
|
return db_response
|
||||||
WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
|
|
||||||
GROUP BY
|
|
||||||
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END
|
@router.get(
|
||||||
ORDER BY
|
"/model/metrics/exceptions",
|
||||||
num_requests DESC
|
description="View number of failed requests per model on config.yaml",
|
||||||
LIMIT 50;
|
tags=["model management"],
|
||||||
|
include_in_schema=False,
|
||||||
|
dependencies=[Depends(user_api_key_auth)],
|
||||||
|
)
|
||||||
|
async def model_metrics_exceptions(
|
||||||
|
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
|
||||||
|
_selected_model_group: Optional[str] = None,
|
||||||
|
startTime: Optional[datetime] = None,
|
||||||
|
endTime: Optional[datetime] = None,
|
||||||
|
):
|
||||||
|
global prisma_client, llm_router
|
||||||
|
if prisma_client is None:
|
||||||
|
raise ProxyException(
|
||||||
|
message="Prisma Client is not initialized",
|
||||||
|
type="internal_error",
|
||||||
|
param="None",
|
||||||
|
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
|
)
|
||||||
|
|
||||||
|
startTime = startTime or datetime.now() - timedelta(days=30)
|
||||||
|
endTime = endTime or datetime.now()
|
||||||
|
|
||||||
|
"""
|
||||||
|
"""
|
||||||
|
sql_query = """
|
||||||
|
WITH cte AS (
|
||||||
|
SELECT
|
||||||
|
CASE WHEN api_base = '' THEN litellm_model_name ELSE CONCAT(litellm_model_name, '-', api_base) END AS combined_model_api_base,
|
||||||
|
exception_type,
|
||||||
|
COUNT(*) AS num_exceptions
|
||||||
|
FROM "LiteLLM_ErrorLogs"
|
||||||
|
WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
|
||||||
|
GROUP BY combined_model_api_base, exception_type
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
combined_model_api_base,
|
||||||
|
COUNT(*) AS total_exceptions,
|
||||||
|
json_object_agg(exception_type, num_exceptions) AS exception_counts
|
||||||
|
FROM cte
|
||||||
|
GROUP BY combined_model_api_base
|
||||||
|
ORDER BY total_exceptions DESC
|
||||||
|
LIMIT 200;
|
||||||
"""
|
"""
|
||||||
|
|
||||||
db_response = await prisma_client.db.query_raw(sql_query, startTime, endTime)
|
db_response = await prisma_client.db.query_raw(sql_query, startTime, endTime)
|
||||||
response: List[dict] = []
|
response: List[dict] = []
|
||||||
if response is not None:
|
exception_types = set()
|
||||||
|
|
||||||
|
"""
|
||||||
|
Return Data
|
||||||
|
{
|
||||||
|
"combined_model_api_base": "gpt-3.5-turbo-https://api.openai.com/v1/,
|
||||||
|
"total_exceptions": 5,
|
||||||
|
"BadRequestException": 5,
|
||||||
|
"TimeoutException": 2
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
if db_response is not None:
|
||||||
# loop through all models
|
# loop through all models
|
||||||
for model_data in db_response:
|
for model_data in db_response:
|
||||||
model = model_data.get("combined_model_api_base", "")
|
model = model_data.get("combined_model_api_base", "")
|
||||||
num_requests = model_data.get("num_requests", 0)
|
total_exceptions = model_data.get("total_exceptions", 0)
|
||||||
avg_latency_seconds = model_data.get("avg_latency_seconds", 0)
|
exception_counts = model_data.get("exception_counts", {})
|
||||||
response.append(
|
curr_row = {
|
||||||
{
|
|
||||||
"model": model,
|
"model": model,
|
||||||
"num_requests": num_requests,
|
"total_exceptions": total_exceptions,
|
||||||
"avg_latency_seconds": avg_latency_seconds,
|
|
||||||
}
|
}
|
||||||
)
|
curr_row.update(exception_counts)
|
||||||
return response
|
response.append(curr_row)
|
||||||
|
for k, v in exception_counts.items():
|
||||||
|
exception_types.add(k)
|
||||||
|
|
||||||
|
return {"data": response, "exception_types": list(exception_types)}
|
||||||
|
|
||||||
|
|
||||||
@router.get(
|
@router.get(
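
The exceptions endpoint above flattens each aggregated row into `{model, total_exceptions, <exception_type>: count}` entries and collects the distinct exception types for the UI. A standalone sketch of that flattening, with made-up rows, follows.

```python
# Standalone sketch of the flattening done in model_metrics_exceptions above.
# The sample rows stand in for the SQL results; counts are made up.
rows = [
    {
        "combined_model_api_base": "gpt-3.5-turbo-https://api.openai.com/v1/",
        "total_exceptions": 7,
        "exception_counts": {"BadRequestError": 5, "Timeout": 2},
    },
    {
        "combined_model_api_base": "azure/gpt-4-https://my-endpoint.openai.azure.com/",
        "total_exceptions": 1,
        "exception_counts": {"RateLimitError": 1},
    },
]

response = []
exception_types = set()
for row in rows:
    entry = {
        "model": row["combined_model_api_base"],
        "total_exceptions": row["total_exceptions"],
    }
    entry.update(row["exception_counts"])  # one column per exception type
    response.append(entry)
    exception_types.update(row["exception_counts"])

print({"data": response, "exception_types": sorted(exception_types)})
```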
|
||||||
|
@ -8453,6 +8706,13 @@ async def update_config(config_info: ConfigYAML):
|
||||||
_existing_settings = config["general_settings"]
|
_existing_settings = config["general_settings"]
|
||||||
for k, v in updated_general_settings.items():
|
for k, v in updated_general_settings.items():
|
||||||
# overwrite existing settings with updated values
|
# overwrite existing settings with updated values
|
||||||
|
if k == "alert_to_webhook_url":
|
||||||
|
# check if slack is already enabled. if not, enable it
|
||||||
|
if "slack" not in _existing_settings:
|
||||||
|
if "alerting" not in _existing_settings:
|
||||||
|
_existing_settings["alerting"] = ["slack"]
|
||||||
|
elif isinstance(_existing_settings["alerting"], list):
|
||||||
|
_existing_settings["alerting"].append("slack")
|
||||||
_existing_settings[k] = v
|
_existing_settings[k] = v
|
||||||
config["general_settings"] = _existing_settings
|
config["general_settings"] = _existing_settings
|
||||||
|
|
||||||
|
@ -8567,7 +8827,25 @@ async def get_config():
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for _callback in _success_callbacks:
|
for _callback in _success_callbacks:
|
||||||
if _callback == "langfuse":
|
if _callback == "openmeter":
|
||||||
|
env_vars = [
|
||||||
|
"OPENMETER_API_KEY",
|
||||||
|
]
|
||||||
|
env_vars_dict = {}
|
||||||
|
for _var in env_vars:
|
||||||
|
env_variable = environment_variables.get(_var, None)
|
||||||
|
if env_variable is None:
|
||||||
|
env_vars_dict[_var] = None
|
||||||
|
else:
|
||||||
|
# decode + decrypt the value
|
||||||
|
decoded_b64 = base64.b64decode(env_variable)
|
||||||
|
_decrypted_value = decrypt_value(
|
||||||
|
value=decoded_b64, master_key=master_key
|
||||||
|
)
|
||||||
|
env_vars_dict[_var] = _decrypted_value
|
||||||
|
|
||||||
|
_data_to_return.append({"name": _callback, "variables": env_vars_dict})
|
||||||
|
elif _callback == "langfuse":
|
||||||
_langfuse_vars = [
|
_langfuse_vars = [
|
||||||
"LANGFUSE_PUBLIC_KEY",
|
"LANGFUSE_PUBLIC_KEY",
|
||||||
"LANGFUSE_SECRET_KEY",
|
"LANGFUSE_SECRET_KEY",
|
||||||
|
@ -8592,6 +8870,7 @@ async def get_config():
|
||||||
|
|
||||||
# Check if slack alerting is on
|
# Check if slack alerting is on
|
||||||
_alerting = _general_settings.get("alerting", [])
|
_alerting = _general_settings.get("alerting", [])
|
||||||
|
alerting_data = []
|
||||||
if "slack" in _alerting:
|
if "slack" in _alerting:
|
||||||
_slack_vars = [
|
_slack_vars = [
|
||||||
"SLACK_WEBHOOK_URL",
|
"SLACK_WEBHOOK_URL",
|
||||||
|
@ -8600,7 +8879,8 @@ async def get_config():
|
||||||
for _var in _slack_vars:
|
for _var in _slack_vars:
|
||||||
env_variable = environment_variables.get(_var, None)
|
env_variable = environment_variables.get(_var, None)
|
||||||
if env_variable is None:
|
if env_variable is None:
|
||||||
_slack_env_vars[_var] = None
|
_value = os.getenv("SLACK_WEBHOOK_URL", None)
|
||||||
|
_slack_env_vars[_var] = _value
|
||||||
else:
|
else:
|
||||||
# decode + decrypt the value
|
# decode + decrypt the value
|
||||||
decoded_b64 = base64.b64decode(env_variable)
|
decoded_b64 = base64.b64decode(env_variable)
|
||||||
|
@ -8613,19 +8893,23 @@ async def get_config():
|
||||||
_all_alert_types = (
|
_all_alert_types = (
|
||||||
proxy_logging_obj.slack_alerting_instance._all_possible_alert_types()
|
proxy_logging_obj.slack_alerting_instance._all_possible_alert_types()
|
||||||
)
|
)
|
||||||
_data_to_return.append(
|
_alerts_to_webhook = (
|
||||||
|
proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url
|
||||||
|
)
|
||||||
|
alerting_data.append(
|
||||||
{
|
{
|
||||||
"name": "slack",
|
"name": "slack",
|
||||||
"variables": _slack_env_vars,
|
"variables": _slack_env_vars,
|
||||||
"alerting_types": _alerting_types,
|
"active_alerts": _alerting_types,
|
||||||
"all_alert_types": _all_alert_types,
|
"alerts_to_webhook": _alerts_to_webhook,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
_router_settings = llm_router.get_settings()
|
_router_settings = llm_router.get_settings()
|
||||||
return {
|
return {
|
||||||
"status": "success",
|
"status": "success",
|
||||||
"data": _data_to_return,
|
"callbacks": _data_to_return,
|
||||||
|
"alerts": alerting_data,
|
||||||
"router_settings": _router_settings,
|
"router_settings": _router_settings,
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -8701,9 +8985,9 @@ async def test_endpoint(request: Request):
|
||||||
)
|
)
|
||||||
async def health_services_endpoint(
|
async def health_services_endpoint(
|
||||||
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
|
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
|
||||||
service: Literal["slack_budget_alerts", "langfuse", "slack"] = fastapi.Query(
|
service: Literal[
|
||||||
description="Specify the service being hit."
|
"slack_budget_alerts", "langfuse", "slack", "openmeter"
|
||||||
),
|
] = fastapi.Query(description="Specify the service being hit."),
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Hidden endpoint.
|
Hidden endpoint.
|
||||||
|
@ -8717,7 +9001,7 @@ async def health_services_endpoint(
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=400, detail={"error": "Service must be specified."}
|
status_code=400, detail={"error": "Service must be specified."}
|
||||||
)
|
)
|
||||||
if service not in ["slack_budget_alerts", "langfuse", "slack"]:
|
if service not in ["slack_budget_alerts", "langfuse", "slack", "openmeter"]:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=400,
|
status_code=400,
|
||||||
detail={
|
detail={
|
||||||
|
@ -8725,6 +9009,18 @@ async def health_services_endpoint(
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if service == "openmeter":
|
||||||
|
_ = await litellm.acompletion(
|
||||||
|
model="openai/litellm-mock-response-model",
|
||||||
|
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||||
|
user="litellm:/health/services",
|
||||||
|
mock_response="This is a mock response",
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"status": "success",
|
||||||
|
"message": "Mock LLM request made - check openmeter.",
|
||||||
|
}
|
||||||
|
|
||||||
if service == "langfuse":
|
if service == "langfuse":
|
||||||
from litellm.integrations.langfuse import LangFuseLogger
|
from litellm.integrations.langfuse import LangFuseLogger
|
||||||
|
|
||||||
|
@ -8741,9 +9037,53 @@ async def health_services_endpoint(
|
||||||
"message": "Mock LLM request made - check langfuse.",
|
"message": "Mock LLM request made - check langfuse.",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if service == "slack" or service == "slack_budget_alerts":
|
||||||
if "slack" in general_settings.get("alerting", []):
|
if "slack" in general_settings.get("alerting", []):
|
||||||
test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` litellm-ui-test-alert \n`Expected Day of Error`: 28th March \n`Current Spend`: $100.00 \n`Projected Spend at end of month`: $1000.00 \n`Soft Limit`: $700"""
|
# test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` litellm-ui-test-alert \n`Expected Day of Error`: 28th March \n`Current Spend`: $100.00 \n`Projected Spend at end of month`: $1000.00 \n`Soft Limit`: $700"""
|
||||||
await proxy_logging_obj.alerting_handler(message=test_message, level="Low")
|
# check if user has opted into unique_alert_webhooks
|
||||||
|
if (
|
||||||
|
proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url
|
||||||
|
is not None
|
||||||
|
):
|
||||||
|
for (
|
||||||
|
alert_type
|
||||||
|
) in proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url:
|
||||||
|
"""
|
||||||
|
"llm_exceptions",
|
||||||
|
"llm_too_slow",
|
||||||
|
"llm_requests_hanging",
|
||||||
|
"budget_alerts",
|
||||||
|
"db_exceptions",
|
||||||
|
"""
|
||||||
|
# only test alert if it's in active alert types
|
||||||
|
if (
|
||||||
|
proxy_logging_obj.slack_alerting_instance.alert_types
|
||||||
|
is not None
|
||||||
|
and alert_type
|
||||||
|
not in proxy_logging_obj.slack_alerting_instance.alert_types
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
test_message = "default test message"
|
||||||
|
if alert_type == "llm_exceptions":
|
||||||
|
test_message = f"LLM Exception test alert"
|
||||||
|
elif alert_type == "llm_too_slow":
|
||||||
|
test_message = f"LLM Too Slow test alert"
|
||||||
|
elif alert_type == "llm_requests_hanging":
|
||||||
|
test_message = f"LLM Requests Hanging test alert"
|
||||||
|
elif alert_type == "budget_alerts":
|
||||||
|
test_message = f"Budget Alert test alert"
|
||||||
|
elif alert_type == "db_exceptions":
|
||||||
|
test_message = f"DB Exception test alert"
|
||||||
|
|
||||||
|
await proxy_logging_obj.alerting_handler(
|
||||||
|
message=test_message, level="Low", alert_type=alert_type
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
await proxy_logging_obj.alerting_handler(
|
||||||
|
message="This is a test slack alert message",
|
||||||
|
level="Low",
|
||||||
|
alert_type="budget_alerts",
|
||||||
|
)
|
||||||
return {
|
return {
|
||||||
"status": "success",
|
"status": "success",
|
||||||
"message": "Mock Slack Alert sent, verify Slack Alert Received on your channel",
|
"message": "Mock Slack Alert sent, verify Slack Alert Received on your channel",
|
||||||
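The loop above maps each opted-in alert type to a canned test message. A standalone sketch of that selection logic, using illustrative stand-ins for the slack-alerting state on `proxy_logging_obj`:

```python
from typing import Dict, List, Optional, Tuple

TEST_MESSAGES: Dict[str, str] = {
    "llm_exceptions": "LLM Exception test alert",
    "llm_too_slow": "LLM Too Slow test alert",
    "llm_requests_hanging": "LLM Requests Hanging test alert",
    "budget_alerts": "Budget Alert test alert",
    "db_exceptions": "DB Exception test alert",
}


def build_test_alerts(
    alert_to_webhook_url: Optional[Dict[str, str]],
    active_alert_types: Optional[List[str]],
) -> List[Tuple[str, str]]:
    """Return (alert_type, message) pairs that would be sent as test alerts."""
    if not alert_to_webhook_url:
        # no per-alert-type webhooks configured -> single generic test alert
        return [("budget_alerts", "This is a test slack alert message")]
    alerts = []
    for alert_type in alert_to_webhook_url:
        # only test alert types the user has opted into
        if active_alert_types is not None and alert_type not in active_alert_types:
            continue
        alerts.append((alert_type, TEST_MESSAGES.get(alert_type, "default test message")))
    return alerts


if __name__ == "__main__":
    print(build_test_alerts({"budget_alerts": "https://hooks.slack.com/..."}, None))
```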
|
@@ -8752,7 +9092,9 @@ async def health_services_endpoint(
             raise HTTPException(
                 status_code=422,
                 detail={
-                    "error": '"slack" not in proxy config: general_settings. Unable to test this.'
+                    "error": '"{}" not in proxy config: general_settings. Unable to test this.'.format(
+                        service
+                    )
                 },
             )
     except Exception as e:
|
@@ -8761,7 +9103,7 @@ async def health_services_endpoint(
                 message=getattr(e, "detail", f"Authentication Error({str(e)})"),
                 type="auth_error",
                 param=getattr(e, "param", "None"),
-                code=getattr(e, "status_code", status.HTTP_401_UNAUTHORIZED),
+                code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
             )
         elif isinstance(e, ProxyException):
             raise e

@@ -8769,7 +9111,7 @@ async def health_services_endpoint(
             message="Authentication Error, " + str(e),
             type="auth_error",
             param=getattr(e, "param", "None"),
-            code=status.HTTP_401_UNAUTHORIZED,
+            code=status.HTTP_500_INTERNAL_SERVER_ERROR,
         )
|
|
|
@@ -183,6 +183,21 @@ model LiteLLM_SpendLogs {
   end_user            String?
 }

+// View spend, model, api_key per request
+model LiteLLM_ErrorLogs {
+  request_id          String   @id @default(uuid())
+  startTime           DateTime // Assuming start_time is a DateTime field
+  endTime             DateTime // Assuming end_time is a DateTime field
+  api_base            String   @default("")
+  model_group         String   @default("") // public model_name / model_group
+  litellm_model_name  String   @default("") // model passed to litellm
+  model_id            String   @default("") // ID of model in ProxyModelTable
+  request_kwargs      Json     @default("{}")
+  exception_type      String   @default("")
+  exception_string    String   @default("")
+  status_code         String   @default("")
+}
+
 // Beta - allow team members to request access to a model
 model LiteLLM_UserNotifications {
   request_id          String @id
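A hedged sketch of writing a row into the new `LiteLLM_ErrorLogs` table with Prisma Client Python; the lowercased accessor name and the example field values are assumptions based on the model definition above.

```python
import asyncio
from datetime import datetime, timezone

from prisma import Prisma  # requires `prisma generate` against this schema


async def log_error_example() -> None:
    db = Prisma()
    await db.connect()
    try:
        # accessor name assumed: Prisma Client Python lowercases model names
        await db.litellm_errorlogs.create(
            data={
                "startTime": datetime.now(timezone.utc),
                "endTime": datetime.now(timezone.utc),
                "api_base": "https://api.openai.com/v1",
                "model_group": "gpt-3.5-turbo",         # public model_name
                "litellm_model_name": "gpt-3.5-turbo",  # model passed to litellm
                "exception_type": "RateLimitError",
                "exception_string": "429: rate limit exceeded",
                "status_code": "429",
                # request_id and request_kwargs fall back to their @default values
            }
        )
    finally:
        await db.disconnect()


if __name__ == "__main__":
    asyncio.run(log_error_example())
```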
|
|
|
@@ -1,6 +1,6 @@
 from typing import Optional, List, Any, Literal, Union
 import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx, time
-import litellm, backoff
+import litellm, backoff, traceback
 from litellm.proxy._types import (
     UserAPIKeyAuth,
     DynamoDBArgs,
|
@@ -199,6 +199,33 @@ class ProxyLogging:
             print_verbose(f"final data being sent to {call_type} call: {data}")
             return data
         except Exception as e:
+            if "litellm_logging_obj" in data:
+                logging_obj: litellm.utils.Logging = data["litellm_logging_obj"]
+
+                ## ASYNC FAILURE HANDLER ##
+                error_message = ""
+                if isinstance(e, HTTPException):
+                    if isinstance(e.detail, str):
+                        error_message = e.detail
+                    elif isinstance(e.detail, dict):
+                        error_message = json.dumps(e.detail)
+                    else:
+                        error_message = str(e)
+                else:
+                    error_message = str(e)
+                error_raised = Exception(f"{error_message}")
+                await logging_obj.async_failure_handler(
+                    exception=error_raised,
+                    traceback_exception=traceback.format_exc(),
+                )
+
+                ## SYNC FAILURE HANDLER ##
+                try:
+                    logging_obj.failure_handler(
+                        error_raised, traceback.format_exc()
+                    )  # DO NOT MAKE THREADED - router retry fallback relies on this!
+                except Exception as error_val:
+                    pass
             raise e

     async def during_call_hook(
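The failure handlers above flatten the raised exception into a plain string before logging it. A standalone sketch of that extraction step:

```python
import json

from fastapi import HTTPException


def extract_error_message(e: Exception) -> str:
    """Flatten an exception into the string passed to the failure handlers."""
    if isinstance(e, HTTPException):
        if isinstance(e.detail, str):
            return e.detail
        elif isinstance(e.detail, dict):
            return json.dumps(e.detail)
        return str(e)
    return str(e)


if __name__ == "__main__":
    print(extract_error_message(HTTPException(status_code=400, detail={"error": "Budget exceeded"})))
    print(extract_error_message(ValueError("something else went wrong")))
```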
|
@@ -256,7 +283,16 @@ class ProxyLogging:
         )

     async def alerting_handler(
-        self, message: str, level: Literal["Low", "Medium", "High"]
+        self,
+        message: str,
+        level: Literal["Low", "Medium", "High"],
+        alert_type: Literal[
+            "llm_exceptions",
+            "llm_too_slow",
+            "llm_requests_hanging",
+            "budget_alerts",
+            "db_exceptions",
+        ],
     ):
         """
         Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
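A minimal illustrative stub of the new alerting contract, showing the `alert_type` values the handler now expects; it is not the ProxyLogging implementation itself.

```python
import asyncio
from typing import Literal

AlertType = Literal[
    "llm_exceptions",
    "llm_too_slow",
    "llm_requests_hanging",
    "budget_alerts",
    "db_exceptions",
]


async def alerting_handler(
    message: str,
    level: Literal["Low", "Medium", "High"],
    alert_type: AlertType,
) -> None:
    # a real implementation would route to slack/sentry; here we just print
    print(f"[{level}] ({alert_type}) {message}")


if __name__ == "__main__":
    asyncio.run(
        alerting_handler(
            message="DB read/write call failed: connection reset",
            level="High",
            alert_type="db_exceptions",
        )
    )
```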
|
@ -289,7 +325,7 @@ class ProxyLogging:
|
||||||
for client in self.alerting:
|
for client in self.alerting:
|
||||||
if client == "slack":
|
if client == "slack":
|
||||||
await self.slack_alerting_instance.send_alert(
|
await self.slack_alerting_instance.send_alert(
|
||||||
message=message, level=level
|
message=message, level=level, alert_type=alert_type
|
||||||
)
|
)
|
||||||
elif client == "sentry":
|
elif client == "sentry":
|
||||||
if litellm.utils.sentry_sdk_instance is not None:
|
if litellm.utils.sentry_sdk_instance is not None:
|
||||||
|
@ -323,6 +359,7 @@ class ProxyLogging:
|
||||||
self.alerting_handler(
|
self.alerting_handler(
|
||||||
message=f"DB read/write call failed: {error_message}",
|
message=f"DB read/write call failed: {error_message}",
|
||||||
level="High",
|
level="High",
|
||||||
|
alert_type="db_exceptions",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -354,7 +391,9 @@ class ProxyLogging:
|
||||||
return
|
return
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self.alerting_handler(
|
self.alerting_handler(
|
||||||
message=f"LLM API call failed: {str(original_exception)}", level="High"
|
message=f"LLM API call failed: {str(original_exception)}",
|
||||||
|
level="High",
|
||||||
|
alert_type="llm_exceptions",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -1738,7 +1777,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
|
||||||
usage = response_obj["usage"]
|
usage = response_obj["usage"]
|
||||||
if type(usage) == litellm.Usage:
|
if type(usage) == litellm.Usage:
|
||||||
usage = dict(usage)
|
usage = dict(usage)
|
||||||
id = response_obj.get("id", str(uuid.uuid4()))
|
id = response_obj.get("id", kwargs.get("litellm_call_id"))
|
||||||
api_key = metadata.get("user_api_key", "")
|
api_key = metadata.get("user_api_key", "")
|
||||||
if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
|
if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
|
||||||
# hash the api_key
|
# hash the api_key
|
||||||
|
@ -2010,6 +2049,11 @@ async def update_spend(
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
### UPDATE KEY TABLE ###
|
### UPDATE KEY TABLE ###
|
||||||
|
verbose_proxy_logger.debug(
|
||||||
|
"KEY Spend transactions: {}".format(
|
||||||
|
len(prisma_client.key_list_transactons.keys())
|
||||||
|
)
|
||||||
|
)
|
||||||
if len(prisma_client.key_list_transactons.keys()) > 0:
|
if len(prisma_client.key_list_transactons.keys()) > 0:
|
||||||
for i in range(n_retry_times + 1):
|
for i in range(n_retry_times + 1):
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
|
@ -50,7 +50,6 @@ class Router:
|
||||||
model_names: List = []
|
model_names: List = []
|
||||||
cache_responses: Optional[bool] = False
|
cache_responses: Optional[bool] = False
|
||||||
default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour
|
default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour
|
||||||
num_retries: int = 0
|
|
||||||
tenacity = None
|
tenacity = None
|
||||||
leastbusy_logger: Optional[LeastBusyLoggingHandler] = None
|
leastbusy_logger: Optional[LeastBusyLoggingHandler] = None
|
||||||
lowesttpm_logger: Optional[LowestTPMLoggingHandler] = None
|
lowesttpm_logger: Optional[LowestTPMLoggingHandler] = None
|
||||||
|
@ -70,9 +69,11 @@ class Router:
|
||||||
] = None, # if you want to cache across model groups
|
] = None, # if you want to cache across model groups
|
||||||
client_ttl: int = 3600, # ttl for cached clients - will re-initialize after this time in seconds
|
client_ttl: int = 3600, # ttl for cached clients - will re-initialize after this time in seconds
|
||||||
## RELIABILITY ##
|
## RELIABILITY ##
|
||||||
num_retries: int = 0,
|
num_retries: Optional[int] = None,
|
||||||
timeout: Optional[float] = None,
|
timeout: Optional[float] = None,
|
||||||
default_litellm_params={}, # default params for Router.chat.completion.create
|
default_litellm_params: Optional[
|
||||||
|
dict
|
||||||
|
] = None, # default params for Router.chat.completion.create
|
||||||
default_max_parallel_requests: Optional[int] = None,
|
default_max_parallel_requests: Optional[int] = None,
|
||||||
set_verbose: bool = False,
|
set_verbose: bool = False,
|
||||||
debug_level: Literal["DEBUG", "INFO"] = "INFO",
|
debug_level: Literal["DEBUG", "INFO"] = "INFO",
|
||||||
|
@ -158,6 +159,7 @@ class Router:
|
||||||
router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}])
|
router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}])
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if semaphore:
|
if semaphore:
|
||||||
self.semaphore = semaphore
|
self.semaphore = semaphore
|
||||||
self.set_verbose = set_verbose
|
self.set_verbose = set_verbose
|
||||||
|
@@ -229,7 +231,14 @@ class Router:
         self.failed_calls = (
             InMemoryCache()
         )  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
-        self.num_retries = num_retries or litellm.num_retries or 0
+
+        if num_retries is not None:
+            self.num_retries = num_retries
+        elif litellm.num_retries is not None:
+            self.num_retries = litellm.num_retries
+        else:
+            self.num_retries = openai.DEFAULT_MAX_RETRIES
+
         self.timeout = timeout or litellm.request_timeout

         self.retry_after = retry_after
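A sketch of the retry-count resolution order introduced above: an explicit `num_retries` wins, then the module-level `litellm.num_retries`, then the OpenAI SDK default. Values here are illustrative.

```python
from typing import Optional

import openai
import litellm


def resolve_num_retries(num_retries: Optional[int]) -> int:
    if num_retries is not None:
        return num_retries
    elif litellm.num_retries is not None:
        return litellm.num_retries
    return openai.DEFAULT_MAX_RETRIES


if __name__ == "__main__":
    litellm.num_retries = None
    print(resolve_num_retries(None))  # -> openai SDK default
    litellm.num_retries = 5
    print(resolve_num_retries(None))  # -> 5
    print(resolve_num_retries(0))     # -> 0 (explicit value wins)
```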
|
@ -255,6 +264,7 @@ class Router:
|
||||||
) # dict to store aliases for router, ex. {"gpt-4": "gpt-3.5-turbo"}, all requests with gpt-4 -> get routed to gpt-3.5-turbo group
|
) # dict to store aliases for router, ex. {"gpt-4": "gpt-3.5-turbo"}, all requests with gpt-4 -> get routed to gpt-3.5-turbo group
|
||||||
|
|
||||||
# make Router.chat.completions.create compatible for openai.chat.completions.create
|
# make Router.chat.completions.create compatible for openai.chat.completions.create
|
||||||
|
default_litellm_params = default_litellm_params or {}
|
||||||
self.chat = litellm.Chat(params=default_litellm_params, router_obj=self)
|
self.chat = litellm.Chat(params=default_litellm_params, router_obj=self)
|
||||||
|
|
||||||
# default litellm args
|
# default litellm args
|
||||||
|
@@ -280,6 +290,21 @@ class Router:
         }
         """
         ### ROUTING SETUP ###
+        self.routing_strategy_init(
+            routing_strategy=routing_strategy,
+            routing_strategy_args=routing_strategy_args,
+        )
+        ## COOLDOWNS ##
+        if isinstance(litellm.failure_callback, list):
+            litellm.failure_callback.append(self.deployment_callback_on_failure)
+        else:
+            litellm.failure_callback = [self.deployment_callback_on_failure]
+        print(  # noqa
+            f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
+        )  # noqa
+        self.routing_strategy_args = routing_strategy_args
+
+    def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict):
         if routing_strategy == "least-busy":
             self.leastbusy_logger = LeastBusyLoggingHandler(
                 router_cache=self.cache, model_list=self.model_list
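With strategy setup factored into `routing_strategy_init`, a Router can be constructed with a strategy and its args in one place. A hedged sketch, with placeholder deployments and keys:

```python
from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo",  # model group name
        "litellm_params": {
            "model": "azure/chatgpt-v-2",  # hypothetical deployment
            "api_key": "my-azure-key",
            "api_base": "https://my-endpoint.openai.azure.com",
        },
    },
]

router = Router(
    model_list=model_list,
    routing_strategy="latency-based-routing",
    # lowest_latency_buffer lets the router pick randomly among deployments
    # within 50% of the lowest observed latency (per the RoutingArgs change
    # later in this diff).
    routing_strategy_args={"ttl": 3600, "lowest_latency_buffer": 0.5},
    num_retries=2,
)
```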
|
@ -311,15 +336,6 @@ class Router:
|
||||||
)
|
)
|
||||||
if isinstance(litellm.callbacks, list):
|
if isinstance(litellm.callbacks, list):
|
||||||
litellm.callbacks.append(self.lowestlatency_logger) # type: ignore
|
litellm.callbacks.append(self.lowestlatency_logger) # type: ignore
|
||||||
## COOLDOWNS ##
|
|
||||||
if isinstance(litellm.failure_callback, list):
|
|
||||||
litellm.failure_callback.append(self.deployment_callback_on_failure)
|
|
||||||
else:
|
|
||||||
litellm.failure_callback = [self.deployment_callback_on_failure]
|
|
||||||
verbose_router_logger.info(
|
|
||||||
f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
|
|
||||||
)
|
|
||||||
self.routing_strategy_args = routing_strategy_args
|
|
||||||
|
|
||||||
def print_deployment(self, deployment: dict):
|
def print_deployment(self, deployment: dict):
|
||||||
"""
|
"""
|
||||||
|
@ -428,6 +444,7 @@ class Router:
|
||||||
kwargs["messages"] = messages
|
kwargs["messages"] = messages
|
||||||
kwargs["original_function"] = self._acompletion
|
kwargs["original_function"] = self._acompletion
|
||||||
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
|
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
|
||||||
|
|
||||||
timeout = kwargs.get("request_timeout", self.timeout)
|
timeout = kwargs.get("request_timeout", self.timeout)
|
||||||
kwargs.setdefault("metadata", {}).update({"model_group": model})
|
kwargs.setdefault("metadata", {}).update({"model_group": model})
|
||||||
|
|
||||||
|
@ -469,6 +486,7 @@ class Router:
|
||||||
)
|
)
|
||||||
kwargs["model_info"] = deployment.get("model_info", {})
|
kwargs["model_info"] = deployment.get("model_info", {})
|
||||||
data = deployment["litellm_params"].copy()
|
data = deployment["litellm_params"].copy()
|
||||||
|
|
||||||
model_name = data["model"]
|
model_name = data["model"]
|
||||||
for k, v in self.default_litellm_params.items():
|
for k, v in self.default_litellm_params.items():
|
||||||
if (
|
if (
|
||||||
|
@ -1415,10 +1433,12 @@ class Router:
|
||||||
context_window_fallbacks = kwargs.pop(
|
context_window_fallbacks = kwargs.pop(
|
||||||
"context_window_fallbacks", self.context_window_fallbacks
|
"context_window_fallbacks", self.context_window_fallbacks
|
||||||
)
|
)
|
||||||
verbose_router_logger.debug(
|
|
||||||
f"async function w/ retries: original_function - {original_function}"
|
|
||||||
)
|
|
||||||
num_retries = kwargs.pop("num_retries")
|
num_retries = kwargs.pop("num_retries")
|
||||||
|
|
||||||
|
verbose_router_logger.debug(
|
||||||
|
f"async function w/ retries: original_function - {original_function}, num_retries - {num_retries}"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
# if the function call is successful, no exception will be raised and we'll break out of the loop
|
# if the function call is successful, no exception will be raised and we'll break out of the loop
|
||||||
response = await original_function(*args, **kwargs)
|
response = await original_function(*args, **kwargs)
|
||||||
|
@ -1436,37 +1456,47 @@ class Router:
|
||||||
raise original_exception
|
raise original_exception
|
||||||
### RETRY
|
### RETRY
|
||||||
#### check if it should retry + back-off if required
|
#### check if it should retry + back-off if required
|
||||||
if "No models available" in str(e):
|
# if "No models available" in str(
|
||||||
timeout = litellm._calculate_retry_after(
|
# e
|
||||||
remaining_retries=num_retries,
|
# ) or RouterErrors.no_deployments_available.value in str(e):
|
||||||
max_retries=num_retries,
|
# timeout = litellm._calculate_retry_after(
|
||||||
min_timeout=self.retry_after,
|
# remaining_retries=num_retries,
|
||||||
)
|
# max_retries=num_retries,
|
||||||
await asyncio.sleep(timeout)
|
# min_timeout=self.retry_after,
|
||||||
elif RouterErrors.user_defined_ratelimit_error.value in str(e):
|
# )
|
||||||
raise e # don't wait to retry if deployment hits user-defined rate-limit
|
# await asyncio.sleep(timeout)
|
||||||
elif hasattr(original_exception, "status_code") and litellm._should_retry(
|
# elif RouterErrors.user_defined_ratelimit_error.value in str(e):
|
||||||
status_code=original_exception.status_code
|
# raise e # don't wait to retry if deployment hits user-defined rate-limit
|
||||||
):
|
|
||||||
if hasattr(original_exception, "response") and hasattr(
|
|
||||||
original_exception.response, "headers"
|
|
||||||
):
|
|
||||||
timeout = litellm._calculate_retry_after(
|
|
||||||
remaining_retries=num_retries,
|
|
||||||
max_retries=num_retries,
|
|
||||||
response_headers=original_exception.response.headers,
|
|
||||||
min_timeout=self.retry_after,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
timeout = litellm._calculate_retry_after(
|
|
||||||
remaining_retries=num_retries,
|
|
||||||
max_retries=num_retries,
|
|
||||||
min_timeout=self.retry_after,
|
|
||||||
)
|
|
||||||
await asyncio.sleep(timeout)
|
|
||||||
else:
|
|
||||||
raise original_exception
|
|
||||||
|
|
||||||
|
# elif hasattr(original_exception, "status_code") and litellm._should_retry(
|
||||||
|
# status_code=original_exception.status_code
|
||||||
|
# ):
|
||||||
|
# if hasattr(original_exception, "response") and hasattr(
|
||||||
|
# original_exception.response, "headers"
|
||||||
|
# ):
|
||||||
|
# timeout = litellm._calculate_retry_after(
|
||||||
|
# remaining_retries=num_retries,
|
||||||
|
# max_retries=num_retries,
|
||||||
|
# response_headers=original_exception.response.headers,
|
||||||
|
# min_timeout=self.retry_after,
|
||||||
|
# )
|
||||||
|
# else:
|
||||||
|
# timeout = litellm._calculate_retry_after(
|
||||||
|
# remaining_retries=num_retries,
|
||||||
|
# max_retries=num_retries,
|
||||||
|
# min_timeout=self.retry_after,
|
||||||
|
# )
|
||||||
|
# await asyncio.sleep(timeout)
|
||||||
|
# else:
|
||||||
|
# raise original_exception
|
||||||
|
|
||||||
|
### RETRY
|
||||||
|
_timeout = self._router_should_retry(
|
||||||
|
e=original_exception,
|
||||||
|
remaining_retries=num_retries,
|
||||||
|
num_retries=num_retries,
|
||||||
|
)
|
||||||
|
await asyncio.sleep(_timeout)
|
||||||
## LOGGING
|
## LOGGING
|
||||||
if num_retries > 0:
|
if num_retries > 0:
|
||||||
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
||||||
|
@ -1488,34 +1518,12 @@ class Router:
|
||||||
## LOGGING
|
## LOGGING
|
||||||
kwargs = self.log_retry(kwargs=kwargs, e=e)
|
kwargs = self.log_retry(kwargs=kwargs, e=e)
|
||||||
remaining_retries = num_retries - current_attempt
|
remaining_retries = num_retries - current_attempt
|
||||||
if "No models available" in str(e):
|
_timeout = self._router_should_retry(
|
||||||
timeout = litellm._calculate_retry_after(
|
e=original_exception,
|
||||||
remaining_retries=remaining_retries,
|
remaining_retries=remaining_retries,
|
||||||
max_retries=num_retries,
|
num_retries=num_retries,
|
||||||
min_timeout=self.retry_after,
|
|
||||||
)
|
)
|
||||||
await asyncio.sleep(timeout)
|
await asyncio.sleep(_timeout)
|
||||||
elif (
|
|
||||||
hasattr(e, "status_code")
|
|
||||||
and hasattr(e, "response")
|
|
||||||
and litellm._should_retry(status_code=e.status_code)
|
|
||||||
):
|
|
||||||
if hasattr(e.response, "headers"):
|
|
||||||
timeout = litellm._calculate_retry_after(
|
|
||||||
remaining_retries=remaining_retries,
|
|
||||||
max_retries=num_retries,
|
|
||||||
response_headers=e.response.headers,
|
|
||||||
min_timeout=self.retry_after,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
timeout = litellm._calculate_retry_after(
|
|
||||||
remaining_retries=remaining_retries,
|
|
||||||
max_retries=num_retries,
|
|
||||||
min_timeout=self.retry_after,
|
|
||||||
)
|
|
||||||
await asyncio.sleep(timeout)
|
|
||||||
else:
|
|
||||||
raise e
|
|
||||||
raise original_exception
|
raise original_exception
|
||||||
|
|
||||||
def function_with_fallbacks(self, *args, **kwargs):
|
def function_with_fallbacks(self, *args, **kwargs):
|
||||||
|
@@ -1606,6 +1614,27 @@ class Router:
                     raise e
         raise original_exception

+    def _router_should_retry(
+        self, e: Exception, remaining_retries: int, num_retries: int
+    ) -> Union[int, float]:
+        """
+        Calculate back-off, then retry
+        """
+        if hasattr(e, "response") and hasattr(e.response, "headers"):
+            timeout = litellm._calculate_retry_after(
+                remaining_retries=remaining_retries,
+                max_retries=num_retries,
+                response_headers=e.response.headers,
+                min_timeout=self.retry_after,
+            )
+        else:
+            timeout = litellm._calculate_retry_after(
+                remaining_retries=remaining_retries,
+                max_retries=num_retries,
+                min_timeout=self.retry_after,
+            )
+        return timeout
+
     def function_with_retries(self, *args, **kwargs):
         """
         Try calling the model 3 times. Shuffle between available deployments.
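`_router_should_retry` defers to `litellm._calculate_retry_after`, preferring a server-provided Retry-After header and otherwise falling back to a minimum wait. An illustrative, standalone approximation of that back-off pattern (not litellm's implementation):

```python
import random
from typing import Mapping, Optional


def calculate_retry_after(
    remaining_retries: int,
    max_retries: int,
    response_headers: Optional[Mapping[str, str]] = None,
    min_timeout: float = 0.0,
) -> float:
    # honor a Retry-After header when the failed response carried one
    if response_headers is not None and "retry-after" in response_headers:
        try:
            return max(float(response_headers["retry-after"]), min_timeout)
        except ValueError:
            pass
    # otherwise: simple exponential back-off with jitter on attempts already used
    nb_retries = max_retries - remaining_retries
    sleep_seconds = min(0.5 * (2**nb_retries), 8.0)
    return max(sleep_seconds * (1 + random.random() * 0.25), min_timeout)


if __name__ == "__main__":
    print(calculate_retry_after(2, 3, {"retry-after": "1.5"}, min_timeout=0.5))
    print(calculate_retry_after(1, 3, None, min_timeout=0.5))
```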
|
@ -1619,15 +1648,13 @@ class Router:
|
||||||
context_window_fallbacks = kwargs.pop(
|
context_window_fallbacks = kwargs.pop(
|
||||||
"context_window_fallbacks", self.context_window_fallbacks
|
"context_window_fallbacks", self.context_window_fallbacks
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# if the function call is successful, no exception will be raised and we'll break out of the loop
|
# if the function call is successful, no exception will be raised and we'll break out of the loop
|
||||||
response = original_function(*args, **kwargs)
|
response = original_function(*args, **kwargs)
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
original_exception = e
|
original_exception = e
|
||||||
verbose_router_logger.debug(
|
|
||||||
f"num retries in function with retries: {num_retries}"
|
|
||||||
)
|
|
||||||
### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR
|
### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR
|
||||||
if (
|
if (
|
||||||
isinstance(original_exception, litellm.ContextWindowExceededError)
|
isinstance(original_exception, litellm.ContextWindowExceededError)
|
||||||
|
@ -1641,6 +1668,12 @@ class Router:
|
||||||
if num_retries > 0:
|
if num_retries > 0:
|
||||||
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
||||||
### RETRY
|
### RETRY
|
||||||
|
_timeout = self._router_should_retry(
|
||||||
|
e=original_exception,
|
||||||
|
remaining_retries=num_retries,
|
||||||
|
num_retries=num_retries,
|
||||||
|
)
|
||||||
|
time.sleep(_timeout)
|
||||||
for current_attempt in range(num_retries):
|
for current_attempt in range(num_retries):
|
||||||
verbose_router_logger.debug(
|
verbose_router_logger.debug(
|
||||||
f"retrying request. Current attempt - {current_attempt}; retries left: {num_retries}"
|
f"retrying request. Current attempt - {current_attempt}; retries left: {num_retries}"
|
||||||
|
@ -1654,34 +1687,12 @@ class Router:
|
||||||
## LOGGING
|
## LOGGING
|
||||||
kwargs = self.log_retry(kwargs=kwargs, e=e)
|
kwargs = self.log_retry(kwargs=kwargs, e=e)
|
||||||
remaining_retries = num_retries - current_attempt
|
remaining_retries = num_retries - current_attempt
|
||||||
if "No models available" in str(e):
|
_timeout = self._router_should_retry(
|
||||||
timeout = litellm._calculate_retry_after(
|
e=e,
|
||||||
remaining_retries=remaining_retries,
|
remaining_retries=remaining_retries,
|
||||||
max_retries=num_retries,
|
num_retries=num_retries,
|
||||||
min_timeout=self.retry_after,
|
|
||||||
)
|
)
|
||||||
time.sleep(timeout)
|
time.sleep(_timeout)
|
||||||
elif (
|
|
||||||
hasattr(e, "status_code")
|
|
||||||
and hasattr(e, "response")
|
|
||||||
and litellm._should_retry(status_code=e.status_code)
|
|
||||||
):
|
|
||||||
if hasattr(e.response, "headers"):
|
|
||||||
timeout = litellm._calculate_retry_after(
|
|
||||||
remaining_retries=remaining_retries,
|
|
||||||
max_retries=num_retries,
|
|
||||||
response_headers=e.response.headers,
|
|
||||||
min_timeout=self.retry_after,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
timeout = litellm._calculate_retry_after(
|
|
||||||
remaining_retries=remaining_retries,
|
|
||||||
max_retries=num_retries,
|
|
||||||
min_timeout=self.retry_after,
|
|
||||||
)
|
|
||||||
time.sleep(timeout)
|
|
||||||
else:
|
|
||||||
raise e
|
|
||||||
raise original_exception
|
raise original_exception
|
||||||
|
|
||||||
### HELPER FUNCTIONS
|
### HELPER FUNCTIONS
|
||||||
|
@ -1715,10 +1726,11 @@ class Router:
|
||||||
) # i.e. azure
|
) # i.e. azure
|
||||||
metadata = kwargs.get("litellm_params", {}).get("metadata", None)
|
metadata = kwargs.get("litellm_params", {}).get("metadata", None)
|
||||||
_model_info = kwargs.get("litellm_params", {}).get("model_info", {})
|
_model_info = kwargs.get("litellm_params", {}).get("model_info", {})
|
||||||
|
|
||||||
if isinstance(_model_info, dict):
|
if isinstance(_model_info, dict):
|
||||||
deployment_id = _model_info.get("id", None)
|
deployment_id = _model_info.get("id", None)
|
||||||
self._set_cooldown_deployments(
|
self._set_cooldown_deployments(
|
||||||
deployment_id
|
exception_status=exception_status, deployment=deployment_id
|
||||||
) # setting deployment_id in cooldown deployments
|
) # setting deployment_id in cooldown deployments
|
||||||
if custom_llm_provider:
|
if custom_llm_provider:
|
||||||
model_name = f"{custom_llm_provider}/{model_name}"
|
model_name = f"{custom_llm_provider}/{model_name}"
|
||||||
|
@ -1778,9 +1790,15 @@ class Router:
|
||||||
key=rpm_key, value=request_count, local_only=True
|
key=rpm_key, value=request_count, local_only=True
|
||||||
) # don't change existing ttl
|
) # don't change existing ttl
|
||||||
|
|
||||||
def _set_cooldown_deployments(self, deployment: Optional[str] = None):
|
def _set_cooldown_deployments(
|
||||||
|
self, exception_status: Union[str, int], deployment: Optional[str] = None
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
|
Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
the exception is not one that should be immediately retried (e.g. 401)
|
||||||
"""
|
"""
|
||||||
if deployment is None:
|
if deployment is None:
|
||||||
return
|
return
|
||||||
|
@@ -1797,7 +1815,20 @@ class Router:
             f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
         )
         cooldown_time = self.cooldown_time or 1
-        if updated_fails > self.allowed_fails:
+
+        if isinstance(exception_status, str):
+            try:
+                exception_status = int(exception_status)
+            except Exception as e:
+                verbose_router_logger.debug(
+                    "Unable to cast exception status to int {}. Defaulting to status=500.".format(
+                        exception_status
+                    )
+                )
+                exception_status = 500
+        _should_retry = litellm._should_retry(status_code=exception_status)
+
+        if updated_fails > self.allowed_fails or _should_retry == False:
             # get the current cooldown list for that minute
             cooldown_key = f"{current_minute}:cooldown_models"  # group cooldown models by minute to reduce number of redis calls
             cached_value = self.cache.get_cache(key=cooldown_key)
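A sketch of the cooldown decision above: cool a deployment down when it exceeds `allowed_fails`, or immediately when the status is not worth retrying. The retryable status set here is an illustrative approximation of `litellm._should_retry`.

```python
from typing import Union

# illustrative approximation of which statuses a retry could plausibly fix
RETRYABLE_STATUS_CODES = {408, 409, 429, 500, 502, 503, 504}


def should_cooldown(
    exception_status: Union[str, int], updated_fails: int, allowed_fails: int
) -> bool:
    try:
        status = int(exception_status)
    except (TypeError, ValueError):
        status = 500  # default when the status cannot be parsed
    retryable = status in RETRYABLE_STATUS_CODES
    return updated_fails > allowed_fails or not retryable


if __name__ == "__main__":
    print(should_cooldown("401", updated_fails=1, allowed_fails=3))  # True - auth errors cool down immediately
    print(should_cooldown(429, updated_fails=1, allowed_fails=3))    # False - rate limits get retried first
```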
|
@ -1929,6 +1960,7 @@ class Router:
|
||||||
)
|
)
|
||||||
default_api_base = api_base
|
default_api_base = api_base
|
||||||
default_api_key = api_key
|
default_api_key = api_key
|
||||||
|
|
||||||
if (
|
if (
|
||||||
model_name in litellm.open_ai_chat_completion_models
|
model_name in litellm.open_ai_chat_completion_models
|
||||||
or custom_llm_provider in litellm.openai_compatible_providers
|
or custom_llm_provider in litellm.openai_compatible_providers
|
||||||
|
@ -1940,8 +1972,10 @@ class Router:
|
||||||
or "ft:gpt-3.5-turbo" in model_name
|
or "ft:gpt-3.5-turbo" in model_name
|
||||||
or model_name in litellm.open_ai_embedding_models
|
or model_name in litellm.open_ai_embedding_models
|
||||||
):
|
):
|
||||||
|
is_azure_ai_studio_model: bool = False
|
||||||
if custom_llm_provider == "azure":
|
if custom_llm_provider == "azure":
|
||||||
if litellm.utils._is_non_openai_azure_model(model_name):
|
if litellm.utils._is_non_openai_azure_model(model_name):
|
||||||
|
is_azure_ai_studio_model = True
|
||||||
custom_llm_provider = "openai"
|
custom_llm_provider = "openai"
|
||||||
# remove azure prefx from model_name
|
# remove azure prefx from model_name
|
||||||
model_name = model_name.replace("azure/", "")
|
model_name = model_name.replace("azure/", "")
|
||||||
|
@@ -1964,6 +1998,25 @@ class Router:
                     api_base = litellm.get_secret(api_base_env_name)
                     litellm_params["api_base"] = api_base

+                ## AZURE AI STUDIO MISTRAL CHECK ##
+                """
+                Make sure api base ends in /v1/
+
+                if not, add it - https://github.com/BerriAI/litellm/issues/2279
+                """
+                if (
+                    is_azure_ai_studio_model == True
+                    and api_base is not None
+                    and not api_base.endswith("/v1/")
+                ):
+                    # check if it ends with a trailing slash
+                    if api_base.endswith("/"):
+                        api_base += "v1/"
+                    elif api_base.endswith("/v1"):
+                        api_base += "/"
+                    else:
+                        api_base += "/v1/"
+
                 api_version = litellm_params.get("api_version")
                 if api_version and api_version.startswith("os.environ/"):
                     api_version_env_name = api_version.replace("os.environ/", "")
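A standalone sketch of the `/v1/` normalization applied above for Azure AI Studio deployments; the hostnames are hypothetical.

```python
def ensure_v1_suffix(api_base: str) -> str:
    """Make sure the api_base ends in /v1/ (see issue #2279)."""
    if api_base.endswith("/v1/"):
        return api_base
    if api_base.endswith("/"):
        return api_base + "v1/"
    if api_base.endswith("/v1"):
        return api_base + "/"
    return api_base + "/v1/"


if __name__ == "__main__":
    for base in [
        "https://my-mistral.westus2.inference.ai.azure.com",
        "https://my-mistral.westus2.inference.ai.azure.com/",
        "https://my-mistral.westus2.inference.ai.azure.com/v1",
        "https://my-mistral.westus2.inference.ai.azure.com/v1/",
    ]:
        print(ensure_v1_suffix(base))
```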
|
@ -1986,7 +2039,9 @@ class Router:
|
||||||
stream_timeout = litellm.get_secret(stream_timeout_env_name)
|
stream_timeout = litellm.get_secret(stream_timeout_env_name)
|
||||||
litellm_params["stream_timeout"] = stream_timeout
|
litellm_params["stream_timeout"] = stream_timeout
|
||||||
|
|
||||||
max_retries = litellm_params.pop("max_retries", 2)
|
max_retries = litellm_params.pop(
|
||||||
|
"max_retries", 0
|
||||||
|
) # router handles retry logic
|
||||||
if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
|
if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
|
||||||
max_retries_env_name = max_retries.replace("os.environ/", "")
|
max_retries_env_name = max_retries.replace("os.environ/", "")
|
||||||
max_retries = litellm.get_secret(max_retries_env_name)
|
max_retries = litellm.get_secret(max_retries_env_name)
|
||||||
|
@ -2052,10 +2107,12 @@ class Router:
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
http_client=httpx.AsyncClient(
|
http_client=httpx.AsyncClient(
|
||||||
transport=AsyncCustomHTTPTransport(),
|
transport=AsyncCustomHTTPTransport(
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
|
),
|
||||||
mounts=async_proxy_mounts,
|
mounts=async_proxy_mounts,
|
||||||
), # type: ignore
|
), # type: ignore
|
||||||
)
|
)
|
||||||
|
@ -2074,10 +2131,12 @@ class Router:
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
http_client=httpx.Client(
|
http_client=httpx.Client(
|
||||||
transport=CustomHTTPTransport(),
|
transport=CustomHTTPTransport(
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
|
),
|
||||||
mounts=sync_proxy_mounts,
|
mounts=sync_proxy_mounts,
|
||||||
), # type: ignore
|
), # type: ignore
|
||||||
)
|
)
|
||||||
|
@ -2096,10 +2155,12 @@ class Router:
|
||||||
timeout=stream_timeout,
|
timeout=stream_timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
http_client=httpx.AsyncClient(
|
http_client=httpx.AsyncClient(
|
||||||
transport=AsyncCustomHTTPTransport(),
|
transport=AsyncCustomHTTPTransport(
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
|
),
|
||||||
mounts=async_proxy_mounts,
|
mounts=async_proxy_mounts,
|
||||||
), # type: ignore
|
), # type: ignore
|
||||||
)
|
)
|
||||||
|
@ -2118,10 +2179,12 @@ class Router:
|
||||||
timeout=stream_timeout,
|
timeout=stream_timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
http_client=httpx.Client(
|
http_client=httpx.Client(
|
||||||
transport=CustomHTTPTransport(),
|
transport=CustomHTTPTransport(
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
|
),
|
||||||
mounts=sync_proxy_mounts,
|
mounts=sync_proxy_mounts,
|
||||||
), # type: ignore
|
), # type: ignore
|
||||||
)
|
)
|
||||||
|
@ -2158,10 +2221,12 @@ class Router:
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
http_client=httpx.AsyncClient(
|
http_client=httpx.AsyncClient(
|
||||||
transport=AsyncCustomHTTPTransport(),
|
transport=AsyncCustomHTTPTransport(
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
|
),
|
||||||
mounts=async_proxy_mounts,
|
mounts=async_proxy_mounts,
|
||||||
), # type: ignore
|
), # type: ignore
|
||||||
)
|
)
|
||||||
|
@ -2178,10 +2243,12 @@ class Router:
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
http_client=httpx.Client(
|
http_client=httpx.Client(
|
||||||
transport=CustomHTTPTransport(),
|
transport=CustomHTTPTransport(
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
),
|
||||||
mounts=sync_proxy_mounts,
|
mounts=sync_proxy_mounts,
|
||||||
), # type: ignore
|
), # type: ignore
|
||||||
)
|
)
|
||||||
|
@ -2199,10 +2266,12 @@ class Router:
|
||||||
timeout=stream_timeout,
|
timeout=stream_timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
http_client=httpx.AsyncClient(
|
http_client=httpx.AsyncClient(
|
||||||
transport=AsyncCustomHTTPTransport(),
|
transport=AsyncCustomHTTPTransport(
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
|
),
|
||||||
mounts=async_proxy_mounts,
|
mounts=async_proxy_mounts,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
@ -2219,10 +2288,12 @@ class Router:
|
||||||
timeout=stream_timeout,
|
timeout=stream_timeout,
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
http_client=httpx.Client(
|
http_client=httpx.Client(
|
||||||
transport=CustomHTTPTransport(),
|
transport=CustomHTTPTransport(
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
|
),
|
||||||
mounts=sync_proxy_mounts,
|
mounts=sync_proxy_mounts,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
@ -2249,10 +2320,12 @@ class Router:
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
organization=organization,
|
organization=organization,
|
||||||
http_client=httpx.AsyncClient(
|
http_client=httpx.AsyncClient(
|
||||||
transport=AsyncCustomHTTPTransport(),
|
transport=AsyncCustomHTTPTransport(
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
|
),
|
||||||
mounts=async_proxy_mounts,
|
mounts=async_proxy_mounts,
|
||||||
), # type: ignore
|
), # type: ignore
|
||||||
)
|
)
|
||||||
|
@ -2271,10 +2344,12 @@ class Router:
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
organization=organization,
|
organization=organization,
|
||||||
http_client=httpx.Client(
|
http_client=httpx.Client(
|
||||||
transport=CustomHTTPTransport(),
|
transport=CustomHTTPTransport(
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
|
),
|
||||||
mounts=sync_proxy_mounts,
|
mounts=sync_proxy_mounts,
|
||||||
), # type: ignore
|
), # type: ignore
|
||||||
)
|
)
|
||||||
|
@ -2294,10 +2369,12 @@ class Router:
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
organization=organization,
|
organization=organization,
|
||||||
http_client=httpx.AsyncClient(
|
http_client=httpx.AsyncClient(
|
||||||
transport=AsyncCustomHTTPTransport(),
|
transport=AsyncCustomHTTPTransport(
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
|
),
|
||||||
mounts=async_proxy_mounts,
|
mounts=async_proxy_mounts,
|
||||||
), # type: ignore
|
), # type: ignore
|
||||||
)
|
)
|
||||||
|
@ -2317,10 +2394,12 @@ class Router:
|
||||||
max_retries=max_retries,
|
max_retries=max_retries,
|
||||||
organization=organization,
|
organization=organization,
|
||||||
http_client=httpx.Client(
|
http_client=httpx.Client(
|
||||||
transport=CustomHTTPTransport(),
|
transport=CustomHTTPTransport(
|
||||||
limits=httpx.Limits(
|
limits=httpx.Limits(
|
||||||
max_connections=1000, max_keepalive_connections=100
|
max_connections=1000, max_keepalive_connections=100
|
||||||
),
|
),
|
||||||
|
verify=litellm.ssl_verify,
|
||||||
|
),
|
||||||
mounts=sync_proxy_mounts,
|
mounts=sync_proxy_mounts,
|
||||||
), # type: ignore
|
), # type: ignore
|
||||||
)
|
)
|
||||||
|
@ -2550,6 +2629,11 @@ class Router:
|
||||||
for var in vars_to_include:
|
for var in vars_to_include:
|
||||||
if var in _all_vars:
|
if var in _all_vars:
|
||||||
_settings_to_return[var] = _all_vars[var]
|
_settings_to_return[var] = _all_vars[var]
|
||||||
|
if (
|
||||||
|
var == "routing_strategy_args"
|
||||||
|
and self.routing_strategy == "latency-based-routing"
|
||||||
|
):
|
||||||
|
_settings_to_return[var] = self.lowestlatency_logger.routing_args.json()
|
||||||
return _settings_to_return
|
return _settings_to_return
|
||||||
|
|
||||||
def update_settings(self, **kwargs):
|
def update_settings(self, **kwargs):
|
||||||
|
@ -2581,6 +2665,13 @@ class Router:
|
||||||
_casted_value = int(kwargs[var])
|
_casted_value = int(kwargs[var])
|
||||||
setattr(self, var, _casted_value)
|
setattr(self, var, _casted_value)
|
||||||
else:
|
else:
|
||||||
|
if var == "routing_strategy":
|
||||||
|
self.routing_strategy_init(
|
||||||
|
routing_strategy=kwargs[var],
|
||||||
|
routing_strategy_args=kwargs.get(
|
||||||
|
"routing_strategy_args", {}
|
||||||
|
),
|
||||||
|
)
|
||||||
setattr(self, var, kwargs[var])
|
setattr(self, var, kwargs[var])
|
||||||
else:
|
else:
|
||||||
verbose_router_logger.debug("Setting {} is not allowed".format(var))
|
verbose_router_logger.debug("Setting {} is not allowed".format(var))
|
||||||
|
@ -2717,7 +2808,10 @@ class Router:
|
||||||
self.cache.get_cache(key=model_id, local_only=True) or 0
|
self.cache.get_cache(key=model_id, local_only=True) or 0
|
||||||
)
|
)
|
||||||
### get usage based cache ###
|
### get usage based cache ###
|
||||||
if isinstance(model_group_cache, dict):
|
if (
|
||||||
|
isinstance(model_group_cache, dict)
|
||||||
|
and self.routing_strategy != "usage-based-routing-v2"
|
||||||
|
):
|
||||||
model_group_cache[model_id] = model_group_cache.get(model_id, 0)
|
model_group_cache[model_id] = model_group_cache.get(model_id, 0)
|
||||||
|
|
||||||
current_request = max(
|
current_request = max(
|
||||||
|
@ -2745,7 +2839,7 @@ class Router:
|
||||||
|
|
||||||
if _rate_limit_error == True: # allow generic fallback logic to take place
|
if _rate_limit_error == True: # allow generic fallback logic to take place
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"No deployments available for selected model, passed model={model}"
|
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
|
||||||
)
|
)
|
||||||
elif _context_window_error == True:
|
elif _context_window_error == True:
|
||||||
raise litellm.ContextWindowExceededError(
|
raise litellm.ContextWindowExceededError(
|
||||||
|
@ -2883,6 +2977,11 @@ class Router:
|
||||||
model=model, healthy_deployments=healthy_deployments, messages=messages
|
model=model, healthy_deployments=healthy_deployments, messages=messages
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if len(healthy_deployments) == 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
|
||||||
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
self.routing_strategy == "usage-based-routing-v2"
|
self.routing_strategy == "usage-based-routing-v2"
|
||||||
and self.lowesttpm_logger_v2 is not None
|
and self.lowesttpm_logger_v2 is not None
|
||||||
|
@ -2938,7 +3037,7 @@ class Router:
|
||||||
f"get_available_deployment for model: {model}, No deployment available"
|
f"get_available_deployment for model: {model}, No deployment available"
|
||||||
)
|
)
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"No deployments available for selected model, passed model={model}"
|
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
|
||||||
)
|
)
|
||||||
verbose_router_logger.info(
|
verbose_router_logger.info(
|
||||||
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
|
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
|
||||||
|
@ -3068,7 +3167,7 @@ class Router:
|
||||||
f"get_available_deployment for model: {model}, No deployment available"
|
f"get_available_deployment for model: {model}, No deployment available"
|
||||||
)
|
)
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"No deployments available for selected model, passed model={model}"
|
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
|
||||||
)
|
)
|
||||||
verbose_router_logger.info(
|
verbose_router_logger.info(
|
||||||
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
|
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
|
||||||
|
|
|
@ -4,6 +4,7 @@ from pydantic import BaseModel, Extra, Field, root_validator
|
||||||
import dotenv, os, requests, random
|
import dotenv, os, requests, random
|
||||||
from typing import Optional, Union, List, Dict
|
from typing import Optional, Union, List, Dict
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
import random
|
||||||
|
|
||||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||||
import traceback
|
import traceback
|
||||||
|
@ -29,6 +30,7 @@ class LiteLLMBase(BaseModel):
|
||||||
|
|
||||||
class RoutingArgs(LiteLLMBase):
|
class RoutingArgs(LiteLLMBase):
|
||||||
ttl: int = 1 * 60 * 60 # 1 hour
|
ttl: int = 1 * 60 * 60 # 1 hour
|
||||||
|
lowest_latency_buffer: float = 0
|
||||||
|
|
||||||
|
|
||||||
class LowestLatencyLoggingHandler(CustomLogger):
|
class LowestLatencyLoggingHandler(CustomLogger):
|
||||||
|
@ -312,6 +314,14 @@ class LowestLatencyLoggingHandler(CustomLogger):
|
||||||
except:
|
except:
|
||||||
input_tokens = 0
|
input_tokens = 0
|
||||||
|
|
||||||
|
# randomly sample from all_deployments, incase all deployments have latency=0.0
|
||||||
|
_items = all_deployments.items()
|
||||||
|
|
||||||
|
all_deployments = random.sample(list(_items), len(_items))
|
||||||
|
all_deployments = dict(all_deployments)
|
||||||
|
### GET AVAILABLE DEPLOYMENTS ### filter out any deployments > tpm/rpm limits
|
||||||
|
|
||||||
|
potential_deployments = []
|
||||||
for item, item_map in all_deployments.items():
|
for item, item_map in all_deployments.items():
|
||||||
## get the item from model list
|
## get the item from model list
|
||||||
_deployment = None
|
_deployment = None
|
||||||
|
@ -345,23 +355,48 @@ class LowestLatencyLoggingHandler(CustomLogger):
|
||||||
if isinstance(_call_latency, float):
|
if isinstance(_call_latency, float):
|
||||||
total += _call_latency
|
total += _call_latency
|
||||||
item_latency = total / len(item_latency)
|
item_latency = total / len(item_latency)
|
||||||
if item_latency == 0:
|
|
||||||
deployment = _deployment
|
# -------------- #
|
||||||
break
|
# Debugging Logic
|
||||||
elif (
|
# -------------- #
|
||||||
|
# We use _latency_per_deployment to log to langfuse, slack - this is not used to make a decision on routing
|
||||||
|
# this helps a user to debug why the router picked a specfic deployment #
|
||||||
|
_deployment_api_base = _deployment.get("litellm_params", {}).get(
|
||||||
|
"api_base", ""
|
||||||
|
)
|
||||||
|
if _deployment_api_base is not None:
|
||||||
|
_latency_per_deployment[_deployment_api_base] = item_latency
|
||||||
|
# -------------- #
|
||||||
|
# End of Debugging Logic
|
||||||
|
# -------------- #
|
||||||
|
|
||||||
|
if (
|
||||||
item_tpm + input_tokens > _deployment_tpm
|
item_tpm + input_tokens > _deployment_tpm
|
||||||
or item_rpm + 1 > _deployment_rpm
|
or item_rpm + 1 > _deployment_rpm
|
||||||
): # if user passed in tpm / rpm in the model_list
|
): # if user passed in tpm / rpm in the model_list
|
||||||
continue
|
continue
|
||||||
elif item_latency < lowest_latency:
|
else:
|
||||||
lowest_latency = item_latency
|
potential_deployments.append((_deployment, item_latency))
|
||||||
deployment = _deployment
|
|
||||||
|
if len(potential_deployments) == 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Sort potential deployments by latency
|
||||||
|
sorted_deployments = sorted(potential_deployments, key=lambda x: x[1])
|
||||||
|
|
||||||
|
# Find lowest latency deployment
|
||||||
|
lowest_latency = sorted_deployments[0][1]
|
||||||
|
|
||||||
|
# Find deployments within buffer of lowest latency
|
||||||
|
buffer = self.routing_args.lowest_latency_buffer * lowest_latency
|
||||||
|
valid_deployments = [
|
||||||
|
x for x in sorted_deployments if x[1] <= lowest_latency + buffer
|
||||||
|
]
|
||||||
|
|
||||||
|
# Pick a random deployment from valid deployments
|
||||||
|
random_valid_deployment = random.choice(valid_deployments)
|
||||||
|
deployment = random_valid_deployment[0]
|
||||||
|
|
||||||
# _latency_per_deployment is used for debuggig
|
|
||||||
_deployment_api_base = _deployment.get("litellm_params", {}).get(
|
|
||||||
"api_base", ""
|
|
||||||
)
|
|
||||||
_latency_per_deployment[_deployment_api_base] = item_latency
|
|
||||||
if request_kwargs is not None and "metadata" in request_kwargs:
|
if request_kwargs is not None and "metadata" in request_kwargs:
|
||||||
request_kwargs["metadata"][
|
request_kwargs["metadata"][
|
||||||
"_latency_per_deployment"
|
"_latency_per_deployment"
|
||||||
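A standalone sketch of the buffer-based selection introduced above: sort deployments by observed latency, keep every deployment within `lowest_latency_buffer * lowest_latency` of the fastest one, then pick randomly among them. Deployment names and latencies are illustrative.

```python
import random
from typing import List, Tuple


def pick_deployment(
    potential_deployments: List[Tuple[str, float]], lowest_latency_buffer: float = 0.5
) -> str:
    if len(potential_deployments) == 0:
        raise ValueError("no deployments under their tpm/rpm limits")
    # sort by latency, then keep everything within the buffer of the fastest
    sorted_deployments = sorted(potential_deployments, key=lambda x: x[1])
    lowest_latency = sorted_deployments[0][1]
    buffer = lowest_latency_buffer * lowest_latency
    valid = [d for d in sorted_deployments if d[1] <= lowest_latency + buffer]
    return random.choice(valid)[0]


if __name__ == "__main__":
    deployments = [("azure-eastus", 0.42), ("azure-westus", 0.55), ("openai", 1.3)]
    # with a 50% buffer, both azure deployments are eligible; openai is not
    print(pick_deployment(deployments, lowest_latency_buffer=0.5))
```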
|
|
|
@ -206,7 +206,7 @@ class LowestTPMLoggingHandler(CustomLogger):
|
||||||
if item_tpm + input_tokens > _deployment_tpm:
|
if item_tpm + input_tokens > _deployment_tpm:
|
||||||
continue
|
continue
|
||||||
elif (rpm_dict is not None and item in rpm_dict) and (
|
elif (rpm_dict is not None and item in rpm_dict) and (
|
||||||
rpm_dict[item] + 1 > _deployment_rpm
|
rpm_dict[item] + 1 >= _deployment_rpm
|
||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
elif item_tpm < lowest_tpm:
|
elif item_tpm < lowest_tpm:
|
||||||
|
|
|
@ -333,7 +333,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
|
||||||
tpm_dict[tpm_key] = 0
|
tpm_dict[tpm_key] = 0
|
||||||
|
|
||||||
all_deployments = tpm_dict
|
all_deployments = tpm_dict
|
||||||
deployment = None
|
potential_deployments = [] # if multiple deployments have the same low value
|
||||||
for item, item_tpm in all_deployments.items():
|
for item, item_tpm in all_deployments.items():
|
||||||
## get the item from model list
|
## get the item from model list
|
||||||
_deployment = None
|
_deployment = None
|
||||||
|
@ -343,6 +343,8 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
|
||||||
_deployment = m
|
_deployment = m
|
||||||
if _deployment is None:
|
if _deployment is None:
|
||||||
continue # skip to next one
|
continue # skip to next one
|
||||||
|
elif item_tpm is None:
|
||||||
|
continue # skip if unhealthy deployment
|
||||||
|
|
||||||
_deployment_tpm = None
|
_deployment_tpm = None
|
||||||
if _deployment_tpm is None:
|
if _deployment_tpm is None:
|
||||||
|
@@ -366,14 +368,20 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
             if item_tpm + input_tokens > _deployment_tpm:
                 continue
             elif (rpm_dict is not None and item in rpm_dict) and (
-                rpm_dict[item] + 1 > _deployment_rpm
+                rpm_dict[item] + 1 >= _deployment_rpm
             ):
                 continue
+            elif item_tpm == lowest_tpm:
+                potential_deployments.append(_deployment)
             elif item_tpm < lowest_tpm:
                 lowest_tpm = item_tpm
-                deployment = _deployment
+                potential_deployments = [_deployment]
         print_verbose("returning picked lowest tpm/rpm deployment.")
-        return deployment
+
+        if len(potential_deployments) > 0:
+            return random.choice(potential_deployments)
+        else:
+            return None

     async def async_get_available_deployments(
         self,
||||||
|
@ -394,6 +402,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
|
||||||
|
|
||||||
dt = get_utc_datetime()
|
dt = get_utc_datetime()
|
||||||
current_minute = dt.strftime("%H-%M")
|
current_minute = dt.strftime("%H-%M")
|
||||||
|
|
||||||
tpm_keys = []
|
tpm_keys = []
|
||||||
rpm_keys = []
|
rpm_keys = []
|
||||||
for m in healthy_deployments:
|
for m in healthy_deployments:
|
||||||
|
@ -416,7 +425,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
|
||||||
tpm_values = combined_tpm_rpm_values[: len(tpm_keys)]
|
tpm_values = combined_tpm_rpm_values[: len(tpm_keys)]
|
||||||
rpm_values = combined_tpm_rpm_values[len(tpm_keys) :]
|
rpm_values = combined_tpm_rpm_values[len(tpm_keys) :]
|
||||||
|
|
||||||
return self._common_checks_available_deployment(
|
deployment = self._common_checks_available_deployment(
|
||||||
model_group=model_group,
|
model_group=model_group,
|
||||||
healthy_deployments=healthy_deployments,
|
healthy_deployments=healthy_deployments,
|
||||||
tpm_keys=tpm_keys,
|
tpm_keys=tpm_keys,
|
||||||
|
@ -427,6 +436,61 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
|
||||||
input=input,
|
input=input,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
assert deployment is not None
|
||||||
|
return deployment
|
||||||
|
except Exception as e:
|
||||||
|
### GET THE DICT OF TPM / RPM + LIMITS PER DEPLOYMENT ###
|
||||||
|
deployment_dict = {}
|
||||||
|
for index, _deployment in enumerate(healthy_deployments):
|
||||||
|
if isinstance(_deployment, dict):
|
||||||
|
id = _deployment.get("model_info", {}).get("id")
|
||||||
|
### GET DEPLOYMENT TPM LIMIT ###
|
||||||
|
_deployment_tpm = None
|
||||||
|
if _deployment_tpm is None:
|
||||||
|
_deployment_tpm = _deployment.get("tpm", None)
|
||||||
|
if _deployment_tpm is None:
|
||||||
|
_deployment_tpm = _deployment.get("litellm_params", {}).get(
|
||||||
|
"tpm", None
|
||||||
|
)
|
||||||
|
if _deployment_tpm is None:
|
||||||
|
_deployment_tpm = _deployment.get("model_info", {}).get(
|
||||||
|
"tpm", None
|
||||||
|
)
|
||||||
|
if _deployment_tpm is None:
|
||||||
|
_deployment_tpm = float("inf")
|
||||||
|
|
||||||
|
### GET CURRENT TPM ###
|
||||||
|
current_tpm = tpm_values[index]
|
||||||
|
|
||||||
|
### GET DEPLOYMENT TPM LIMIT ###
|
||||||
|
_deployment_rpm = None
|
||||||
|
if _deployment_rpm is None:
|
||||||
|
_deployment_rpm = _deployment.get("rpm", None)
|
||||||
|
if _deployment_rpm is None:
|
||||||
|
_deployment_rpm = _deployment.get("litellm_params", {}).get(
|
||||||
|
"rpm", None
|
||||||
|
)
|
||||||
|
if _deployment_rpm is None:
|
||||||
|
_deployment_rpm = _deployment.get("model_info", {}).get(
|
||||||
|
"rpm", None
|
||||||
|
)
|
||||||
|
if _deployment_rpm is None:
|
||||||
|
_deployment_rpm = float("inf")
|
||||||
|
|
||||||
|
### GET CURRENT RPM ###
|
||||||
|
current_rpm = rpm_values[index]
|
||||||
|
|
||||||
|
deployment_dict[id] = {
|
||||||
|
"current_tpm": current_tpm,
|
||||||
|
"tpm_limit": _deployment_tpm,
|
||||||
|
"current_rpm": current_rpm,
|
||||||
|
"rpm_limit": _deployment_rpm,
|
||||||
|
}
|
||||||
|
raise ValueError(
|
||||||
|
f"{RouterErrors.no_deployments_available.value}. Passed model={model_group}. Deployments={deployment_dict}"
|
||||||
|
)
|
||||||
|
|
||||||
def get_available_deployments(
|
def get_available_deployments(
|
||||||
self,
|
self,
|
||||||
model_group: str,
|
model_group: str,
|
||||||
|
@ -464,7 +528,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
|
||||||
keys=rpm_keys
|
keys=rpm_keys
|
||||||
) # [1, 2, None, ..]
|
) # [1, 2, None, ..]
|
||||||
|
|
||||||
return self._common_checks_available_deployment(
|
deployment = self._common_checks_available_deployment(
|
||||||
model_group=model_group,
|
model_group=model_group,
|
||||||
healthy_deployments=healthy_deployments,
|
healthy_deployments=healthy_deployments,
|
||||||
tpm_keys=tpm_keys,
|
tpm_keys=tpm_keys,
|
||||||
|
@ -474,3 +538,58 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
|
||||||
messages=messages,
|
messages=messages,
|
||||||
input=input,
|
input=input,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
assert deployment is not None
|
||||||
|
return deployment
|
||||||
|
except Exception as e:
|
||||||
|
### GET THE DICT OF TPM / RPM + LIMITS PER DEPLOYMENT ###
|
||||||
|
deployment_dict = {}
|
||||||
|
for index, _deployment in enumerate(healthy_deployments):
|
||||||
|
if isinstance(_deployment, dict):
|
||||||
|
id = _deployment.get("model_info", {}).get("id")
|
||||||
|
### GET DEPLOYMENT TPM LIMIT ###
|
||||||
|
_deployment_tpm = None
|
||||||
|
if _deployment_tpm is None:
|
||||||
|
_deployment_tpm = _deployment.get("tpm", None)
|
||||||
|
if _deployment_tpm is None:
|
||||||
|
_deployment_tpm = _deployment.get("litellm_params", {}).get(
|
||||||
|
"tpm", None
|
||||||
|
)
|
||||||
|
if _deployment_tpm is None:
|
||||||
|
_deployment_tpm = _deployment.get("model_info", {}).get(
|
||||||
|
"tpm", None
|
||||||
|
)
|
||||||
|
if _deployment_tpm is None:
|
||||||
|
_deployment_tpm = float("inf")
|
||||||
|
|
||||||
|
### GET CURRENT TPM ###
|
||||||
|
current_tpm = tpm_values[index]
|
||||||
|
|
||||||
|
### GET DEPLOYMENT TPM LIMIT ###
|
||||||
|
_deployment_rpm = None
|
||||||
|
if _deployment_rpm is None:
|
||||||
|
_deployment_rpm = _deployment.get("rpm", None)
|
||||||
|
if _deployment_rpm is None:
|
||||||
|
_deployment_rpm = _deployment.get("litellm_params", {}).get(
|
||||||
|
"rpm", None
|
||||||
|
)
|
||||||
|
if _deployment_rpm is None:
|
||||||
|
_deployment_rpm = _deployment.get("model_info", {}).get(
|
||||||
|
"rpm", None
|
||||||
|
)
|
||||||
|
if _deployment_rpm is None:
|
||||||
|
_deployment_rpm = float("inf")
|
||||||
|
|
||||||
|
### GET CURRENT RPM ###
|
||||||
|
current_rpm = rpm_values[index]
|
||||||
|
|
||||||
|
deployment_dict[id] = {
|
||||||
|
"current_tpm": current_tpm,
|
||||||
|
"tpm_limit": _deployment_tpm,
|
||||||
|
"current_rpm": current_rpm,
|
||||||
|
"rpm_limit": _deployment_rpm,
|
||||||
|
}
|
||||||
|
raise ValueError(
|
||||||
|
f"{RouterErrors.no_deployments_available.value}. Passed model={model_group}. Deployments={deployment_dict}"
|
||||||
|
)
|
||||||
|
|
|
@ -19,6 +19,7 @@ def setup_and_teardown():
|
||||||
0, os.path.abspath("../..")
|
0, os.path.abspath("../..")
|
||||||
) # Adds the project directory to the system path
|
) # Adds the project directory to the system path
|
||||||
import litellm
|
import litellm
|
||||||
|
from litellm import Router
|
||||||
|
|
||||||
importlib.reload(litellm)
|
importlib.reload(litellm)
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
|
@ -119,7 +119,9 @@ def test_multiple_deployments_parallel():
|
||||||
|
|
||||||
|
|
||||||
# test_multiple_deployments_parallel()
|
# test_multiple_deployments_parallel()
|
||||||
def test_cooldown_same_model_name():
|
@pytest.mark.parametrize("sync_mode", [True, False])
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_cooldown_same_model_name(sync_mode):
|
||||||
# users could have the same model with different api_base
|
# users could have the same model with different api_base
|
||||||
# example
|
# example
|
||||||
# azure/chatgpt, api_base: 1234
|
# azure/chatgpt, api_base: 1234
|
||||||
|
@ -161,6 +163,7 @@ def test_cooldown_same_model_name():
|
||||||
num_retries=3,
|
num_retries=3,
|
||||||
) # type: ignore
|
) # type: ignore
|
||||||
|
|
||||||
|
if sync_mode:
|
||||||
response = router.completion(
|
response = router.completion(
|
||||||
model="gpt-3.5-turbo",
|
model="gpt-3.5-turbo",
|
||||||
messages=[{"role": "user", "content": "hello this request will pass"}],
|
messages=[{"role": "user", "content": "hello this request will pass"}],
|
||||||
|
@ -176,6 +179,23 @@ def test_cooldown_same_model_name():
|
||||||
model_ids[0] != model_ids[1]
|
model_ids[0] != model_ids[1]
|
||||||
) # ensure both models have a uuid added, and they have different names
|
) # ensure both models have a uuid added, and they have different names
|
||||||
|
|
||||||
|
print("\ngot response\n", response)
|
||||||
|
else:
|
||||||
|
response = await router.acompletion(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages=[{"role": "user", "content": "hello this request will pass"}],
|
||||||
|
)
|
||||||
|
print(router.model_list)
|
||||||
|
model_ids = []
|
||||||
|
for model in router.model_list:
|
||||||
|
model_ids.append(model["model_info"]["id"])
|
||||||
|
print("\n litellm model ids ", model_ids)
|
||||||
|
|
||||||
|
# example litellm_model_names ['azure/chatgpt-v-2-ModelID-64321', 'azure/chatgpt-v-2-ModelID-63960']
|
||||||
|
assert (
|
||||||
|
model_ids[0] != model_ids[1]
|
||||||
|
) # ensure both models have a uuid added, and they have different names
|
||||||
|
|
||||||
print("\ngot response\n", response)
|
print("\ngot response\n", response)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pytest.fail(f"Got unexpected exception on router! - {e}")
|
pytest.fail(f"Got unexpected exception on router! - {e}")
|
||||||
|
|
|
@ -161,40 +161,56 @@ async def make_async_calls():
|
||||||
return total_time
|
return total_time
|
||||||
|
|
||||||
|
|
||||||
# def test_langfuse_logging_async_text_completion():
|
@pytest.mark.asyncio
|
||||||
# try:
|
@pytest.mark.parametrize("stream", [False, True])
|
||||||
# pre_langfuse_setup()
|
async def test_langfuse_logging_without_request_response(stream):
|
||||||
# litellm.set_verbose = False
|
try:
|
||||||
# litellm.success_callback = ["langfuse"]
|
import uuid
|
||||||
|
|
||||||
# async def _test_langfuse():
|
_unique_trace_name = f"litellm-test-{str(uuid.uuid4())}"
|
||||||
# response = await litellm.atext_completion(
|
litellm.set_verbose = True
|
||||||
# model="gpt-3.5-turbo-instruct",
|
litellm.turn_off_message_logging = True
|
||||||
# prompt="this is a test",
|
litellm.success_callback = ["langfuse"]
|
||||||
# max_tokens=5,
|
response = await litellm.acompletion(
|
||||||
# temperature=0.7,
|
model="gpt-3.5-turbo",
|
||||||
# timeout=5,
|
mock_response="It's simple to use and easy to get started",
|
||||||
# user="test_user",
|
messages=[{"role": "user", "content": "Hi 👋 - i'm claude"}],
|
||||||
# stream=True
|
max_tokens=10,
|
||||||
# )
|
temperature=0.2,
|
||||||
# async for chunk in response:
|
stream=stream,
|
||||||
# print()
|
metadata={"trace_id": _unique_trace_name},
|
||||||
# print(chunk)
|
)
|
||||||
# await asyncio.sleep(1)
|
print(response)
|
||||||
# return response
|
if stream:
|
||||||
|
async for chunk in response:
|
||||||
|
print(chunk)
|
||||||
|
|
||||||
# response = asyncio.run(_test_langfuse())
|
await asyncio.sleep(3)
|
||||||
# print(f"response: {response}")
|
|
||||||
|
|
||||||
# # # check langfuse.log to see if there was a failed response
|
import langfuse
|
||||||
# search_logs("langfuse.log")
|
|
||||||
# except litellm.Timeout as e:
|
|
||||||
# pass
|
|
||||||
# except Exception as e:
|
|
||||||
# pytest.fail(f"An exception occurred - {e}")
|
|
||||||
|
|
||||||
|
langfuse_client = langfuse.Langfuse(
|
||||||
|
public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
|
||||||
|
secret_key=os.environ["LANGFUSE_SECRET_KEY"],
|
||||||
|
)
|
||||||
|
|
||||||
# test_langfuse_logging_async_text_completion()
|
# get trace with _unique_trace_name
|
||||||
|
trace = langfuse_client.get_generations(trace_id=_unique_trace_name)
|
||||||
|
|
||||||
|
print("trace_from_langfuse", trace)
|
||||||
|
|
||||||
|
_trace_data = trace.data
|
||||||
|
|
||||||
|
assert _trace_data[0].input == {"messages": "redacted-by-litellm"}
|
||||||
|
assert _trace_data[0].output == {
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "redacted-by-litellm",
|
||||||
|
"function_call": None,
|
||||||
|
"tool_calls": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"An exception occurred - {e}")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="beta test - checking langfuse output")
|
@pytest.mark.skip(reason="beta test - checking langfuse output")
|
||||||
|
@ -334,6 +350,220 @@ def test_langfuse_logging_function_calling():
|
||||||
# test_langfuse_logging_function_calling()
|
# test_langfuse_logging_function_calling()
|
||||||
|
|
||||||
|
|
||||||
|
def test_langfuse_existing_trace_id():
|
||||||
|
"""
|
||||||
|
When existing trace id is passed, don't set trace params -> prevents overwriting the trace
|
||||||
|
|
||||||
|
Pass 1 logging object with a trace
|
||||||
|
|
||||||
|
Pass 2nd logging object with the trace id
|
||||||
|
|
||||||
|
Assert no changes to the trace
|
||||||
|
"""
|
||||||
|
# Test - if the logs were sent to the correct team on langfuse
|
||||||
|
import litellm, datetime
|
||||||
|
from litellm.integrations.langfuse import LangFuseLogger
|
||||||
|
|
||||||
|
langfuse_Logger = LangFuseLogger(
|
||||||
|
langfuse_public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
|
||||||
|
langfuse_secret=os.getenv("LANGFUSE_PROJECT2_SECRET"),
|
||||||
|
)
|
||||||
|
litellm.success_callback = ["langfuse"]
|
||||||
|
|
||||||
|
# langfuse_args = {'kwargs': { 'start_time': 'end_time': datetime.datetime(2024, 5, 1, 7, 31, 29, 903685), 'user_id': None, 'print_verbose': <function print_verbose at 0x109d1f420>, 'level': 'DEFAULT', 'status_message': None}
|
||||||
|
response_obj = litellm.ModelResponse(
|
||||||
|
id="chatcmpl-9K5HUAbVRqFrMZKXL0WoC295xhguY",
|
||||||
|
choices=[
|
||||||
|
litellm.Choices(
|
||||||
|
finish_reason="stop",
|
||||||
|
index=0,
|
||||||
|
message=litellm.Message(
|
||||||
|
content="I'm sorry, I am an AI assistant and do not have real-time information. I recommend checking a reliable weather website or app for the most up-to-date weather information in Boston.",
|
||||||
|
role="assistant",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
],
|
||||||
|
created=1714573888,
|
||||||
|
model="gpt-3.5-turbo-0125",
|
||||||
|
object="chat.completion",
|
||||||
|
system_fingerprint="fp_3b956da36b",
|
||||||
|
usage=litellm.Usage(completion_tokens=37, prompt_tokens=14, total_tokens=51),
|
||||||
|
)
|
||||||
|
|
||||||
|
### NEW TRACE ###
|
||||||
|
message = [{"role": "user", "content": "what's the weather in boston"}]
|
||||||
|
langfuse_args = {
|
||||||
|
"response_obj": response_obj,
|
||||||
|
"kwargs": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"acompletion": False,
|
||||||
|
"api_key": None,
|
||||||
|
"force_timeout": 600,
|
||||||
|
"logger_fn": None,
|
||||||
|
"verbose": False,
|
||||||
|
"custom_llm_provider": "openai",
|
||||||
|
"api_base": "https://api.openai.com/v1/",
|
||||||
|
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
|
||||||
|
"model_alias_map": {},
|
||||||
|
"completion_call_id": None,
|
||||||
|
"metadata": None,
|
||||||
|
"model_info": None,
|
||||||
|
"proxy_server_request": None,
|
||||||
|
"preset_cache_key": None,
|
||||||
|
"no-log": False,
|
||||||
|
"stream_response": {},
|
||||||
|
},
|
||||||
|
"messages": message,
|
||||||
|
"optional_params": {"temperature": 0.1, "extra_body": {}},
|
||||||
|
"start_time": "2024-05-01 07:31:27.986164",
|
||||||
|
"stream": False,
|
||||||
|
"user": None,
|
||||||
|
"call_type": "completion",
|
||||||
|
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
|
||||||
|
"completion_start_time": "2024-05-01 07:31:29.903685",
|
||||||
|
"temperature": 0.1,
|
||||||
|
"extra_body": {},
|
||||||
|
"input": [{"role": "user", "content": "what's the weather in boston"}],
|
||||||
|
"api_key": "my-api-key",
|
||||||
|
"additional_args": {
|
||||||
|
"complete_input_dict": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"messages": [
|
||||||
|
{"role": "user", "content": "what's the weather in boston"}
|
||||||
|
],
|
||||||
|
"temperature": 0.1,
|
||||||
|
"extra_body": {},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"log_event_type": "successful_api_call",
|
||||||
|
"end_time": "2024-05-01 07:31:29.903685",
|
||||||
|
"cache_hit": None,
|
||||||
|
"response_cost": 6.25e-05,
|
||||||
|
},
|
||||||
|
"start_time": datetime.datetime(2024, 5, 1, 7, 31, 27, 986164),
|
||||||
|
"end_time": datetime.datetime(2024, 5, 1, 7, 31, 29, 903685),
|
||||||
|
"user_id": None,
|
||||||
|
"print_verbose": litellm.print_verbose,
|
||||||
|
"level": "DEFAULT",
|
||||||
|
"status_message": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
langfuse_response_object = langfuse_Logger.log_event(**langfuse_args)
|
||||||
|
|
||||||
|
import langfuse
|
||||||
|
|
||||||
|
langfuse_client = langfuse.Langfuse(
|
||||||
|
public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
|
||||||
|
secret_key=os.getenv("LANGFUSE_PROJECT2_SECRET"),
|
||||||
|
)
|
||||||
|
|
||||||
|
trace_id = langfuse_response_object["trace_id"]
|
||||||
|
|
||||||
|
langfuse_client.flush()
|
||||||
|
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
print(langfuse_client.get_trace(id=trace_id))
|
||||||
|
|
||||||
|
initial_langfuse_trace = langfuse_client.get_trace(id=trace_id)
|
||||||
|
|
||||||
|
### EXISTING TRACE ###
|
||||||
|
|
||||||
|
new_metadata = {"existing_trace_id": trace_id}
|
||||||
|
new_messages = [{"role": "user", "content": "What do you know?"}]
|
||||||
|
new_response_obj = litellm.ModelResponse(
|
||||||
|
id="chatcmpl-9K5HUAbVRqFrMZKXL0WoC295xhguY",
|
||||||
|
choices=[
|
||||||
|
litellm.Choices(
|
||||||
|
finish_reason="stop",
|
||||||
|
index=0,
|
||||||
|
message=litellm.Message(
|
||||||
|
content="What do I know?",
|
||||||
|
role="assistant",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
],
|
||||||
|
created=1714573888,
|
||||||
|
model="gpt-3.5-turbo-0125",
|
||||||
|
object="chat.completion",
|
||||||
|
system_fingerprint="fp_3b956da36b",
|
||||||
|
usage=litellm.Usage(completion_tokens=37, prompt_tokens=14, total_tokens=51),
|
||||||
|
)
|
||||||
|
langfuse_args = {
|
||||||
|
"response_obj": new_response_obj,
|
||||||
|
"kwargs": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"acompletion": False,
|
||||||
|
"api_key": None,
|
||||||
|
"force_timeout": 600,
|
||||||
|
"logger_fn": None,
|
||||||
|
"verbose": False,
|
||||||
|
"custom_llm_provider": "openai",
|
||||||
|
"api_base": "https://api.openai.com/v1/",
|
||||||
|
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
|
||||||
|
"model_alias_map": {},
|
||||||
|
"completion_call_id": None,
|
||||||
|
"metadata": new_metadata,
|
||||||
|
"model_info": None,
|
||||||
|
"proxy_server_request": None,
|
||||||
|
"preset_cache_key": None,
|
||||||
|
"no-log": False,
|
||||||
|
"stream_response": {},
|
||||||
|
},
|
||||||
|
"messages": new_messages,
|
||||||
|
"optional_params": {"temperature": 0.1, "extra_body": {}},
|
||||||
|
"start_time": "2024-05-01 07:31:27.986164",
|
||||||
|
"stream": False,
|
||||||
|
"user": None,
|
||||||
|
"call_type": "completion",
|
||||||
|
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
|
||||||
|
"completion_start_time": "2024-05-01 07:31:29.903685",
|
||||||
|
"temperature": 0.1,
|
||||||
|
"extra_body": {},
|
||||||
|
"input": [{"role": "user", "content": "what's the weather in boston"}],
|
||||||
|
"api_key": "my-api-key",
|
||||||
|
"additional_args": {
|
||||||
|
"complete_input_dict": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"messages": [
|
||||||
|
{"role": "user", "content": "what's the weather in boston"}
|
||||||
|
],
|
||||||
|
"temperature": 0.1,
|
||||||
|
"extra_body": {},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"log_event_type": "successful_api_call",
|
||||||
|
"end_time": "2024-05-01 07:31:29.903685",
|
||||||
|
"cache_hit": None,
|
||||||
|
"response_cost": 6.25e-05,
|
||||||
|
},
|
||||||
|
"start_time": datetime.datetime(2024, 5, 1, 7, 31, 27, 986164),
|
||||||
|
"end_time": datetime.datetime(2024, 5, 1, 7, 31, 29, 903685),
|
||||||
|
"user_id": None,
|
||||||
|
"print_verbose": litellm.print_verbose,
|
||||||
|
"level": "DEFAULT",
|
||||||
|
"status_message": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
langfuse_response_object = langfuse_Logger.log_event(**langfuse_args)
|
||||||
|
|
||||||
|
new_trace_id = langfuse_response_object["trace_id"]
|
||||||
|
|
||||||
|
assert new_trace_id == trace_id
|
||||||
|
|
||||||
|
langfuse_client.flush()
|
||||||
|
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
print(langfuse_client.get_trace(id=trace_id))
|
||||||
|
|
||||||
|
new_langfuse_trace = langfuse_client.get_trace(id=trace_id)
|
||||||
|
|
||||||
|
assert dict(initial_langfuse_trace) == dict(new_langfuse_trace)
|
||||||
|
|
||||||
|
|
||||||
def test_langfuse_logging_tool_calling():
|
def test_langfuse_logging_tool_calling():
|
||||||
litellm.set_verbose = True
|
litellm.set_verbose = True
|
||||||
|
|
||||||
|
|
|
@ -68,6 +68,7 @@ async def test_get_api_base():
|
||||||
await _pl.alerting_handler(
|
await _pl.alerting_handler(
|
||||||
message=slow_message + request_info,
|
message=slow_message + request_info,
|
||||||
level="Low",
|
level="Low",
|
||||||
|
alert_type="llm_too_slow",
|
||||||
)
|
)
|
||||||
print("passed test_get_api_base")
|
print("passed test_get_api_base")
|
||||||
|
|
||||||
|
|
|
@ -394,6 +394,8 @@ async def test_async_vertexai_response():
|
||||||
pass
|
pass
|
||||||
except litellm.Timeout as e:
|
except litellm.Timeout as e:
|
||||||
pass
|
pass
|
||||||
|
except litellm.APIError as e:
|
||||||
|
pass
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pytest.fail(f"An exception occurred: {e}")
|
pytest.fail(f"An exception occurred: {e}")
|
||||||
|
|
||||||
|
@ -636,7 +638,10 @@ def test_gemini_pro_function_calling():
|
||||||
# gemini_pro_function_calling()
|
# gemini_pro_function_calling()
|
||||||
|
|
||||||
|
|
||||||
def test_gemini_pro_function_calling_streaming():
|
@pytest.mark.parametrize("stream", [False, True])
|
||||||
|
@pytest.mark.parametrize("sync_mode", [False, True])
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_gemini_pro_function_calling_streaming(stream, sync_mode):
|
||||||
load_vertex_ai_credentials()
|
load_vertex_ai_credentials()
|
||||||
litellm.set_verbose = True
|
litellm.set_verbose = True
|
||||||
tools = [
|
tools = [
|
||||||
|
@ -665,19 +670,41 @@ def test_gemini_pro_function_calling_streaming():
|
||||||
"content": "What's the weather like in Boston today in fahrenheit?",
|
"content": "What's the weather like in Boston today in fahrenheit?",
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
optional_params = {
|
||||||
|
"tools": tools,
|
||||||
|
"tool_choice": "auto",
|
||||||
|
"n": 1,
|
||||||
|
"stream": stream,
|
||||||
|
"temperature": 0.1,
|
||||||
|
}
|
||||||
try:
|
try:
|
||||||
completion = litellm.completion(
|
if sync_mode == True:
|
||||||
model="gemini-pro",
|
response = litellm.completion(
|
||||||
messages=messages,
|
model="gemini-pro", messages=messages, **optional_params
|
||||||
tools=tools,
|
|
||||||
tool_choice="auto",
|
|
||||||
stream=True,
|
|
||||||
)
|
)
|
||||||
print(f"completion: {completion}")
|
print(f"completion: {response}")
|
||||||
|
|
||||||
|
if stream == True:
|
||||||
# assert completion.choices[0].message.content is None
|
# assert completion.choices[0].message.content is None
|
||||||
# assert len(completion.choices[0].message.tool_calls) == 1
|
# assert len(completion.choices[0].message.tool_calls) == 1
|
||||||
for chunk in completion:
|
for chunk in response:
|
||||||
|
assert isinstance(chunk, litellm.ModelResponse)
|
||||||
|
else:
|
||||||
|
assert isinstance(response, litellm.ModelResponse)
|
||||||
|
else:
|
||||||
|
response = await litellm.acompletion(
|
||||||
|
model="gemini-pro", messages=messages, **optional_params
|
||||||
|
)
|
||||||
|
print(f"completion: {response}")
|
||||||
|
|
||||||
|
if stream == True:
|
||||||
|
# assert completion.choices[0].message.content is None
|
||||||
|
# assert len(completion.choices[0].message.tool_calls) == 1
|
||||||
|
async for chunk in response:
|
||||||
print(f"chunk: {chunk}")
|
print(f"chunk: {chunk}")
|
||||||
|
assert isinstance(chunk, litellm.ModelResponse)
|
||||||
|
else:
|
||||||
|
assert isinstance(response, litellm.ModelResponse)
|
||||||
except litellm.APIError as e:
|
except litellm.APIError as e:
|
||||||
pass
|
pass
|
||||||
except litellm.RateLimitError as e:
|
except litellm.RateLimitError as e:
|
||||||
|
|
|
@ -57,7 +57,7 @@ def test_completion_custom_provider_model_name():
|
||||||
messages=messages,
|
messages=messages,
|
||||||
logger_fn=logger_fn,
|
logger_fn=logger_fn,
|
||||||
)
|
)
|
||||||
# Add any assertions here to, check the response
|
# Add any assertions here to,check the response
|
||||||
print(response)
|
print(response)
|
||||||
print(response["choices"][0]["finish_reason"])
|
print(response["choices"][0]["finish_reason"])
|
||||||
except litellm.Timeout as e:
|
except litellm.Timeout as e:
|
||||||
|
@ -231,6 +231,76 @@ def test_completion_claude_3_function_call():
|
||||||
pytest.fail(f"Error occurred: {e}")
|
pytest.fail(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def test_completion_cohere_command_r_plus_function_call():
|
||||||
|
litellm.set_verbose = True
|
||||||
|
tools = [
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "get_current_weather",
|
||||||
|
"description": "Get the current weather in a given location",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"location": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The city and state, e.g. San Francisco, CA",
|
||||||
|
},
|
||||||
|
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||||
|
},
|
||||||
|
"required": ["location"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "What's the weather like in Boston today in Fahrenheit?",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
try:
|
||||||
|
# test without max tokens
|
||||||
|
response = completion(
|
||||||
|
model="command-r-plus",
|
||||||
|
messages=messages,
|
||||||
|
tools=tools,
|
||||||
|
tool_choice="auto",
|
||||||
|
)
|
||||||
|
# Add any assertions, here to check response args
|
||||||
|
print(response)
|
||||||
|
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
|
||||||
|
assert isinstance(
|
||||||
|
response.choices[0].message.tool_calls[0].function.arguments, str
|
||||||
|
)
|
||||||
|
|
||||||
|
messages.append(
|
||||||
|
response.choices[0].message.model_dump()
|
||||||
|
) # Add assistant tool invokes
|
||||||
|
tool_result = (
|
||||||
|
'{"location": "Boston", "temperature": "72", "unit": "fahrenheit"}'
|
||||||
|
)
|
||||||
|
# Add user submitted tool results in the OpenAI format
|
||||||
|
messages.append(
|
||||||
|
{
|
||||||
|
"tool_call_id": response.choices[0].message.tool_calls[0].id,
|
||||||
|
"role": "tool",
|
||||||
|
"name": response.choices[0].message.tool_calls[0].function.name,
|
||||||
|
"content": tool_result,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# In the second response, Cohere should deduce answer from tool results
|
||||||
|
second_response = completion(
|
||||||
|
model="command-r-plus",
|
||||||
|
messages=messages,
|
||||||
|
tools=tools,
|
||||||
|
tool_choice="auto",
|
||||||
|
)
|
||||||
|
print(second_response)
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
def test_parse_xml_params():
|
def test_parse_xml_params():
|
||||||
from litellm.llms.prompt_templates.factory import parse_xml_params
|
from litellm.llms.prompt_templates.factory import parse_xml_params
|
||||||
|
|
||||||
|
@ -1291,6 +1361,7 @@ def test_completion_logprobs_stream():
|
||||||
for chunk in response:
|
for chunk in response:
|
||||||
# check if atleast one chunk has log probs
|
# check if atleast one chunk has log probs
|
||||||
print(chunk)
|
print(chunk)
|
||||||
|
print(f"chunk.choices[0]: {chunk.choices[0]}")
|
||||||
if "logprobs" in chunk.choices[0]:
|
if "logprobs" in chunk.choices[0]:
|
||||||
# assert we got a valid logprob in the choices
|
# assert we got a valid logprob in the choices
|
||||||
assert len(chunk.choices[0].logprobs.content[0].top_logprobs) == 3
|
assert len(chunk.choices[0].logprobs.content[0].top_logprobs) == 3
|
||||||
|
@ -1781,7 +1852,6 @@ def test_completion_replicate_llama3():
|
||||||
print("RESPONSE STRING\n", response_str)
|
print("RESPONSE STRING\n", response_str)
|
||||||
if type(response_str) != str:
|
if type(response_str) != str:
|
||||||
pytest.fail(f"Error occurred: {e}")
|
pytest.fail(f"Error occurred: {e}")
|
||||||
raise Exception("it worked!")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pytest.fail(f"Error occurred: {e}")
|
pytest.fail(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
@ -2655,6 +2725,88 @@ def test_completion_palm_stream():
|
||||||
pytest.fail(f"Error occurred: {e}")
|
pytest.fail(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def test_completion_watsonx():
|
||||||
|
litellm.set_verbose = True
|
||||||
|
model_name = "watsonx/ibm/granite-13b-chat-v2"
|
||||||
|
try:
|
||||||
|
response = completion(
|
||||||
|
model=model_name,
|
||||||
|
messages=messages,
|
||||||
|
stop=["stop"],
|
||||||
|
max_tokens=20,
|
||||||
|
)
|
||||||
|
# Add any assertions here to check the response
|
||||||
|
print(response)
|
||||||
|
except litellm.APIError as e:
|
||||||
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"provider, model, project, region_name, token",
|
||||||
|
[
|
||||||
|
("azure", "chatgpt-v-2", None, None, "test-token"),
|
||||||
|
("vertex_ai", "anthropic-claude-3", "adroit-crow-1", "us-east1", None),
|
||||||
|
("watsonx", "ibm/granite", "96946574", "dallas", "1234"),
|
||||||
|
("bedrock", "anthropic.claude-3", None, "us-east-1", None),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_unified_auth_params(provider, model, project, region_name, token):
|
||||||
|
"""
|
||||||
|
Check if params = ["project", "region_name", "token"]
|
||||||
|
are correctly translated for = ["azure", "vertex_ai", "watsonx", "aws"]
|
||||||
|
|
||||||
|
tests get_optional_params
|
||||||
|
"""
|
||||||
|
data = {
|
||||||
|
"project": project,
|
||||||
|
"region_name": region_name,
|
||||||
|
"token": token,
|
||||||
|
"custom_llm_provider": provider,
|
||||||
|
"model": model,
|
||||||
|
}
|
||||||
|
|
||||||
|
translated_optional_params = litellm.utils.get_optional_params(**data)
|
||||||
|
|
||||||
|
if provider == "azure":
|
||||||
|
special_auth_params = (
|
||||||
|
litellm.AzureOpenAIConfig().get_mapped_special_auth_params()
|
||||||
|
)
|
||||||
|
elif provider == "bedrock":
|
||||||
|
special_auth_params = (
|
||||||
|
litellm.AmazonBedrockGlobalConfig().get_mapped_special_auth_params()
|
||||||
|
)
|
||||||
|
elif provider == "vertex_ai":
|
||||||
|
special_auth_params = litellm.VertexAIConfig().get_mapped_special_auth_params()
|
||||||
|
elif provider == "watsonx":
|
||||||
|
special_auth_params = (
|
||||||
|
litellm.IBMWatsonXAIConfig().get_mapped_special_auth_params()
|
||||||
|
)
|
||||||
|
|
||||||
|
for param, value in special_auth_params.items():
|
||||||
|
assert param in data
|
||||||
|
assert value in translated_optional_params
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_acompletion_watsonx():
|
||||||
|
litellm.set_verbose = True
|
||||||
|
model_name = "watsonx/ibm/granite-13b-chat-v2"
|
||||||
|
print("testing watsonx")
|
||||||
|
try:
|
||||||
|
response = await litellm.acompletion(
|
||||||
|
model=model_name,
|
||||||
|
messages=messages,
|
||||||
|
temperature=0.2,
|
||||||
|
max_tokens=80,
|
||||||
|
)
|
||||||
|
# Add any assertions here to check the response
|
||||||
|
print(response)
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
# test_completion_palm_stream()
|
# test_completion_palm_stream()
|
||||||
|
|
||||||
# test_completion_deep_infra()
|
# test_completion_deep_infra()
|
||||||
|
|
|
@ -328,3 +328,56 @@ def test_dalle_3_azure_cost_tracking():
|
||||||
completion_response=response, call_type="image_generation"
|
completion_response=response, call_type="image_generation"
|
||||||
)
|
)
|
||||||
assert cost > 0
|
assert cost > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_replicate_llama3_cost_tracking():
|
||||||
|
litellm.set_verbose = True
|
||||||
|
model = "replicate/meta/meta-llama-3-8b-instruct"
|
||||||
|
litellm.register_model(
|
||||||
|
{
|
||||||
|
"replicate/meta/meta-llama-3-8b-instruct": {
|
||||||
|
"input_cost_per_token": 0.00000005,
|
||||||
|
"output_cost_per_token": 0.00000025,
|
||||||
|
"litellm_provider": "replicate",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
response = litellm.ModelResponse(
|
||||||
|
id="chatcmpl-cad7282f-7f68-41e7-a5ab-9eb33ae301dc",
|
||||||
|
choices=[
|
||||||
|
litellm.utils.Choices(
|
||||||
|
finish_reason="stop",
|
||||||
|
index=0,
|
||||||
|
message=litellm.utils.Message(
|
||||||
|
content="I'm doing well, thanks for asking! I'm here to help you with any questions or tasks you may have. How can I assist you today?",
|
||||||
|
role="assistant",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
],
|
||||||
|
created=1714401369,
|
||||||
|
model="replicate/meta/meta-llama-3-8b-instruct",
|
||||||
|
object="chat.completion",
|
||||||
|
system_fingerprint=None,
|
||||||
|
usage=litellm.utils.Usage(
|
||||||
|
prompt_tokens=48, completion_tokens=31, total_tokens=79
|
||||||
|
),
|
||||||
|
)
|
||||||
|
cost = litellm.completion_cost(
|
||||||
|
completion_response=response,
|
||||||
|
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"cost: {cost}")
|
||||||
|
cost = round(cost, 5)
|
||||||
|
expected_cost = round(
|
||||||
|
litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
|
||||||
|
"input_cost_per_token"
|
||||||
|
]
|
||||||
|
* 48
|
||||||
|
+ litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
|
||||||
|
"output_cost_per_token"
|
||||||
|
]
|
||||||
|
* 31,
|
||||||
|
5,
|
||||||
|
)
|
||||||
|
assert cost == expected_cost
|
||||||
|
|
|
@ -26,6 +26,9 @@ class DBModel(BaseModel):
|
||||||
model_info: dict
|
model_info: dict
|
||||||
litellm_params: dict
|
litellm_params: dict
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
protected_namespaces = ()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_delete_deployment():
|
async def test_delete_deployment():
|
||||||
|
|
|
@ -529,6 +529,7 @@ def test_chat_bedrock_stream():
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_async_chat_bedrock_stream():
|
async def test_async_chat_bedrock_stream():
|
||||||
try:
|
try:
|
||||||
|
litellm.set_verbose = True
|
||||||
customHandler = CompletionCustomHandler()
|
customHandler = CompletionCustomHandler()
|
||||||
litellm.callbacks = [customHandler]
|
litellm.callbacks = [customHandler]
|
||||||
response = await litellm.acompletion(
|
response = await litellm.acompletion(
|
||||||
|
|
|
@ -484,6 +484,20 @@ def test_mistral_embeddings():
|
||||||
pytest.fail(f"Error occurred: {e}")
|
pytest.fail(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="local test")
|
||||||
|
def test_watsonx_embeddings():
|
||||||
|
try:
|
||||||
|
litellm.set_verbose = True
|
||||||
|
response = litellm.embedding(
|
||||||
|
model="watsonx/ibm/slate-30m-english-rtrvr",
|
||||||
|
input=["good morning from litellm"],
|
||||||
|
)
|
||||||
|
print(f"response: {response}")
|
||||||
|
assert isinstance(response.usage, litellm.Usage)
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
# test_mistral_embeddings()
|
# test_mistral_embeddings()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,7 @@ def test_empty_content():
|
||||||
pass
|
pass
|
||||||
|
|
||||||
function_setup(
|
function_setup(
|
||||||
original_function=completion,
|
original_function="completion",
|
||||||
rules_obj=rules_obj,
|
rules_obj=rules_obj,
|
||||||
start_time=datetime.now(),
|
start_time=datetime.now(),
|
||||||
messages=[],
|
messages=[],
|
||||||
|
|
|
@ -136,8 +136,8 @@ def test_image_generation_bedrock():
|
||||||
litellm.set_verbose = True
|
litellm.set_verbose = True
|
||||||
response = litellm.image_generation(
|
response = litellm.image_generation(
|
||||||
prompt="A cute baby sea otter",
|
prompt="A cute baby sea otter",
|
||||||
model="bedrock/stability.stable-diffusion-xl-v0",
|
model="bedrock/stability.stable-diffusion-xl-v1",
|
||||||
aws_region_name="us-east-1",
|
aws_region_name="us-west-2",
|
||||||
)
|
)
|
||||||
print(f"response: {response}")
|
print(f"response: {response}")
|
||||||
except litellm.RateLimitError as e:
|
except litellm.RateLimitError as e:
|
||||||
|
@ -156,8 +156,8 @@ async def test_aimage_generation_bedrock_with_optional_params():
|
||||||
try:
|
try:
|
||||||
response = await litellm.aimage_generation(
|
response = await litellm.aimage_generation(
|
||||||
prompt="A cute baby sea otter",
|
prompt="A cute baby sea otter",
|
||||||
model="bedrock/stability.stable-diffusion-xl-v0",
|
model="bedrock/stability.stable-diffusion-xl-v1",
|
||||||
size="128x128",
|
size="256x256",
|
||||||
)
|
)
|
||||||
print(f"response: {response}")
|
print(f"response: {response}")
|
||||||
except litellm.RateLimitError as e:
|
except litellm.RateLimitError as e:
|
||||||
|
|
|
@ -201,6 +201,7 @@ async def test_router_atext_completion_streaming():
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_router_completion_streaming():
|
async def test_router_completion_streaming():
|
||||||
|
litellm.set_verbose = True
|
||||||
messages = [
|
messages = [
|
||||||
{"role": "user", "content": "Hello, can you generate a 500 words poem?"}
|
{"role": "user", "content": "Hello, can you generate a 500 words poem?"}
|
||||||
]
|
]
|
||||||
|
@ -219,9 +220,9 @@ async def test_router_completion_streaming():
|
||||||
{
|
{
|
||||||
"model_name": "azure-model",
|
"model_name": "azure-model",
|
||||||
"litellm_params": {
|
"litellm_params": {
|
||||||
"model": "azure/gpt-35-turbo",
|
"model": "azure/gpt-turbo",
|
||||||
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
|
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
|
||||||
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
|
"api_base": "https://openai-france-1234.openai.azure.com",
|
||||||
"rpm": 6,
|
"rpm": 6,
|
||||||
},
|
},
|
||||||
"model_info": {"id": 2},
|
"model_info": {"id": 2},
|
||||||
|
@ -229,9 +230,9 @@ async def test_router_completion_streaming():
|
||||||
{
|
{
|
||||||
"model_name": "azure-model",
|
"model_name": "azure-model",
|
||||||
"litellm_params": {
|
"litellm_params": {
|
||||||
"model": "azure/gpt-35-turbo",
|
"model": "azure/gpt-turbo",
|
||||||
"api_key": "os.environ/AZURE_CANADA_API_KEY",
|
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
|
||||||
"api_base": "https://my-endpoint-canada-berri992.openai.azure.com",
|
"api_base": "https://openai-france-1234.openai.azure.com",
|
||||||
"rpm": 6,
|
"rpm": 6,
|
||||||
},
|
},
|
||||||
"model_info": {"id": 3},
|
"model_info": {"id": 3},
|
||||||
|
@ -262,4 +263,4 @@ async def test_router_completion_streaming():
|
||||||
## check if calls equally distributed
|
## check if calls equally distributed
|
||||||
cache_dict = router.cache.get_cache(key=cache_key)
|
cache_dict = router.cache.get_cache(key=cache_key)
|
||||||
for k, v in cache_dict.items():
|
for k, v in cache_dict.items():
|
||||||
assert v == 1
|
assert v == 1, f"Failed. K={k} called v={v} times, cache_dict={cache_dict}"
|
||||||
|
|
|
@ -555,3 +555,171 @@ async def test_lowest_latency_routing_with_timeouts():
|
||||||
|
|
||||||
# ALL the Requests should have been routed to the fast-endpoint
|
# ALL the Requests should have been routed to the fast-endpoint
|
||||||
assert deployments["fast-endpoint"] == 10
|
assert deployments["fast-endpoint"] == 10
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_lowest_latency_routing_first_pick():
|
||||||
|
"""
|
||||||
|
PROD Test:
|
||||||
|
- When all deployments are latency=0, it should randomly pick a deployment
|
||||||
|
- IT SHOULD NEVER PICK THE Very First deployment everytime all deployment latencies are 0
|
||||||
|
- This ensures that after the ttl window resets it randomly picks a deployment
|
||||||
|
"""
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
litellm.set_verbose = True
|
||||||
|
|
||||||
|
router = Router(
|
||||||
|
model_list=[
|
||||||
|
{
|
||||||
|
"model_name": "azure-model",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "openai/fast-endpoint",
|
||||||
|
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
|
||||||
|
"api_key": "fake-key",
|
||||||
|
},
|
||||||
|
"model_info": {"id": "fast-endpoint"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "azure-model",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "openai/fast-endpoint-2",
|
||||||
|
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
|
||||||
|
"api_key": "fake-key",
|
||||||
|
},
|
||||||
|
"model_info": {"id": "fast-endpoint-2"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "azure-model",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "openai/fast-endpoint-2",
|
||||||
|
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
|
||||||
|
"api_key": "fake-key",
|
||||||
|
},
|
||||||
|
"model_info": {"id": "fast-endpoint-3"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "azure-model",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "openai/fast-endpoint-2",
|
||||||
|
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
|
||||||
|
"api_key": "fake-key",
|
||||||
|
},
|
||||||
|
"model_info": {"id": "fast-endpoint-4"},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
routing_strategy="latency-based-routing",
|
||||||
|
routing_strategy_args={"ttl": 0.0000000001},
|
||||||
|
set_verbose=True,
|
||||||
|
debug_level="DEBUG",
|
||||||
|
) # type: ignore
|
||||||
|
|
||||||
|
deployments = {}
|
||||||
|
for _ in range(5):
|
||||||
|
response = await router.acompletion(
|
||||||
|
model="azure-model", messages=[{"role": "user", "content": "hello"}]
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
_picked_model_id = response._hidden_params["model_id"]
|
||||||
|
if _picked_model_id not in deployments:
|
||||||
|
deployments[_picked_model_id] = 1
|
||||||
|
else:
|
||||||
|
deployments[_picked_model_id] += 1
|
||||||
|
await asyncio.sleep(0.000000000005)
|
||||||
|
|
||||||
|
print("deployments", deployments)
|
||||||
|
|
||||||
|
# assert that len(deployments) >1
|
||||||
|
assert len(deployments) > 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("buffer", [0, 1])
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_lowest_latency_routing_buffer(buffer):
|
||||||
|
"""
|
||||||
|
Allow shuffling calls within a certain latency buffer
|
||||||
|
"""
|
||||||
|
model_list = [
|
||||||
|
{
|
||||||
|
"model_name": "azure-model",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "azure/gpt-turbo",
|
||||||
|
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
|
||||||
|
"api_base": "https://openai-france-1234.openai.azure.com",
|
||||||
|
"rpm": 1440,
|
||||||
|
},
|
||||||
|
"model_info": {"id": 1},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "azure-model",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "azure/gpt-35-turbo",
|
||||||
|
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
|
||||||
|
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
|
||||||
|
"rpm": 6,
|
||||||
|
},
|
||||||
|
"model_info": {"id": 2},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
router = Router(
|
||||||
|
model_list=model_list,
|
||||||
|
routing_strategy="latency-based-routing",
|
||||||
|
set_verbose=False,
|
||||||
|
num_retries=3,
|
||||||
|
routing_strategy_args={"lowest_latency_buffer": buffer},
|
||||||
|
) # type: ignore
|
||||||
|
|
||||||
|
## DEPLOYMENT 1 ##
|
||||||
|
deployment_id = 1
|
||||||
|
kwargs = {
|
||||||
|
"litellm_params": {
|
||||||
|
"metadata": {
|
||||||
|
"model_group": "azure-model",
|
||||||
|
},
|
||||||
|
"model_info": {"id": 1},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
start_time = time.time()
|
||||||
|
response_obj = {"usage": {"total_tokens": 50}}
|
||||||
|
time.sleep(3)
|
||||||
|
end_time = time.time()
|
||||||
|
router.lowestlatency_logger.log_success_event(
|
||||||
|
response_obj=response_obj,
|
||||||
|
kwargs=kwargs,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
)
|
||||||
|
## DEPLOYMENT 2 ##
|
||||||
|
deployment_id = 2
|
||||||
|
kwargs = {
|
||||||
|
"litellm_params": {
|
||||||
|
"metadata": {
|
||||||
|
"model_group": "azure-model",
|
||||||
|
},
|
||||||
|
"model_info": {"id": 2},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
start_time = time.time()
|
||||||
|
response_obj = {"usage": {"total_tokens": 20}}
|
||||||
|
time.sleep(2)
|
||||||
|
end_time = time.time()
|
||||||
|
router.lowestlatency_logger.log_success_event(
|
||||||
|
response_obj=response_obj,
|
||||||
|
kwargs=kwargs,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
)
|
||||||
|
|
||||||
|
## CHECK WHAT'S SELECTED ##
|
||||||
|
# print(router.lowesttpm_logger.get_available_deployments(model_group="azure-model"))
|
||||||
|
selected_deployments = {}
|
||||||
|
for _ in range(50):
|
||||||
|
print(router.get_available_deployment(model="azure-model"))
|
||||||
|
selected_deployments[
|
||||||
|
router.get_available_deployment(model="azure-model")["model_info"]["id"]
|
||||||
|
] = 1
|
||||||
|
|
||||||
|
if buffer == 0:
|
||||||
|
assert len(selected_deployments.keys()) == 1
|
||||||
|
else:
|
||||||
|
assert len(selected_deployments.keys()) == 2
|
||||||
|
|
10
litellm/tests/test_pydantic_namespaces.py
Normal file
10
litellm/tests/test_pydantic_namespaces.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
import warnings
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
def test_namespace_conflict_warning():
|
||||||
|
with warnings.catch_warnings(record=True) as recorded_warnings:
|
||||||
|
warnings.simplefilter("always") # Capture all warnings
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
# Check that no warning with the specific message was raised
|
||||||
|
assert not any("conflict with protected namespace" in str(w.message) for w in recorded_warnings), "Test failed: 'conflict with protected namespace' warning was encountered!"
|
|
@ -1,7 +1,7 @@
|
||||||
#### What this tests ####
|
#### What this tests ####
|
||||||
# This tests litellm router
|
# This tests litellm router
|
||||||
|
|
||||||
import sys, os, time
|
import sys, os, time, openai
|
||||||
import traceback, asyncio
|
import traceback, asyncio
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
@ -14,10 +14,169 @@ from litellm.router import Deployment, LiteLLM_Params, ModelInfo
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
import os, httpx
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("num_retries", [None, 2])
|
||||||
|
@pytest.mark.parametrize("max_retries", [None, 4])
|
||||||
|
def test_router_num_retries_init(num_retries, max_retries):
|
||||||
|
"""
|
||||||
|
- test when num_retries set v/s not
|
||||||
|
- test client value when max retries set v/s not
|
||||||
|
"""
|
||||||
|
router = Router(
|
||||||
|
model_list=[
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo", # openai model name
|
||||||
|
"litellm_params": { # params for litellm completion/embedding call
|
||||||
|
"model": "azure/chatgpt-v-2",
|
||||||
|
"api_key": "bad-key",
|
||||||
|
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||||
|
"api_base": os.getenv("AZURE_API_BASE"),
|
||||||
|
"max_retries": max_retries,
|
||||||
|
},
|
||||||
|
"model_info": {"id": 12345},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
num_retries=num_retries,
|
||||||
|
)
|
||||||
|
|
||||||
|
if num_retries is not None:
|
||||||
|
assert router.num_retries == num_retries
|
||||||
|
else:
|
||||||
|
assert router.num_retries == openai.DEFAULT_MAX_RETRIES
|
||||||
|
|
||||||
|
model_client = router._get_client(
|
||||||
|
{"model_info": {"id": 12345}}, client_type="async", kwargs={}
|
||||||
|
)
|
||||||
|
|
||||||
|
if max_retries is not None:
|
||||||
|
assert getattr(model_client, "max_retries") == max_retries
|
||||||
|
else:
|
||||||
|
assert getattr(model_client, "max_retries") == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"timeout", [10, 1.0, httpx.Timeout(timeout=300.0, connect=20.0)]
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize("ssl_verify", [True, False])
|
||||||
|
def test_router_timeout_init(timeout, ssl_verify):
|
||||||
|
"""
|
||||||
|
Allow user to pass httpx.Timeout
|
||||||
|
|
||||||
|
related issue - https://github.com/BerriAI/litellm/issues/3162
|
||||||
|
"""
|
||||||
|
litellm.ssl_verify = ssl_verify
|
||||||
|
|
||||||
|
router = Router(
|
||||||
|
model_list=[
|
||||||
|
{
|
||||||
|
"model_name": "test-model",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "azure/chatgpt-v-2",
|
||||||
|
"api_key": os.getenv("AZURE_API_KEY"),
|
||||||
|
"api_base": os.getenv("AZURE_API_BASE"),
|
||||||
|
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||||
|
"timeout": timeout,
|
||||||
|
},
|
||||||
|
"model_info": {"id": 1234},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
model_client = router._get_client(
|
||||||
|
deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
|
||||||
|
)
|
||||||
|
|
||||||
|
assert getattr(model_client, "timeout") == timeout
|
||||||
|
|
||||||
|
print(f"vars model_client: {vars(model_client)}")
|
||||||
|
http_client = getattr(model_client, "_client")
|
||||||
|
print(f"http client: {vars(http_client)}, ssl_Verify={ssl_verify}")
|
||||||
|
if ssl_verify == False:
|
||||||
|
assert http_client._transport._pool._ssl_context.verify_mode.name == "CERT_NONE"
|
||||||
|
else:
|
||||||
|
assert (
|
||||||
|
http_client._transport._pool._ssl_context.verify_mode.name
|
||||||
|
== "CERT_REQUIRED"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("sync_mode", [False, True])
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_router_retries(sync_mode):
|
||||||
|
"""
|
||||||
|
- make sure retries work as expected
|
||||||
|
"""
|
||||||
|
model_list = [
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {"model": "gpt-3.5-turbo", "api_key": "bad-key"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "azure/chatgpt-v-2",
|
||||||
|
"api_key": os.getenv("AZURE_API_KEY"),
|
||||||
|
"api_base": os.getenv("AZURE_API_BASE"),
|
||||||
|
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
router = Router(model_list=model_list, num_retries=2)
|
||||||
|
|
||||||
|
if sync_mode:
|
||||||
|
router.completion(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
await router.acompletion(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"mistral_api_base",
|
||||||
|
[
|
||||||
|
"os.environ/AZURE_MISTRAL_API_BASE",
|
||||||
|
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1/",
|
||||||
|
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1",
|
||||||
|
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/",
|
||||||
|
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_router_azure_ai_studio_init(mistral_api_base):
|
||||||
|
router = Router(
|
||||||
|
model_list=[
|
||||||
|
{
|
||||||
|
"model_name": "test-model",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "azure/mistral-large-latest",
|
||||||
|
"api_key": "os.environ/AZURE_MISTRAL_API_KEY",
|
||||||
|
"api_base": mistral_api_base,
|
||||||
|
},
|
||||||
|
"model_info": {"id": 1234},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
model_client = router._get_client(
|
||||||
|
deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
|
||||||
|
)
|
||||||
|
url = getattr(model_client, "_base_url")
|
||||||
|
uri_reference = str(getattr(url, "_uri_reference"))
|
||||||
|
|
||||||
|
print(f"uri_reference: {uri_reference}")
|
||||||
|
|
||||||
|
assert "/v1/" in uri_reference
|
||||||
|
assert uri_reference.count("v1") == 1
|
||||||
|
|
||||||
|
|
||||||
def test_exception_raising():
|
def test_exception_raising():
|
||||||
# this tests if the router raises an exception when invalid params are set
|
# this tests if the router raises an exception when invalid params are set
|
||||||
# in this test both deployments have bad keys - Keep this test. It validates if the router raises the most recent exception
|
# in this test both deployments have bad keys - Keep this test. It validates if the router raises the most recent exception
|
||||||
|
@ -995,6 +1154,7 @@ def test_consistent_model_id():
|
||||||
assert id1 == id2
|
assert id1 == id2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="local test")
|
||||||
def test_reading_keys_os_environ():
|
def test_reading_keys_os_environ():
|
||||||
import openai
|
import openai
|
||||||
|
|
||||||
|
@ -1094,6 +1254,7 @@ def test_reading_keys_os_environ():
|
||||||
# test_reading_keys_os_environ()
|
# test_reading_keys_os_environ()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="local test")
|
||||||
def test_reading_openai_keys_os_environ():
|
def test_reading_openai_keys_os_environ():
|
||||||
import openai
|
import openai
|
||||||
|
|
||||||
|
|
|
@ -46,6 +46,7 @@ def test_async_fallbacks(caplog):
|
||||||
router = Router(
|
router = Router(
|
||||||
model_list=model_list,
|
model_list=model_list,
|
||||||
fallbacks=[{"gpt-3.5-turbo": ["azure/gpt-3.5-turbo"]}],
|
fallbacks=[{"gpt-3.5-turbo": ["azure/gpt-3.5-turbo"]}],
|
||||||
|
num_retries=1,
|
||||||
)
|
)
|
||||||
|
|
||||||
user_message = "Hello, how are you?"
|
user_message = "Hello, how are you?"
|
||||||
|
@ -81,8 +82,8 @@ def test_async_fallbacks(caplog):
|
||||||
# Define the expected log messages
|
# Define the expected log messages
|
||||||
# - error request, falling back notice, success notice
|
# - error request, falling back notice, success notice
|
||||||
expected_logs = [
|
expected_logs = [
|
||||||
"Intialized router with Routing strategy: simple-shuffle\n\nRouting fallbacks: [{'gpt-3.5-turbo': ['azure/gpt-3.5-turbo']}]\n\nRouting context window fallbacks: None\n\nRouter Redis Caching=None",
|
|
||||||
"litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m",
|
"litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m",
|
||||||
|
"litellm.acompletion(model=None)\x1b[31m Exception No deployments available for selected model, passed model=gpt-3.5-turbo\x1b[0m",
|
||||||
"Falling back to model_group = azure/gpt-3.5-turbo",
|
"Falling back to model_group = azure/gpt-3.5-turbo",
|
||||||
"litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m",
|
"litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m",
|
||||||
]
|
]
|
||||||
|
|
|
@ -22,10 +22,10 @@ class MyCustomHandler(CustomLogger):
|
||||||
def log_pre_api_call(self, model, messages, kwargs):
|
     def log_pre_api_call(self, model, messages, kwargs):
         print(f"Pre-API Call")
         print(
-            f"previous_models: {kwargs['litellm_params']['metadata']['previous_models']}"
+            f"previous_models: {kwargs['litellm_params']['metadata'].get('previous_models', None)}"
         )
-        self.previous_models += len(
-            kwargs["litellm_params"]["metadata"]["previous_models"]
+        self.previous_models = len(
+            kwargs["litellm_params"]["metadata"].get("previous_models", [])
         )  # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
         print(f"self.previous_models: {self.previous_models}")

@@ -127,7 +127,7 @@ def test_sync_fallbacks():
         response = router.completion(**kwargs)
         print(f"response: {response}")
         time.sleep(0.05)  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4

         print("Passed ! Test router_fallbacks: test_sync_fallbacks()")
         router.reset()
@@ -140,7 +140,7 @@ def test_sync_fallbacks():

 @pytest.mark.asyncio
 async def test_async_fallbacks():
-    litellm.set_verbose = False
+    litellm.set_verbose = True
     model_list = [
         {  # list of model deployments
             "model_name": "azure/gpt-3.5-turbo",  # openai model name
@@ -209,12 +209,13 @@ async def test_async_fallbacks():
     user_message = "Hello, how are you?"
     messages = [{"content": user_message, "role": "user"}]
     try:
+        kwargs["model"] = "azure/gpt-3.5-turbo"
         response = await router.acompletion(**kwargs)
         print(f"customHandler.previous_models: {customHandler.previous_models}")
         await asyncio.sleep(
             0.05
         )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except litellm.Timeout as e:
         pass
@@ -268,7 +269,7 @@ def test_sync_fallbacks_embeddings():
         response = router.embedding(**kwargs)
         print(f"customHandler.previous_models: {customHandler.previous_models}")
         time.sleep(0.05)  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except litellm.Timeout as e:
         pass
@@ -322,7 +323,7 @@ async def test_async_fallbacks_embeddings():
         await asyncio.sleep(
             0.05
         )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except litellm.Timeout as e:
         pass
@@ -401,7 +402,7 @@ def test_dynamic_fallbacks_sync():
         response = router.completion(**kwargs)
         print(f"response: {response}")
         time.sleep(0.05)  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except Exception as e:
         pytest.fail(f"An exception occurred - {e}")
@@ -487,7 +488,7 @@ async def test_dynamic_fallbacks_async():
         await asyncio.sleep(
             0.05
         )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except Exception as e:
         pytest.fail(f"An exception occurred - {e}")
@@ -572,7 +573,7 @@ async def test_async_fallbacks_streaming():
         await asyncio.sleep(
             0.05
         )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
         router.reset()
     except litellm.Timeout as e:
         pass
@@ -751,7 +752,7 @@ async def test_async_fallbacks_max_retries_per_request():
     router.reset()


-def test_usage_based_routing_fallbacks():
+def test_ausage_based_routing_fallbacks():
     try:
         # [Prod Test]
         # IT tests Usage Based Routing with fallbacks
@@ -765,10 +766,10 @@ def test_usage_based_routing_fallbacks():
         load_dotenv()

         # Constants for TPM and RPM allocation
-        AZURE_FAST_TPM = 3
-        AZURE_BASIC_TPM = 4
-        OPENAI_TPM = 400
-        ANTHROPIC_TPM = 100000
+        AZURE_FAST_RPM = 1
+        AZURE_BASIC_RPM = 1
+        OPENAI_RPM = 2
+        ANTHROPIC_RPM = 100000

         def get_azure_params(deployment_name: str):
             params = {
@@ -797,22 +798,26 @@ def test_usage_based_routing_fallbacks():
             {
                 "model_name": "azure/gpt-4-fast",
                 "litellm_params": get_azure_params("chatgpt-v-2"),
-                "tpm": AZURE_FAST_TPM,
+                "model_info": {"id": 1},
+                "rpm": AZURE_FAST_RPM,
             },
             {
                 "model_name": "azure/gpt-4-basic",
                 "litellm_params": get_azure_params("chatgpt-v-2"),
-                "tpm": AZURE_BASIC_TPM,
+                "model_info": {"id": 2},
+                "rpm": AZURE_BASIC_RPM,
             },
             {
                 "model_name": "openai-gpt-4",
                 "litellm_params": get_openai_params("gpt-3.5-turbo"),
-                "tpm": OPENAI_TPM,
+                "model_info": {"id": 3},
+                "rpm": OPENAI_RPM,
             },
             {
                 "model_name": "anthropic-claude-instant-1.2",
                 "litellm_params": get_anthropic_params("claude-instant-1.2"),
-                "tpm": ANTHROPIC_TPM,
+                "model_info": {"id": 4},
+                "rpm": ANTHROPIC_RPM,
             },
         ]
         # litellm.set_verbose=True
@@ -830,6 +835,7 @@ def test_usage_based_routing_fallbacks():
             routing_strategy="usage-based-routing",
             redis_host=os.environ["REDIS_HOST"],
             redis_port=os.environ["REDIS_PORT"],
+            num_retries=0,
         )

         messages = [
@@ -842,10 +848,10 @@ def test_usage_based_routing_fallbacks():
             mock_response="very nice to meet you",
         )
         print("response: ", response)
-        print("response._hidden_params: ", response._hidden_params)
+        print(f"response._hidden_params: {response._hidden_params}")
         # in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass
         # the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM
-        assert response._hidden_params["custom_llm_provider"] == "openai"
+        assert response._hidden_params["model_id"] == "1"

         # now make 100 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2
         for i in range(20):
@@ -859,7 +865,7 @@ def test_usage_based_routing_fallbacks():
             print("response._hidden_params: ", response._hidden_params)
             if i == 19:
                 # by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2
-                assert response._hidden_params["custom_llm_provider"] == "anthropic"
+                assert response._hidden_params["model_id"] == "4"

     except Exception as e:
         pytest.fail(f"An exception occurred {e}")

@@ -203,7 +203,7 @@ def test_timeouts_router():
             },
         },
     ]
-    router = Router(model_list=model_list)
+    router = Router(model_list=model_list, num_retries=0)

     print("PASSED !")

@@ -396,7 +396,9 @@ def test_router_init_gpt_4_vision_enhancements():
         pytest.fail(f"Error occurred: {e}")


-def test_openai_with_organization():
+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_openai_with_organization(sync_mode):
     try:
         print("Testing OpenAI with organization")
         model_list = [
@@ -418,6 +420,7 @@ def test_openai_with_organization():
         print(router.model_list)
         print(router.model_list[0])

+        if sync_mode:
             openai_client = router._get_client(
                 deployment=router.model_list[0],
                 kwargs={"input": ["hello"], "model": "openai-bad-org"},
@@ -433,7 +436,9 @@ def test_openai_with_organization():
                     model="openai-bad-org",
                     messages=[{"role": "user", "content": "this is a test"}],
                 )
-                pytest.fail("Request should have failed - This organization does not exist")
+                pytest.fail(
+                    "Request should have failed - This organization does not exist"
+                )
             except Exception as e:
                 print("Got exception: " + str(e))
                 assert "No such organization: org-ikDc4ex8NB" in str(e)
@@ -444,6 +449,36 @@ def test_openai_with_organization():
                 messages=[{"role": "user", "content": "this is a test"}],
                 max_tokens=5,
             )
+        else:
+            openai_client = router._get_client(
+                deployment=router.model_list[0],
+                kwargs={"input": ["hello"], "model": "openai-bad-org"},
+                client_type="async",
+            )
+            print(vars(openai_client))
+
+            assert openai_client.organization == "org-ikDc4ex8NB"
+
+            # bad org raises error
+
+            try:
+                response = await router.acompletion(
+                    model="openai-bad-org",
+                    messages=[{"role": "user", "content": "this is a test"}],
+                )
+                pytest.fail(
+                    "Request should have failed - This organization does not exist"
+                )
+            except Exception as e:
+                print("Got exception: " + str(e))
+                assert "No such organization: org-ikDc4ex8NB" in str(e)
+
+            # good org works
+            response = await router.acompletion(
+                model="openai-good-org",
+                messages=[{"role": "user", "content": "this is a test"}],
+                max_tokens=5,
+            )
+
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
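For quick reference outside the test suite, a deployment that pins an OpenAI organization is configured through `litellm_params`; the async path added above only swaps `router.completion` for `router.acompletion`. A minimal sketch — the organization id below is the deliberately invalid one used by the test fixture, so the call is expected to fail:

```python
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "openai-bad-org",
            "litellm_params": {
                "model": "gpt-3.5-turbo",
                "organization": "org-ikDc4ex8NB",  # invalid org id, matches the test fixture
            },
        }
    ]
)

try:
    router.completion(
        model="openai-bad-org",
        messages=[{"role": "user", "content": "this is a test"}],
    )
except Exception as e:
    # expected: "No such organization: org-ikDc4ex8NB..."
    print("Got exception:", e)
```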
litellm/tests/test_router_retries.py (new file, 121 lines)

@@ -0,0 +1,121 @@
+#### What this tests ####
+# This tests calling router with fallback models
+
+import sys, os, time
+import traceback, asyncio
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
+import litellm
+from litellm import Router
+from litellm.integrations.custom_logger import CustomLogger
+
+
+class MyCustomHandler(CustomLogger):
+    success: bool = False
+    failure: bool = False
+    previous_models: int = 0
+
+    def log_pre_api_call(self, model, messages, kwargs):
+        print(f"Pre-API Call")
+        print(
+            f"previous_models: {kwargs['litellm_params']['metadata'].get('previous_models', None)}"
+        )
+        self.previous_models = len(
+            kwargs["litellm_params"]["metadata"].get("previous_models", [])
+        )  # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
+        print(f"self.previous_models: {self.previous_models}")
+
+    def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
+        print(
+            f"Post-API Call - response object: {response_obj}; model: {kwargs['model']}"
+        )
+
+    def log_stream_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Stream")
+
+    def async_log_stream_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Stream")
+
+    def log_success_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Success")
+
+    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Success")
+
+    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Failure")
+
+
+"""
+Test sync + async
+
+- Authorization Errors
+- Random API Error
+"""
+
+
+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.parametrize("error_type", ["Authorization Error", "API Error"])
+@pytest.mark.asyncio
+async def test_router_retries_errors(sync_mode, error_type):
+    """
+    - Auth Error -> 0 retries
+    - API Error -> 2 retries
+    """
+
+    _api_key = (
+        "bad-key" if error_type == "Authorization Error" else os.getenv("AZURE_API_KEY")
+    )
+    print(f"_api_key: {_api_key}")
+    model_list = [
+        {
+            "model_name": "azure/gpt-3.5-turbo",  # openai model name
+            "litellm_params": {  # params for litellm completion/embedding call
+                "model": "azure/chatgpt-functioncalling",
+                "api_key": _api_key,
+                "api_version": os.getenv("AZURE_API_VERSION"),
+                "api_base": os.getenv("AZURE_API_BASE"),
+            },
+            "tpm": 240000,
+            "rpm": 1800,
+        },
+    ]
+
+    router = Router(model_list=model_list, allowed_fails=3)
+
+    customHandler = MyCustomHandler()
+    litellm.callbacks = [customHandler]
+    user_message = "Hello, how are you?"
+    messages = [{"content": user_message, "role": "user"}]
+
+    kwargs = {
+        "model": "azure/gpt-3.5-turbo",
+        "messages": messages,
+        "mock_response": (
+            None
+            if error_type == "Authorization Error"
+            else Exception("Invalid Request")
+        ),
+    }
+
+    try:
+        if sync_mode:
+            response = router.completion(**kwargs)
+        else:
+            response = await router.acompletion(**kwargs)
+    except Exception as e:
+        pass
+
+    await asyncio.sleep(
+        0.05
+    )  # allow a delay as success_callbacks are on a separate thread
+    print(f"customHandler.previous_models: {customHandler.previous_models}")
+
+    if error_type == "Authorization Error":
+        assert customHandler.previous_models == 0  # 0 retries
+    else:
+        assert customHandler.previous_models == 2  # 2 retries
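The new test file pins down the retry convention the router follows: an authentication failure is surfaced immediately (0 retries), while a generic API error gets the default 2 retries, with each failed attempt recorded in `previous_models`. A rough sketch of driving the API-error path without any network call, using the same `mock_response=Exception(...)` trick as the test (the key below is a placeholder):

```python
import asyncio
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "azure/gpt-3.5-turbo",
            "litellm_params": {
                "model": "azure/chatgpt-functioncalling",
                "api_key": "placeholder-key",  # never used, the response is mocked
            },
        }
    ],
    allowed_fails=3,
)


async def main():
    try:
        # mock_response=Exception(...) makes litellm raise instead of hitting the API,
        # which exercises the retry path (a generic API error should be retried twice)
        await router.acompletion(
            model="azure/gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hello, how are you?"}],
            mock_response=Exception("Invalid Request"),
        )
    except Exception as e:
        print("final error after retries:", e)


asyncio.run(main())
```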
@@ -57,6 +57,7 @@ def test_router_timeouts():
         redis_password=os.getenv("REDIS_PASSWORD"),
         redis_port=int(os.getenv("REDIS_PORT")),
         timeout=10,
+        num_retries=0,
     )

     print("***** TPM SETTINGS *****")
@@ -89,15 +90,15 @@ def test_router_timeouts():

 @pytest.mark.asyncio
 async def test_router_timeouts_bedrock():
-    import openai
+    import openai, uuid

     # Model list for OpenAI and Anthropic models
-    model_list = [
+    _model_list = [
         {
             "model_name": "bedrock",
             "litellm_params": {
                 "model": "bedrock/anthropic.claude-instant-v1",
-                "timeout": 0.001,
+                "timeout": 0.00001,
             },
             "tpm": 80000,
         },
@@ -105,17 +106,18 @@ async def test_router_timeouts_bedrock():

     # Configure router
     router = Router(
-        model_list=model_list,
+        model_list=_model_list,
         routing_strategy="usage-based-routing",
         debug_level="DEBUG",
         set_verbose=True,
+        num_retries=0,
     )

     litellm.set_verbose = True
     try:
         response = await router.acompletion(
             model="bedrock",
-            messages=[{"role": "user", "content": "hello, who are u"}],
+            messages=[{"role": "user", "content": f"hello, who are u {uuid.uuid4()}"}],
         )
         print(response)
         pytest.fail("Did not raise error `openai.APITimeoutError`")
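The bedrock test above combines a per-deployment `timeout` inside `litellm_params` with `num_retries=0`, so the call fails fast with `openai.APITimeoutError` instead of being retried. A condensed sketch of the same pattern (it assumes AWS credentials are configured, since only the timeout behaviour is exercised):

```python
import asyncio
import openai
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "bedrock",
            "litellm_params": {
                "model": "bedrock/anthropic.claude-instant-v1",
                "timeout": 0.00001,  # deliberately too small to ever succeed
            },
        }
    ],
    num_retries=0,  # fail immediately instead of retrying the timed-out call
)


async def main():
    try:
        await router.acompletion(
            model="bedrock",
            messages=[{"role": "user", "content": "hello, who are u"}],
        )
    except openai.APITimeoutError as e:
        print("timed out as expected:", e)


asyncio.run(main())
```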
@@ -518,7 +518,7 @@ async def test_acompletion_gemini_stream():
     litellm.set_verbose = True
     print("Streaming gemini response")
     messages = [
-        {"role": "system", "content": "You are a helpful assistant."},
+        # {"role": "system", "content": "You are a helpful assistant."},
         {
             "role": "user",
             "content": "What do you know?",
@@ -1271,6 +1271,33 @@ def test_completion_sagemaker_stream():
         pytest.fail(f"Error occurred: {e}")


+def test_completion_watsonx_stream():
+    litellm.set_verbose = True
+    try:
+        response = completion(
+            model="watsonx/ibm/granite-13b-chat-v2",
+            messages=messages,
+            temperature=0.5,
+            max_tokens=20,
+            stream=True,
+        )
+        complete_response = ""
+        has_finish_reason = False
+        # Add any assertions here to check the response
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finish_reason = finished
+            if finished:
+                break
+            complete_response += chunk
+        if has_finish_reason is False:
+            raise Exception("finish reason not set for last chunk")
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
 # test_completion_sagemaker_stream()


@@ -2446,6 +2473,34 @@ class ModelResponseIterator:
         return self.model_response


+class ModelResponseListIterator:
+    def __init__(self, model_responses):
+        self.model_responses = model_responses
+        self.index = 0
+
+    # Sync iterator
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.index >= len(self.model_responses):
+            raise StopIteration
+        model_response = self.model_responses[self.index]
+        self.index += 1
+        return model_response
+
+    # Async iterator
+    def __aiter__(self):
+        return self
+
+    async def __anext__(self):
+        if self.index >= len(self.model_responses):
+            raise StopAsyncIteration
+        model_response = self.model_responses[self.index]
+        self.index += 1
+        return model_response
+
+
 def test_unit_test_custom_stream_wrapper():
     """
     Test if last streaming chunk ends with '?', if the message repeats itself.
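`ModelResponseListIterator` lets streaming behaviour be unit-tested against a fixed list of chunks instead of a live provider; the large test added further down in this file wires such a list into `litellm.CustomStreamWrapper`. A shortened sketch of that wiring — real chunks carry populated `choices`/`delta` fields, and the two empty placeholders here only illustrate the plumbing:

```python
import time
import litellm

# stand-in chunks; the real test builds ModelResponse objects with StreamingChoices deltas
chunk_list = [
    litellm.ModelResponse(stream=True, id="chatcmpl-placeholder"),
    litellm.ModelResponse(stream=True, id="chatcmpl-placeholder"),
]

completion_stream = ModelResponseListIterator(model_responses=chunk_list)

response = litellm.CustomStreamWrapper(
    completion_stream=completion_stream,
    model="gpt-4-0613",
    custom_llm_provider="cached_response",
    logging_obj=litellm.Logging(
        model="gpt-4-0613",
        messages=[{"role": "user", "content": "Hey"}],
        stream=True,
        call_type="completion",
        start_time=time.time(),
        litellm_call_id="12345",
        function_id="1245",
    ),
)

for chunk in response:
    print(chunk)
```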
|
||||||
|
@ -2486,3 +2541,268 @@ def test_unit_test_custom_stream_wrapper():
|
||||||
if "How are you?" in chunk.choices[0].delta.content:
|
if "How are you?" in chunk.choices[0].delta.content:
|
||||||
freq += 1
|
freq += 1
|
||||||
assert freq == 1
|
assert freq == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_aamazing_unit_test_custom_stream_wrapper_n():
|
||||||
|
"""
|
||||||
|
Test if the translated output maps exactly to the received openai input
|
||||||
|
|
||||||
|
Relevant issue: https://github.com/BerriAI/litellm/issues/3276
|
||||||
|
"""
|
||||||
|
chunks = [
|
||||||
|
{
|
||||||
|
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
|
||||||
|
"object": "chat.completion.chunk",
|
||||||
|
"created": 1714075272,
|
||||||
|
"model": "gpt-4-0613",
|
||||||
|
"system_fingerprint": None,
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"delta": {"content": "It"},
|
||||||
|
"logprobs": {
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"token": "It",
|
||||||
|
"logprob": -1.5952516,
|
||||||
|
"bytes": [73, 116],
|
||||||
|
"top_logprobs": [
|
||||||
|
{
|
||||||
|
"token": "Brown",
|
||||||
|
"logprob": -0.7358765,
|
||||||
|
"bytes": [66, 114, 111, 119, 110],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"finish_reason": None,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
|
||||||
|
"object": "chat.completion.chunk",
|
||||||
|
"created": 1714075272,
|
||||||
|
"model": "gpt-4-0613",
|
||||||
|
"system_fingerprint": None,
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"index": 1,
|
||||||
|
"delta": {"content": "Brown"},
|
||||||
|
"logprobs": {
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"token": "Brown",
|
||||||
|
"logprob": -0.7358765,
|
||||||
|
"bytes": [66, 114, 111, 119, 110],
|
||||||
|
"top_logprobs": [
|
||||||
|
{
|
||||||
|
"token": "Brown",
|
||||||
|
"logprob": -0.7358765,
|
||||||
|
"bytes": [66, 114, 111, 119, 110],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"finish_reason": None,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
|
||||||
|
"object": "chat.completion.chunk",
|
||||||
|
"created": 1714075272,
|
||||||
|
"model": "gpt-4-0613",
|
||||||
|
"system_fingerprint": None,
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"delta": {"content": "'s"},
|
||||||
|
"logprobs": {
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"token": "'s",
|
||||||
|
"logprob": -0.006786893,
|
||||||
|
"bytes": [39, 115],
|
||||||
|
"top_logprobs": [
|
||||||
|
{
|
||||||
|
"token": "'s",
|
||||||
|
"logprob": -0.006786893,
|
||||||
|
"bytes": [39, 115],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"finish_reason": None,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
|
||||||
|
"object": "chat.completion.chunk",
|
||||||
|
"created": 1714075272,
|
||||||
|
"model": "gpt-4-0613",
|
||||||
|
"system_fingerprint": None,
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"delta": {"content": " impossible"},
|
||||||
|
"logprobs": {
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"token": " impossible",
|
||||||
|
"logprob": -0.06528423,
|
||||||
|
"bytes": [
|
||||||
|
32,
|
||||||
|
105,
|
||||||
|
109,
|
||||||
|
112,
|
||||||
|
111,
|
||||||
|
115,
|
||||||
|
115,
|
||||||
|
105,
|
||||||
|
98,
|
||||||
|
108,
|
||||||
|
101,
|
||||||
|
],
|
||||||
|
"top_logprobs": [
|
||||||
|
{
|
||||||
|
"token": " impossible",
|
||||||
|
"logprob": -0.06528423,
|
||||||
|
"bytes": [
|
||||||
|
32,
|
||||||
|
105,
|
||||||
|
109,
|
||||||
|
112,
|
||||||
|
111,
|
||||||
|
115,
|
||||||
|
115,
|
||||||
|
105,
|
||||||
|
98,
|
||||||
|
108,
|
||||||
|
101,
|
||||||
|
],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"finish_reason": None,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
|
||||||
|
"object": "chat.completion.chunk",
|
||||||
|
"created": 1714075272,
|
||||||
|
"model": "gpt-4-0613",
|
||||||
|
"system_fingerprint": None,
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"delta": {"content": "—even"},
|
||||||
|
"logprobs": {
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"token": "—even",
|
||||||
|
"logprob": -9999.0,
|
||||||
|
"bytes": [226, 128, 148, 101, 118, 101, 110],
|
||||||
|
"top_logprobs": [
|
||||||
|
{
|
||||||
|
"token": " to",
|
||||||
|
"logprob": -0.12302828,
|
||||||
|
"bytes": [32, 116, 111],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"finish_reason": None,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
|
||||||
|
"object": "chat.completion.chunk",
|
||||||
|
"created": 1714075272,
|
||||||
|
"model": "gpt-4-0613",
|
||||||
|
"system_fingerprint": None,
|
||||||
|
"choices": [
|
||||||
|
{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "length"}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
|
||||||
|
"object": "chat.completion.chunk",
|
||||||
|
"created": 1714075272,
|
||||||
|
"model": "gpt-4-0613",
|
||||||
|
"system_fingerprint": None,
|
||||||
|
"choices": [
|
||||||
|
{"index": 1, "delta": {}, "logprobs": None, "finish_reason": "stop"}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
litellm.set_verbose = True
|
||||||
|
|
||||||
|
chunk_list = []
|
||||||
|
for chunk in chunks:
|
||||||
|
new_chunk = litellm.ModelResponse(stream=True, id=chunk["id"])
|
||||||
|
if "choices" in chunk and isinstance(chunk["choices"], list):
|
||||||
|
print("INSIDE CHUNK CHOICES!")
|
||||||
|
new_choices = []
|
||||||
|
for choice in chunk["choices"]:
|
||||||
|
if isinstance(choice, litellm.utils.StreamingChoices):
|
||||||
|
_new_choice = choice
|
||||||
|
elif isinstance(choice, dict):
|
||||||
|
_new_choice = litellm.utils.StreamingChoices(**choice)
|
||||||
|
new_choices.append(_new_choice)
|
||||||
|
new_chunk.choices = new_choices
|
||||||
|
chunk_list.append(new_chunk)
|
||||||
|
|
||||||
|
completion_stream = ModelResponseListIterator(model_responses=chunk_list)
|
||||||
|
|
||||||
|
response = litellm.CustomStreamWrapper(
|
||||||
|
completion_stream=completion_stream,
|
||||||
|
model="gpt-4-0613",
|
||||||
|
custom_llm_provider="cached_response",
|
||||||
|
logging_obj=litellm.Logging(
|
||||||
|
model="gpt-4-0613",
|
||||||
|
messages=[{"role": "user", "content": "Hey"}],
|
||||||
|
stream=True,
|
||||||
|
call_type="completion",
|
||||||
|
start_time=time.time(),
|
||||||
|
litellm_call_id="12345",
|
||||||
|
function_id="1245",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
for idx, chunk in enumerate(response):
|
||||||
|
chunk_dict = {}
|
||||||
|
try:
|
||||||
|
chunk_dict = chunk.model_dump(exclude_none=True)
|
||||||
|
except:
|
||||||
|
chunk_dict = chunk.dict(exclude_none=True)
|
||||||
|
|
||||||
|
chunk_dict.pop("created")
|
||||||
|
chunks[idx].pop("created")
|
||||||
|
if chunks[idx]["system_fingerprint"] is None:
|
||||||
|
chunks[idx].pop("system_fingerprint", None)
|
||||||
|
if idx == 0:
|
||||||
|
for choice in chunk_dict["choices"]:
|
||||||
|
if "role" in choice["delta"]:
|
||||||
|
choice["delta"].pop("role")
|
||||||
|
|
||||||
|
for choice in chunks[idx]["choices"]:
|
||||||
|
# ignore finish reason None - since our pydantic object is set to exclude_none = true
|
||||||
|
if "finish_reason" in choice and choice["finish_reason"] is None:
|
||||||
|
choice.pop("finish_reason")
|
||||||
|
if "logprobs" in choice and choice["logprobs"] is None:
|
||||||
|
choice.pop("logprobs")
|
||||||
|
|
||||||
|
assert (
|
||||||
|
chunk_dict == chunks[idx]
|
||||||
|
), f"idx={idx} translated chunk = {chunk_dict} != openai chunk = {chunks[idx]}"
|
||||||
|
|
|
@@ -78,7 +78,8 @@ def test_hanging_request_azure():
                 "model_name": "openai-gpt",
                 "litellm_params": {"model": "gpt-3.5-turbo"},
             },
-        ]
+        ],
+        num_retries=0,
     )

     encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@@ -131,7 +132,8 @@ def test_hanging_request_openai():
                 "model_name": "openai-gpt",
                 "litellm_params": {"model": "gpt-3.5-turbo"},
             },
-        ]
+        ],
+        num_retries=0,
     )

     encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@@ -189,6 +191,7 @@ def test_timeout_streaming():
 # test_timeout_streaming()


+@pytest.mark.skip(reason="local test")
 def test_timeout_ollama():
     # this Will Raise a timeout
     import litellm
@@ -282,6 +282,64 @@ def test_router_skip_rate_limited_deployments():
         print(f"An exception occurred! {str(e)}")


+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_multiple_potential_deployments(sync_mode):
+    """
+    If multiple deployments have the same tpm value
+
+    call 5 times, test if deployments are shuffled.
+
+    -> prevents single deployment from being overloaded in high-concurrency scenario
+    """
+
+    model_list = [
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-turbo",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
+                "tpm": 1440,
+            },
+        },
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-turbo-2",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
+                "tpm": 1440,
+            },
+        },
+    ]
+    router = Router(
+        model_list=model_list,
+        routing_strategy="usage-based-routing-v2",
+        set_verbose=False,
+        num_retries=3,
+    )  # type: ignore
+
+    model_ids = set()
+    for _ in range(1000):
+        if sync_mode:
+            deployment = router.get_available_deployment(
+                model="azure-model",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}],
+            )
+        else:
+            deployment = await router.async_get_available_deployment(
+                model="azure-model",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}],
+            )
+
+        ## get id ##
+        id = deployment.get("model_info", {}).get("id")
+        model_ids.add(id)
+
+    assert len(model_ids) == 2
+
+
 def test_single_deployment_tpm_zero():
     import litellm
     import os
|
||||||
|
|
|
@@ -1,5 +1,5 @@
 from typing import List, Optional, Union, Dict, Tuple, Literal
+import httpx
 from pydantic import BaseModel, validator
 from .completion import CompletionRequest
 from .embedding import EmbeddingRequest
@@ -104,11 +104,13 @@ class LiteLLM_Params(BaseModel):
     api_key: Optional[str] = None
     api_base: Optional[str] = None
     api_version: Optional[str] = None
-    timeout: Optional[Union[float, str]] = None  # if str, pass in as os.environ/
+    timeout: Optional[Union[float, str, httpx.Timeout]] = (
+        None  # if str, pass in as os.environ/
+    )
     stream_timeout: Optional[Union[float, str]] = (
         None  # timeout when making stream=True calls, if str, pass in as os.environ/
     )
-    max_retries: int = 2  # follows openai default of 2
+    max_retries: Optional[int] = None
     organization: Optional[str] = None  # for openai orgs
     ## VERTEX AI ##
     vertex_project: Optional[str] = None
@@ -146,14 +148,13 @@ class LiteLLM_Params(BaseModel):
         args.pop("self", None)
         args.pop("params", None)
         args.pop("__class__", None)
-        if max_retries is None:
-            max_retries = 2
-        elif isinstance(max_retries, str):
+        if max_retries is not None and isinstance(max_retries, str):
             max_retries = int(max_retries)  # cast to int
         super().__init__(max_retries=max_retries, **args, **params)

     class Config:
         extra = "allow"
+        arbitrary_types_allowed = True

     def __contains__(self, key):
         # Define custom behavior for the 'in' operator
@@ -201,6 +202,9 @@ class updateDeployment(BaseModel):
     litellm_params: Optional[updateLiteLLMParams] = None
     model_info: Optional[ModelInfo] = None

+    class Config:
+        protected_namespaces = ()
+

 class Deployment(BaseModel):
     model_name: str
@@ -259,3 +263,4 @@ class RouterErrors(enum.Enum):
     """

     user_defined_ratelimit_error = "Deployment over user-defined ratelimit."
+    no_deployments_available = "No deployments available for selected model"
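With the widened annotation and `arbitrary_types_allowed = True`, `timeout` can now be an `httpx.Timeout` rather than only a float or env-var string, and `max_retries` stays `None` unless set explicitly. A small sketch of the new shape (field values are illustrative):

```python
import httpx
from litellm.types.router import LiteLLM_Params

params = LiteLLM_Params(
    model="azure/chatgpt-v-2",
    timeout=httpx.Timeout(timeout=300.0, connect=5.0),  # granular connect/read limits
)
print(params.timeout)      # the httpx.Timeout object is stored as-is
print(params.max_retries)  # None by default now, instead of the old default of 2
```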
|
||||||
|
|
litellm/utils.py (478 lines changed)

@@ -19,6 +19,7 @@ from functools import wraps
 import datetime, time
 import tiktoken
 import uuid
+from pydantic import BaseModel
 import aiohttp
 import textwrap
 import logging
@@ -69,6 +70,7 @@ from .integrations.langsmith import LangsmithLogger
 from .integrations.weights_biases import WeightsBiasesLogger
 from .integrations.custom_logger import CustomLogger
 from .integrations.langfuse import LangFuseLogger
+from .integrations.openmeter import OpenMeterLogger
 from .integrations.datadog import DataDogLogger
 from .integrations.prometheus import PrometheusLogger
 from .integrations.prometheus_services import PrometheusServicesLogger
@@ -105,7 +107,7 @@ try:
 except Exception as e:
     verbose_logger.debug(f"Exception import enterprise features {str(e)}")

-from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO
+from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO, Iterable
 from .caching import Cache
 from concurrent.futures import ThreadPoolExecutor

@@ -129,6 +131,7 @@ langsmithLogger = None
 weightsBiasesLogger = None
 customLogger = None
 langFuseLogger = None
+openMeterLogger = None
 dataDogLogger = None
 prometheusLogger = None
 dynamoLogger = None
||||||
|
@ -219,6 +222,61 @@ def map_finish_reason(
|
||||||
return finish_reason
|
return finish_reason
|
||||||
|
|
||||||
|
|
||||||
|
class TopLogprob(OpenAIObject):
|
||||||
|
token: str
|
||||||
|
"""The token."""
|
||||||
|
|
||||||
|
bytes: Optional[List[int]] = None
|
||||||
|
"""A list of integers representing the UTF-8 bytes representation of the token.
|
||||||
|
|
||||||
|
Useful in instances where characters are represented by multiple tokens and
|
||||||
|
their byte representations must be combined to generate the correct text
|
||||||
|
representation. Can be `null` if there is no bytes representation for the token.
|
||||||
|
"""
|
||||||
|
|
||||||
|
logprob: float
|
||||||
|
"""The log probability of this token, if it is within the top 20 most likely
|
||||||
|
tokens.
|
||||||
|
|
||||||
|
Otherwise, the value `-9999.0` is used to signify that the token is very
|
||||||
|
unlikely.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class ChatCompletionTokenLogprob(OpenAIObject):
|
||||||
|
token: str
|
||||||
|
"""The token."""
|
||||||
|
|
||||||
|
bytes: Optional[List[int]] = None
|
||||||
|
"""A list of integers representing the UTF-8 bytes representation of the token.
|
||||||
|
|
||||||
|
Useful in instances where characters are represented by multiple tokens and
|
||||||
|
their byte representations must be combined to generate the correct text
|
||||||
|
representation. Can be `null` if there is no bytes representation for the token.
|
||||||
|
"""
|
||||||
|
|
||||||
|
logprob: float
|
||||||
|
"""The log probability of this token, if it is within the top 20 most likely
|
||||||
|
tokens.
|
||||||
|
|
||||||
|
Otherwise, the value `-9999.0` is used to signify that the token is very
|
||||||
|
unlikely.
|
||||||
|
"""
|
||||||
|
|
||||||
|
top_logprobs: List[TopLogprob]
|
||||||
|
"""List of the most likely tokens and their log probability, at this token
|
||||||
|
position.
|
||||||
|
|
||||||
|
In rare cases, there may be fewer than the number of requested `top_logprobs`
|
||||||
|
returned.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class ChoiceLogprobs(OpenAIObject):
|
||||||
|
content: Optional[List[ChatCompletionTokenLogprob]] = None
|
||||||
|
"""A list of message content tokens with log probability information."""
|
||||||
|
|
||||||
|
|
||||||
class FunctionCall(OpenAIObject):
|
class FunctionCall(OpenAIObject):
|
||||||
arguments: str
|
arguments: str
|
||||||
name: Optional[str] = None
|
name: Optional[str] = None
|
||||||
|
@ -320,19 +378,19 @@ class Message(OpenAIObject):
|
||||||
super(Message, self).__init__(**params)
|
super(Message, self).__init__(**params)
|
||||||
self.content = content
|
self.content = content
|
||||||
self.role = role
|
self.role = role
|
||||||
|
self.tool_calls = None
|
||||||
|
self.function_call = None
|
||||||
|
|
||||||
if function_call is not None:
|
if function_call is not None:
|
||||||
self.function_call = FunctionCall(**function_call)
|
self.function_call = FunctionCall(**function_call)
|
||||||
|
|
||||||
if tool_calls is not None:
|
if tool_calls is not None:
|
||||||
self.tool_calls = []
|
self.tool_calls = [
|
||||||
for tool_call in tool_calls:
|
ChatCompletionMessageToolCall(**tool_call) for tool_call in tool_calls
|
||||||
if isinstance(tool_call, dict):
|
]
|
||||||
self.tool_calls.append(ChatCompletionMessageToolCall(**tool_call))
|
|
||||||
else:
|
|
||||||
self.tool_calls.append(tool_call)
|
|
||||||
|
|
||||||
if logprobs is not None:
|
if logprobs is not None:
|
||||||
self._logprobs = logprobs
|
self._logprobs = ChoiceLogprobs(**logprobs)
|
||||||
|
|
||||||
def get(self, key, default=None):
|
def get(self, key, default=None):
|
||||||
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
|
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
|
||||||
|
@ -355,12 +413,20 @@ class Message(OpenAIObject):
|
||||||
|
|
||||||
|
|
||||||
class Delta(OpenAIObject):
|
class Delta(OpenAIObject):
|
||||||
|
tool_calls: Optional[List[ChatCompletionDeltaToolCall]] = None
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, content=None, role=None, function_call=None, tool_calls=None, **params
|
self,
|
||||||
|
content=None,
|
||||||
|
role=None,
|
||||||
|
function_call=None,
|
||||||
|
tool_calls=None,
|
||||||
|
**params,
|
||||||
):
|
):
|
||||||
super(Delta, self).__init__(**params)
|
super(Delta, self).__init__(**params)
|
||||||
self.content = content
|
self.content = content
|
||||||
self.role = role
|
self.role = role
|
||||||
|
|
||||||
if function_call is not None and isinstance(function_call, dict):
|
if function_call is not None and isinstance(function_call, dict):
|
||||||
self.function_call = FunctionCall(**function_call)
|
self.function_call = FunctionCall(**function_call)
|
||||||
else:
|
else:
|
||||||
|
@ -410,7 +476,7 @@ class Choices(OpenAIObject):
|
||||||
) # set finish_reason for all responses
|
) # set finish_reason for all responses
|
||||||
self.index = index
|
self.index = index
|
||||||
if message is None:
|
if message is None:
|
||||||
self.message = Message(content=None)
|
self.message = Message()
|
||||||
else:
|
else:
|
||||||
if isinstance(message, Message):
|
if isinstance(message, Message):
|
||||||
self.message = message
|
self.message = message
|
||||||
|
@ -492,7 +558,11 @@ class StreamingChoices(OpenAIObject):
|
||||||
self.delta = Delta()
|
self.delta = Delta()
|
||||||
if enhancements is not None:
|
if enhancements is not None:
|
||||||
self.enhancements = enhancements
|
self.enhancements = enhancements
|
||||||
self.logprobs = logprobs
|
|
||||||
|
if logprobs is not None and isinstance(logprobs, dict):
|
||||||
|
self.logprobs = ChoiceLogprobs(**logprobs)
|
||||||
|
else:
|
||||||
|
self.logprobs = logprobs # type: ignore
|
||||||
|
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
# Define custom behavior for the 'in' operator
|
# Define custom behavior for the 'in' operator
|
||||||
|
@@ -1139,6 +1209,13 @@ class Logging:
             if verbose_logger.level == 0:
                 # this means verbose logger was not switched on - user is in litellm.set_verbose=True
                 print_verbose(f"\033[92m{curl_command}\033[0m\n")
+
+            if litellm.json_logs:
+                verbose_logger.info(
+                    "POST Request Sent from LiteLLM",
+                    extra={"api_base": {api_base}, **masked_headers},
+                )
+            else:
                 verbose_logger.info(f"\033[92m{curl_command}\033[0m\n")
             if self.logger_fn and callable(self.logger_fn):
                 try:
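This branch is what `litellm.json_logs` toggles: instead of printing the colourised curl command, the pre-call log is emitted as a structured record carrying the API base and the masked request headers. A small usage sketch (the call is mocked so no key is needed):

```python
import litellm

litellm.json_logs = True     # emit the pre-call log as structured fields
litellm.set_verbose = True   # enable verbose logging

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    mock_response="hello",
)
print(response.choices[0].message.content)
```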
|
||||||
|
@ -1149,7 +1226,6 @@ class Logging:
|
||||||
print_verbose(
|
print_verbose(
|
||||||
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
|
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
|
# Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
|
||||||
callbacks = litellm.input_callback + self.dynamic_input_callbacks
|
callbacks = litellm.input_callback + self.dynamic_input_callbacks
|
||||||
for callback in callbacks:
|
for callback in callbacks:
|
||||||
|
@ -1166,29 +1242,20 @@ class Logging:
|
||||||
litellm_call_id=self.litellm_params["litellm_call_id"],
|
litellm_call_id=self.litellm_params["litellm_call_id"],
|
||||||
print_verbose=print_verbose,
|
print_verbose=print_verbose,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif callback == "lite_debugger":
|
|
||||||
print_verbose(
|
|
||||||
f"reaches litedebugger for logging! - model_call_details {self.model_call_details}"
|
|
||||||
)
|
|
||||||
model = self.model_call_details["model"]
|
|
||||||
messages = self.model_call_details["input"]
|
|
||||||
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
|
|
||||||
liteDebuggerClient.input_log_event(
|
|
||||||
model=model,
|
|
||||||
messages=messages,
|
|
||||||
end_user=self.model_call_details.get("user", "default"),
|
|
||||||
litellm_call_id=self.litellm_params["litellm_call_id"],
|
|
||||||
litellm_params=self.model_call_details["litellm_params"],
|
|
||||||
optional_params=self.model_call_details["optional_params"],
|
|
||||||
print_verbose=print_verbose,
|
|
||||||
call_type=self.call_type,
|
|
||||||
)
|
|
||||||
elif callback == "sentry" and add_breadcrumb:
|
elif callback == "sentry" and add_breadcrumb:
|
||||||
print_verbose("reaches sentry breadcrumbing")
|
try:
|
||||||
|
details_to_log = copy.deepcopy(self.model_call_details)
|
||||||
|
except:
|
||||||
|
details_to_log = self.model_call_details
|
||||||
|
if litellm.turn_off_message_logging:
|
||||||
|
# make a copy of the _model_Call_details and log it
|
||||||
|
details_to_log.pop("messages", None)
|
||||||
|
details_to_log.pop("input", None)
|
||||||
|
details_to_log.pop("prompt", None)
|
||||||
|
|
||||||
add_breadcrumb(
|
add_breadcrumb(
|
||||||
category="litellm.llm_call",
|
category="litellm.llm_call",
|
||||||
message=f"Model Call Details pre-call: {self.model_call_details}",
|
message=f"Model Call Details pre-call: {details_to_log}",
|
||||||
level="info",
|
level="info",
|
||||||
)
|
)
|
||||||
elif isinstance(callback, CustomLogger): # custom logger class
|
elif isinstance(callback, CustomLogger): # custom logger class
|
||||||
|
@ -1252,7 +1319,7 @@ class Logging:
|
||||||
print_verbose(
|
print_verbose(
|
||||||
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
|
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
|
||||||
)
|
)
|
||||||
|
self.redact_message_input_output_from_logging(result=original_response)
|
||||||
# Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
|
# Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
|
||||||
|
|
||||||
callbacks = litellm.input_callback + self.dynamic_input_callbacks
|
callbacks = litellm.input_callback + self.dynamic_input_callbacks
|
||||||
|
@ -1270,9 +1337,19 @@ class Logging:
|
||||||
)
|
)
|
||||||
elif callback == "sentry" and add_breadcrumb:
|
elif callback == "sentry" and add_breadcrumb:
|
||||||
print_verbose("reaches sentry breadcrumbing")
|
print_verbose("reaches sentry breadcrumbing")
|
||||||
|
try:
|
||||||
|
details_to_log = copy.deepcopy(self.model_call_details)
|
||||||
|
except:
|
||||||
|
details_to_log = self.model_call_details
|
||||||
|
if litellm.turn_off_message_logging:
|
||||||
|
# make a copy of the _model_Call_details and log it
|
||||||
|
details_to_log.pop("messages", None)
|
||||||
|
details_to_log.pop("input", None)
|
||||||
|
details_to_log.pop("prompt", None)
|
||||||
|
|
||||||
add_breadcrumb(
|
add_breadcrumb(
|
||||||
category="litellm.llm_call",
|
category="litellm.llm_call",
|
||||||
message=f"Model Call Details post-call: {self.model_call_details}",
|
message=f"Model Call Details post-call: {details_to_log}",
|
||||||
level="info",
|
level="info",
|
||||||
)
|
)
|
||||||
elif isinstance(callback, CustomLogger): # custom logger class
|
elif isinstance(callback, CustomLogger): # custom logger class
|
||||||
|
@ -1464,6 +1541,8 @@ class Logging:
|
||||||
else:
|
else:
|
||||||
callbacks = litellm.success_callback
|
callbacks = litellm.success_callback
|
||||||
|
|
||||||
|
self.redact_message_input_output_from_logging(result=result)
|
||||||
|
|
||||||
for callback in callbacks:
|
for callback in callbacks:
|
||||||
try:
|
try:
|
||||||
litellm_params = self.model_call_details.get("litellm_params", {})
|
litellm_params = self.model_call_details.get("litellm_params", {})
|
||||||
|
@ -1850,6 +1929,51 @@ class Logging:
|
||||||
end_time=end_time,
|
end_time=end_time,
|
||||||
print_verbose=print_verbose,
|
print_verbose=print_verbose,
|
||||||
)
|
)
|
||||||
|
if (
|
||||||
|
callback == "openmeter"
|
||||||
|
and self.model_call_details.get("litellm_params", {}).get(
|
||||||
|
"acompletion", False
|
||||||
|
)
|
||||||
|
== False
|
||||||
|
and self.model_call_details.get("litellm_params", {}).get(
|
||||||
|
"aembedding", False
|
||||||
|
)
|
||||||
|
== False
|
||||||
|
and self.model_call_details.get("litellm_params", {}).get(
|
||||||
|
"aimage_generation", False
|
||||||
|
)
|
||||||
|
== False
|
||||||
|
and self.model_call_details.get("litellm_params", {}).get(
|
||||||
|
"atranscription", False
|
||||||
|
)
|
||||||
|
== False
|
||||||
|
):
|
||||||
|
global openMeterLogger
|
||||||
|
if openMeterLogger is None:
|
||||||
|
print_verbose("Instantiates openmeter client")
|
||||||
|
openMeterLogger = OpenMeterLogger()
|
||||||
|
if self.stream and complete_streaming_response is None:
|
||||||
|
openMeterLogger.log_stream_event(
|
||||||
|
kwargs=self.model_call_details,
|
||||||
|
response_obj=result,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if self.stream and complete_streaming_response:
|
||||||
|
self.model_call_details["complete_response"] = (
|
||||||
|
self.model_call_details.get(
|
||||||
|
"complete_streaming_response", {}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
result = self.model_call_details["complete_response"]
|
||||||
|
openMeterLogger.log_success_event(
|
||||||
|
kwargs=self.model_call_details,
|
||||||
|
response_obj=result,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
isinstance(callback, CustomLogger)
|
isinstance(callback, CustomLogger)
|
||||||
and self.model_call_details.get("litellm_params", {}).get(
|
and self.model_call_details.get("litellm_params", {}).get(
|
||||||
|
@ -2008,7 +2132,9 @@ class Logging:
|
||||||
callbacks.append(callback)
|
callbacks.append(callback)
|
||||||
else:
|
else:
|
||||||
callbacks = litellm._async_success_callback
|
callbacks = litellm._async_success_callback
|
||||||
print_verbose(f"Async success callbacks: {callbacks}")
|
|
||||||
|
self.redact_message_input_output_from_logging(result=result)
|
||||||
|
|
||||||
for callback in callbacks:
|
for callback in callbacks:
|
||||||
# check if callback can run for this request
|
# check if callback can run for this request
|
||||||
litellm_params = self.model_call_details.get("litellm_params", {})
|
litellm_params = self.model_call_details.get("litellm_params", {})
|
||||||
|
@ -2046,6 +2172,35 @@ class Logging:
|
||||||
await litellm.cache.async_add_cache(result, **kwargs)
|
await litellm.cache.async_add_cache(result, **kwargs)
|
||||||
else:
|
else:
|
||||||
litellm.cache.add_cache(result, **kwargs)
|
litellm.cache.add_cache(result, **kwargs)
|
||||||
|
if callback == "openmeter":
|
||||||
|
global openMeterLogger
|
||||||
|
if self.stream == True:
|
||||||
|
if (
|
||||||
|
"async_complete_streaming_response"
|
||||||
|
in self.model_call_details
|
||||||
|
):
|
||||||
|
await openMeterLogger.async_log_success_event(
|
||||||
|
kwargs=self.model_call_details,
|
||||||
|
response_obj=self.model_call_details[
|
||||||
|
"async_complete_streaming_response"
|
||||||
|
],
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
await openMeterLogger.async_log_stream_event( # [TODO]: move this to being an async log stream event function
|
||||||
|
kwargs=self.model_call_details,
|
||||||
|
response_obj=result,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
await openMeterLogger.async_log_success_event(
|
||||||
|
kwargs=self.model_call_details,
|
||||||
|
response_obj=result,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
)
|
||||||
if isinstance(callback, CustomLogger): # custom logger class
|
if isinstance(callback, CustomLogger): # custom logger class
|
||||||
if self.stream == True:
|
if self.stream == True:
|
||||||
if (
|
if (
|
||||||
|
@ -2169,7 +2324,10 @@ class Logging:
|
||||||
start_time=start_time,
|
start_time=start_time,
|
||||||
end_time=end_time,
|
end_time=end_time,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = None # result sent to all loggers, init this to None incase it's not created
|
result = None # result sent to all loggers, init this to None incase it's not created
|
||||||
|
|
||||||
|
self.redact_message_input_output_from_logging(result=result)
|
||||||
for callback in litellm.failure_callback:
|
for callback in litellm.failure_callback:
|
||||||
try:
|
try:
|
||||||
if callback == "lite_debugger":
|
if callback == "lite_debugger":
|
||||||
|
@@ -2354,6 +2512,39 @@ class Logging:
 f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}"
 )

+def redact_message_input_output_from_logging(self, result):
+"""
+Removes messages, prompts, input, response from logging. This modifies the data in-place
+only redacts when litellm.turn_off_message_logging == True
+"""
+# check if user opted out of logging message/response to callbacks
+if litellm.turn_off_message_logging == True:
+# remove messages, prompts, input, response from logging
+self.model_call_details["messages"] = "redacted-by-litellm"
+self.model_call_details["prompt"] = ""
+self.model_call_details["input"] = ""
+
+# response cleaning
+# ChatCompletion Responses
+if self.stream and "complete_streaming_response" in self.model_call_details:
+_streaming_response = self.model_call_details[
+"complete_streaming_response"
+]
+for choice in _streaming_response.choices:
+if isinstance(choice, litellm.Choices):
+choice.message.content = "redacted-by-litellm"
+elif isinstance(choice, litellm.utils.StreamingChoices):
+choice.delta.content = "redacted-by-litellm"
+else:
+if result is not None:
+if isinstance(result, litellm.ModelResponse):
+if hasattr(result, "choices") and result.choices is not None:
+for choice in result.choices:
+if isinstance(choice, litellm.Choices):
+choice.message.content = "redacted-by-litellm"
+elif isinstance(choice, litellm.utils.StreamingChoices):
+choice.delta.content = "redacted-by-litellm"
+
+
 def exception_logging(
 additional_args={},
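The new `redact_message_input_output_from_logging` helper only runs when `litellm.turn_off_message_logging` is set, replacing message and response content with `"redacted-by-litellm"` before callbacks see it. A minimal sketch (placeholder model):

```python
import litellm

# opt out of logging message/response content to callbacks,
# as implemented by redact_message_input_output_from_logging above
litellm.turn_off_message_logging = True

response = litellm.completion(
    model="gpt-3.5-turbo",  # placeholder model
    messages=[{"role": "user", "content": "this prompt should not reach my logging provider"}],
)
# any configured logger now receives "redacted-by-litellm" instead of the real content
```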
@@ -2436,7 +2627,7 @@ class Rules:
 ####### CLIENT ###################
 # make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking
 def function_setup(
-original_function, rules_obj, start_time, *args, **kwargs
+original_function: str, rules_obj, start_time, *args, **kwargs
 ): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc.
 try:
 global callback_list, add_breadcrumb, user_logger_fn, Logging
@@ -2460,10 +2651,12 @@ def function_setup(
 len(litellm.input_callback) > 0
 or len(litellm.success_callback) > 0
 or len(litellm.failure_callback) > 0
-) and len(callback_list) == 0:
+) and len(
+callback_list # type: ignore
+) == 0: # type: ignore
 callback_list = list(
 set(
-litellm.input_callback
+litellm.input_callback # type: ignore
 + litellm.success_callback
 + litellm.failure_callback
 )
@@ -2472,7 +2665,7 @@ def function_setup(
 ## ASYNC CALLBACKS
 if len(litellm.input_callback) > 0:
 removed_async_items = []
-for index, callback in enumerate(litellm.input_callback):
+for index, callback in enumerate(litellm.input_callback): # type: ignore
 if inspect.iscoroutinefunction(callback):
 litellm._async_input_callback.append(callback)
 removed_async_items.append(index)
@@ -2483,11 +2676,11 @@ def function_setup(

 if len(litellm.success_callback) > 0:
 removed_async_items = []
-for index, callback in enumerate(litellm.success_callback):
+for index, callback in enumerate(litellm.success_callback): # type: ignore
 if inspect.iscoroutinefunction(callback):
 litellm._async_success_callback.append(callback)
 removed_async_items.append(index)
-elif callback == "dynamodb":
+elif callback == "dynamodb" or callback == "openmeter":
 # dynamo is an async callback, it's used for the proxy and needs to be async
 # we only support async dynamo db logging for acompletion/aembedding since that's used on proxy
 litellm._async_success_callback.append(callback)
@@ -2499,7 +2692,7 @@ def function_setup(

 if len(litellm.failure_callback) > 0:
 removed_async_items = []
-for index, callback in enumerate(litellm.failure_callback):
+for index, callback in enumerate(litellm.failure_callback): # type: ignore
 if inspect.iscoroutinefunction(callback):
 litellm._async_failure_callback.append(callback)
 removed_async_items.append(index)
@@ -2533,16 +2726,26 @@ def function_setup(
 dynamic_success_callbacks = kwargs.pop("success_callback")

 if add_breadcrumb:
+try:
+details_to_log = copy.deepcopy(kwargs)
+except:
+details_to_log = kwargs
+
+if litellm.turn_off_message_logging:
+# make a copy of the _model_Call_details and log it
+details_to_log.pop("messages", None)
+details_to_log.pop("input", None)
+details_to_log.pop("prompt", None)
 add_breadcrumb(
 category="litellm.llm_call",
-message=f"Positional Args: {args}, Keyword Args: {kwargs}",
+message=f"Positional Args: {args}, Keyword Args: {details_to_log}",
 level="info",
 )
 if "logger_fn" in kwargs:
 user_logger_fn = kwargs["logger_fn"]
 # INIT LOGGER - for user-specified integrations
 model = args[0] if len(args) > 0 else kwargs.get("model", None)
-call_type = original_function.__name__
+call_type = original_function
 if (
 call_type == CallTypes.completion.value
 or call_type == CallTypes.acompletion.value
@@ -2724,7 +2927,7 @@ def client(original_function):
 try:
 if logging_obj is None:
 logging_obj, kwargs = function_setup(
-original_function, rules_obj, start_time, *args, **kwargs
+original_function.__name__, rules_obj, start_time, *args, **kwargs
 )
 kwargs["litellm_logging_obj"] = logging_obj

@@ -3033,7 +3236,7 @@ def client(original_function):
 try:
 if logging_obj is None:
 logging_obj, kwargs = function_setup(
-original_function, rules_obj, start_time, *args, **kwargs
+original_function.__name__, rules_obj, start_time, *args, **kwargs
 )
 kwargs["litellm_logging_obj"] = logging_obj

@@ -3540,12 +3743,12 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
 a100_80gb_price_per_second_public = (
 0.001400 # assume all calls sent to A100 80GB for now
 )
-if total_time == 0.0:
+if total_time == 0.0: # total time is in ms
 start_time = completion_response["created"]
 end_time = completion_response["ended"]
 total_time = end_time - start_time

-return a100_80gb_price_per_second_public * total_time
+return a100_80gb_price_per_second_public * total_time / 1000


 def _select_tokenizer(model: str):
@@ -3567,7 +3770,7 @@ def _select_tokenizer(model: str):
 tokenizer = Tokenizer.from_str(json_str)
 return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
 # llama2
-elif "llama-2" in model.lower():
+elif "llama-2" in model.lower() or "replicate" in model.lower():
 tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
 return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
 # default - tiktoken
@@ -4168,7 +4371,10 @@ def completion_cost(
 model = get_model_params_and_category(model)
 # replicate llms are calculate based on time for request running
 # see https://replicate.com/pricing
-elif model in litellm.replicate_models or "replicate" in model:
+elif (
+model in litellm.replicate_models or "replicate" in model
+) and model not in litellm.model_cost:
+# for unmapped replicate model, default to replicate's time tracking logic
 return get_replicate_completion_pricing(completion_response, total_time)

 (
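The `get_replicate_completion_pricing` change clarifies that `total_time` is measured in milliseconds, so the per-second A100 price has to be divided by 1000. A standalone sanity check of the arithmetic (the function name here is illustrative):

```python
def replicate_time_based_cost(total_time_ms: float, price_per_second: float = 0.001400) -> float:
    # mirrors the corrected formula: price_per_second * total_time / 1000,
    # where total_time is in milliseconds
    return price_per_second * total_time_ms / 1000


# a 30 second generation reported as 30_000 ms
print(replicate_time_based_cost(30_000))  # 0.042, instead of 42.0 with the old formula
```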
@@ -4554,7 +4760,36 @@ def get_optional_params(
 k.startswith("vertex_") and custom_llm_provider != "vertex_ai"
 ): # allow dynamically setting vertex ai init logic
 continue

 passed_params[k] = v

+optional_params = {}
+
+common_auth_dict = litellm.common_cloud_provider_auth_params
+if custom_llm_provider in common_auth_dict["providers"]:
+"""
+Check if params = ["project", "region_name", "token"]
+and correctly translate for = ["azure", "vertex_ai", "watsonx", "aws"]
+"""
+if custom_llm_provider == "azure":
+optional_params = litellm.AzureOpenAIConfig().map_special_auth_params(
+non_default_params=passed_params, optional_params=optional_params
+)
+elif custom_llm_provider == "bedrock":
+optional_params = (
+litellm.AmazonBedrockGlobalConfig().map_special_auth_params(
+non_default_params=passed_params, optional_params=optional_params
+)
+)
+elif custom_llm_provider == "vertex_ai":
+optional_params = litellm.VertexAIConfig().map_special_auth_params(
+non_default_params=passed_params, optional_params=optional_params
+)
+elif custom_llm_provider == "watsonx":
+optional_params = litellm.IBMWatsonXAIConfig().map_special_auth_params(
+non_default_params=passed_params, optional_params=optional_params
+)
+
 default_params = {
 "functions": None,
 "function_call": None,
@@ -4590,7 +4825,7 @@ def get_optional_params(
 and v != default_params[k]
 )
 }
-optional_params = {}
 ## raise exception if function calling passed in for a provider that doesn't support it
 if (
 "functions" in non_default_params
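The new `common_cloud_provider_auth_params` branch lets a caller pass generic `project` / `region_name` / `token` arguments and have each provider config translate them via `map_special_auth_params`. A hedged sketch of the intended call shape, assuming these kwargs are forwarded to `get_optional_params` unchanged; the model id and values are placeholders:

```python
import litellm

response = litellm.completion(
    model="watsonx/ibm/granite-13b-chat-v2",  # placeholder watsonx model id
    messages=[{"role": "user", "content": "Hello"}],
    # generic cloud auth params, translated per provider by the hunk above
    project="my-project-id",
    region_name="us-south",
    token="my-iam-token",
)
```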
@@ -5268,7 +5503,8 @@ def get_optional_params(
 optional_params["tools"] = tools
 if tool_choice is not None:
 optional_params["tool_choice"] = tool_choice
+if response_format is not None:
+optional_params["response_format"] = response_format
 # check safe_mode, random_seed: https://docs.mistral.ai/api/#operation/createChatCompletion
 safe_mode = passed_params.pop("safe_mode", None)
 random_seed = passed_params.pop("random_seed", None)
@@ -5280,6 +5516,7 @@ def get_optional_params(
 optional_params["extra_body"] = (
 extra_body # openai client supports `extra_body` param
 )

 elif custom_llm_provider == "groq":
 supported_params = get_supported_openai_params(
 model=model, custom_llm_provider=custom_llm_provider
@@ -5360,6 +5597,49 @@ def get_optional_params(
 optional_params["extra_body"] = (
 extra_body # openai client supports `extra_body` param
 )
+elif custom_llm_provider == "watsonx":
+supported_params = get_supported_openai_params(
+model=model, custom_llm_provider=custom_llm_provider
+)
+_check_valid_arg(supported_params=supported_params)
+if max_tokens is not None:
+optional_params["max_new_tokens"] = max_tokens
+if stream:
+optional_params["stream"] = stream
+if temperature is not None:
+optional_params["temperature"] = temperature
+if top_p is not None:
+optional_params["top_p"] = top_p
+if frequency_penalty is not None:
+optional_params["repetition_penalty"] = frequency_penalty
+if seed is not None:
+optional_params["random_seed"] = seed
+if stop is not None:
+optional_params["stop_sequences"] = stop
+
+# WatsonX-only parameters
+extra_body = {}
+if "decoding_method" in passed_params:
+extra_body["decoding_method"] = passed_params.pop("decoding_method")
+if "min_tokens" in passed_params or "min_new_tokens" in passed_params:
+extra_body["min_new_tokens"] = passed_params.pop(
+"min_tokens", passed_params.pop("min_new_tokens")
+)
+if "top_k" in passed_params:
+extra_body["top_k"] = passed_params.pop("top_k")
+if "truncate_input_tokens" in passed_params:
+extra_body["truncate_input_tokens"] = passed_params.pop(
+"truncate_input_tokens"
+)
+if "length_penalty" in passed_params:
+extra_body["length_penalty"] = passed_params.pop("length_penalty")
+if "time_limit" in passed_params:
+extra_body["time_limit"] = passed_params.pop("time_limit")
+if "return_options" in passed_params:
+extra_body["return_options"] = passed_params.pop("return_options")
+optional_params["extra_body"] = (
+extra_body # openai client supports `extra_body` param
+)
 else: # assume passing in params for openai/azure openai
 print_verbose(
 f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE - model={model}, custom_llm_provider={custom_llm_provider}"
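For `watsonx`, OpenAI-style arguments are remapped (`max_tokens` → `max_new_tokens`, `frequency_penalty` → `repetition_penalty`, `seed` → `random_seed`, `stop` → `stop_sequences`) and watsonx-only options are collected into `extra_body`. A sketch that exercises both paths; the model id and parameter values are placeholders:

```python
import litellm

response = litellm.completion(
    model="watsonx/ibm/granite-13b-chat-v2",  # placeholder model id
    messages=[{"role": "user", "content": "Summarize watsonx.ai in one sentence."}],
    max_tokens=200,        # mapped to max_new_tokens
    temperature=0.2,
    stop=["\n\n"],         # mapped to stop_sequences
    # watsonx-only parameters, routed into extra_body by the hunk above
    decoding_method="greedy",
    truncate_input_tokens=2048,
)
print(response.choices[0].message.content)
```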
@@ -5762,6 +6042,8 @@ def get_supported_openai_params(model: str, custom_llm_provider: str):
 "frequency_penalty",
 "presence_penalty",
 ]
+elif custom_llm_provider == "watsonx":
+return litellm.IBMWatsonXAIConfig().get_supported_openai_params()


 def get_formatted_prompt(
@@ -5989,6 +6271,8 @@ def get_llm_provider(
 model in litellm.bedrock_models or model in litellm.bedrock_embedding_models
 ):
 custom_llm_provider = "bedrock"
+elif model in litellm.watsonx_models:
+custom_llm_provider = "watsonx"
 # openai embeddings
 elif model in litellm.open_ai_embedding_models:
 custom_llm_provider = "openai"
@@ -6453,7 +6737,7 @@ def validate_environment(model: Optional[str] = None) -> dict:
 if "VERTEXAI_PROJECT" in os.environ and "VERTEXAI_LOCATION" in os.environ:
 keys_in_environment = True
 else:
-missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_PROJECT"])
+missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_LOCATION"])
 elif custom_llm_provider == "huggingface":
 if "HUGGINGFACE_API_KEY" in os.environ:
 keys_in_environment = True
@@ -6579,11 +6863,11 @@ def validate_environment(model: Optional[str] = None) -> dict:

 def set_callbacks(callback_list, function_id=None):

-global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger
+global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger

 try:
 for callback in callback_list:
-print_verbose(f"callback: {callback}")
+print_verbose(f"init callback list: {callback}")
 if callback == "sentry":
 try:
 import sentry_sdk
@@ -6646,6 +6930,8 @@ def set_callbacks(callback_list, function_id=None):
 promptLayerLogger = PromptLayerLogger()
 elif callback == "langfuse":
 langFuseLogger = LangFuseLogger()
+elif callback == "openmeter":
+openMeterLogger = OpenMeterLogger()
 elif callback == "datadog":
 dataDogLogger = DataDogLogger()
 elif callback == "prometheus":
@@ -6982,6 +7268,7 @@ def convert_to_model_response_object(
 end_time=None,
 hidden_params: Optional[dict] = None,
 ):
+received_args = locals()
 try:
 if response_type == "completion" and (
 model_response_object is None
@@ -6993,6 +7280,11 @@ def convert_to_model_response_object(
 # for returning cached responses, we need to yield a generator
 return convert_to_streaming_response(response_object=response_object)
 choice_list = []
+
+assert response_object["choices"] is not None and isinstance(
+response_object["choices"], Iterable
+)
+
 for idx, choice in enumerate(response_object["choices"]):
 message = Message(
 content=choice["message"].get("content", None),
@@ -7036,6 +7328,7 @@ def convert_to_model_response_object(
 model_response_object.model = response_object["model"]

 if start_time is not None and end_time is not None:
+if isinstance(start_time, type(end_time)):
 model_response_object._response_ms = ( # type: ignore
 end_time - start_time
 ).total_seconds() * 1000
@@ -7113,7 +7406,9 @@ def convert_to_model_response_object(
 model_response_object._hidden_params = hidden_params
 return model_response_object
 except Exception as e:
-raise Exception(f"Invalid response object {traceback.format_exc()}")
+raise Exception(
+f"Invalid response object {traceback.format_exc()}\n\nreceived_args={received_args}"
+)


 def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call
@@ -7940,7 +8235,10 @@ def exception_type(
 llm_provider="vertex_ai",
 response=original_exception.response,
 )
-elif "None Unknown Error." in error_str:
+elif (
+"None Unknown Error." in error_str
+or "Content has no parts." in error_str
+):
 exception_mapping_worked = True
 raise APIError(
 message=f"VertexAIException - {error_str}",
@@ -9393,9 +9691,14 @@ class CustomStreamWrapper:
 is_finished = True
 finish_reason = str_line.choices[0].finish_reason
 if finish_reason == "content_filter":
+if hasattr(str_line.choices[0], "content_filter_result"):
 error_message = json.dumps(
 str_line.choices[0].content_filter_result
 )
+else:
+error_message = "Azure Response={}".format(
+str(dict(str_line))
+)
 raise litellm.AzureOpenAIError(
 status_code=400, message=error_message
 )
@@ -9683,6 +9986,39 @@ class CustomStreamWrapper:
 "finish_reason": finish_reason,
 }

+def handle_watsonx_stream(self, chunk):
+try:
+if isinstance(chunk, dict):
+parsed_response = chunk
+elif isinstance(chunk, (str, bytes)):
+if isinstance(chunk, bytes):
+chunk = chunk.decode("utf-8")
+if "generated_text" in chunk:
+response = chunk.replace("data: ", "").strip()
+parsed_response = json.loads(response)
+else:
+return {"text": "", "is_finished": False}
+else:
+print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
+raise ValueError(
+f"Unable to parse response. Original response: {chunk}"
+)
+results = parsed_response.get("results", [])
+if len(results) > 0:
+text = results[0].get("generated_text", "")
+finish_reason = results[0].get("stop_reason")
+is_finished = finish_reason != "not_finished"
+return {
+"text": text,
+"is_finished": is_finished,
+"finish_reason": finish_reason,
+"prompt_tokens": results[0].get("input_token_count", None),
+"completion_tokens": results[0].get("generated_token_count", None),
+}
+return {"text": "", "is_finished": False}
+except Exception as e:
+raise e
+
 def model_response_creator(self):
 model_response = ModelResponse(stream=True, model=self.model)
 if self.response_id is not None:
@@ -9938,6 +10274,11 @@ class CustomStreamWrapper:
 print_verbose(f"completion obj content: {completion_obj['content']}")
 if response_obj["is_finished"]:
 self.received_finish_reason = response_obj["finish_reason"]
+elif self.custom_llm_provider == "watsonx":
+response_obj = self.handle_watsonx_stream(chunk)
+completion_obj["content"] = response_obj["text"]
+if response_obj["is_finished"]:
+self.received_finish_reason = response_obj["finish_reason"]
 elif self.custom_llm_provider == "text-completion-openai":
 response_obj = self.handle_openai_text_completion_chunk(chunk)
 completion_obj["content"] = response_obj["text"]
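With `handle_watsonx_stream` wired into the chunk dispatcher, watsonx streaming follows the same pattern as other providers; each delta's text comes from the `generated_text` field parsed above. A sketch with a placeholder model id:

```python
import litellm

stream = litellm.completion(
    model="watsonx/ibm/granite-13b-chat-v2",  # placeholder model id
    messages=[{"role": "user", "content": "Write a haiku about token streams."}],
    stream=True,
)

for chunk in stream:
    # content is filled from results[0]["generated_text"] by handle_watsonx_stream
    print(chunk.choices[0].delta.content or "", end="")
```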
@@ -10123,12 +10464,23 @@ class CustomStreamWrapper:
 model_response.id = original_chunk.id
 self.response_id = original_chunk.id
 if len(original_chunk.choices) > 0:
+choices = []
+for idx, choice in enumerate(original_chunk.choices):
 try:
-delta = dict(original_chunk.choices[0].delta)
-print_verbose(f"original delta: {delta}")
-model_response.choices[0].delta = Delta(**delta)
+if isinstance(choice, BaseModel):
+try:
+choice_json = choice.model_dump()
 except Exception as e:
-model_response.choices[0].delta = Delta()
+choice_json = choice.dict()
+choice_json.pop(
+"finish_reason", None
+) # for mistral etc. which return a value in their last chunk (not-openai compatible).
+print_verbose(f"choice_json: {choice_json}")
+choices.append(StreamingChoices(**choice_json))
+except Exception as e:
+choices.append(StreamingChoices())
+print_verbose(f"choices in streaming: {choices}")
+model_response.choices = choices
 else:
 return
 model_response.system_fingerprint = (
@@ -10173,11 +10525,11 @@ class CustomStreamWrapper:
 )
 self.holding_chunk = ""
 # if delta is None
-is_delta_empty = self.is_delta_empty(
+_is_delta_empty = self.is_delta_empty(
 delta=model_response.choices[0].delta
 )

-if is_delta_empty:
+if _is_delta_empty:
 # get any function call arguments
 model_response.choices[0].finish_reason = map_finish_reason(
 finish_reason=self.received_finish_reason
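The streaming hunk above serializes each upstream choice with pydantic v2's `model_dump()`, falls back to v1's `dict()`, and strips `finish_reason` before rebuilding `StreamingChoices`. A standalone sketch of that fallback pattern; the `Choice` model here is an illustrative stand-in, not litellm's class:

```python
from typing import Optional

from pydantic import BaseModel


class Choice(BaseModel):
    # illustrative stand-in for an upstream streaming choice
    index: int = 0
    finish_reason: Optional[str] = None
    content: Optional[str] = None


def choice_to_json(choice: BaseModel) -> dict:
    try:
        choice_json = choice.model_dump()  # pydantic v2
    except Exception:
        choice_json = choice.dict()        # pydantic v1 fallback
    # mirror the diff: drop finish_reason so non-OpenAI values
    # (e.g. mistral's) don't leak into intermediate chunks
    choice_json.pop("finish_reason", None)
    return choice_json


print(choice_to_json(Choice(index=0, finish_reason="stop", content="hi")))
```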
@@ -1418,6 +1418,123 @@
 "litellm_provider": "replicate",
 "mode": "chat"
 },
+"replicate/meta/llama-2-13b": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.0000001,
+"output_cost_per_token": 0.0000005,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/meta/llama-2-13b-chat": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.0000001,
+"output_cost_per_token": 0.0000005,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/meta/llama-2-70b": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.00000065,
+"output_cost_per_token": 0.00000275,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/meta/llama-2-70b-chat": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.00000065,
+"output_cost_per_token": 0.00000275,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/meta/llama-2-7b": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.00000005,
+"output_cost_per_token": 0.00000025,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/meta/llama-2-7b-chat": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.00000005,
+"output_cost_per_token": 0.00000025,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/meta/llama-3-70b": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.00000065,
+"output_cost_per_token": 0.00000275,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/meta/llama-3-70b-instruct": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.00000065,
+"output_cost_per_token": 0.00000275,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/meta/llama-3-8b": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.00000005,
+"output_cost_per_token": 0.00000025,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/meta/llama-3-8b-instruct": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.00000005,
+"output_cost_per_token": 0.00000025,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/mistralai/mistral-7b-v0.1": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.00000005,
+"output_cost_per_token": 0.00000025,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/mistralai/mistral-7b-instruct-v0.2": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.00000005,
+"output_cost_per_token": 0.00000025,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
+"replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
+"max_tokens": 4096,
+"max_input_tokens": 4096,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.0000003,
+"output_cost_per_token": 0.000001,
+"litellm_provider": "replicate",
+"mode": "chat"
+},
 "openrouter/openai/gpt-3.5-turbo": {
 "max_tokens": 4095,
 "input_cost_per_token": 0.0000015,
@@ -1455,6 +1572,17 @@
 "litellm_provider": "openrouter",
 "mode": "chat"
 },
+"openrouter/anthropic/claude-3-opus": {
+"max_tokens": 4096,
+"max_input_tokens": 200000,
+"max_output_tokens": 4096,
+"input_cost_per_token": 0.000015,
+"output_cost_per_token": 0.000075,
+"litellm_provider": "openrouter",
+"mode": "chat",
+"supports_function_calling": true,
+"tool_use_system_prompt_tokens": 395
+},
 "openrouter/google/palm-2-chat-bison": {
 "max_tokens": 8000,
 "input_cost_per_token": 0.0000005,
@@ -2379,6 +2507,24 @@
 "litellm_provider": "bedrock",
 "mode": "chat"
 },
+"meta.llama3-8b-instruct-v1:0": {
+"max_tokens": 8192,
+"max_input_tokens": 8192,
+"max_output_tokens": 8192,
+"input_cost_per_token": 0.0000004,
+"output_cost_per_token": 0.0000006,
+"litellm_provider": "bedrock",
+"mode": "chat"
+},
+"meta.llama3-70b-instruct-v1:0": {
+"max_tokens": 8192,
+"max_input_tokens": 8192,
+"max_output_tokens": 8192,
+"input_cost_per_token": 0.00000265,
+"output_cost_per_token": 0.0000035,
+"litellm_provider": "bedrock",
+"mode": "chat"
+},
 "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
 "max_tokens": 77,
 "max_input_tokens": 77,
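With these entries in the pricing map, the new Replicate and Bedrock Llama 3 models get token-based cost tracking instead of falling back to Replicate's time-based estimate. A lookup sketch, assuming the keys load into `litellm.model_cost` exactly as written here:

```python
import litellm

for model_key in (
    "replicate/meta/llama-3-8b-instruct",
    "meta.llama3-70b-instruct-v1:0",
):
    pricing = litellm.model_cost[model_key]
    # example: cost of 1k prompt tokens + 1k completion tokens
    cost = 1000 * pricing["input_cost_per_token"] + 1000 * pricing["output_cost_per_token"]
    print(model_key, round(cost, 6))
```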
@@ -61,14 +61,14 @@ model_list:
 api_key: my-fake-key
 api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
 stream_timeout: 0.001
-rpm: 10
+rpm: 100
 - model_name: fake-openai-endpoint-3
 litellm_params:
 model: openai/my-fake-model-2
 api_key: my-fake-key
 api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
 stream_timeout: 0.001
-rpm: 10
+rpm: 100
 - model_name: "*"
 litellm_params:
 model: openai/*
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.35.27"
+version = "1.35.36"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -80,7 +80,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.35.27"
+version = "1.35.36"
 version_files = [
 "pyproject.toml:^version"
 ]
Some files were not shown because too many files have changed in this diff