Merge branch 'main' into main

This commit is contained in:
Lucca Zenóbio 2024-05-02 09:46:34 -03:00 committed by GitHub
commit 78303b79ee
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
124 changed files with 6716 additions and 1078 deletions

View file

@ -40,7 +40,7 @@ jobs:
pip install "aioboto3==12.3.0"
pip install langchain
pip install lunary==0.2.5
pip install "langfuse==2.7.3"
pip install "langfuse==2.27.1"
pip install numpydoc
pip install traceloop-sdk==0.0.69
pip install openai

1
.gitignore vendored
View file

@ -51,3 +51,4 @@ loadtest_kub.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_super_secret_config.yaml
litellm/proxy/_super_secret_config.yaml

View file

@ -7,7 +7,7 @@ repos:
rev: 7.0.0 # The version of flake8 to use
hooks:
- id: flake8
exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/integrations/|^litellm/proxy/tests/
exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/proxy/tests/
additional_dependencies: [flake8-print]
files: litellm/.*\.py
- repo: local

View file

@ -227,6 +227,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ |
| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ |
| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ |
| [xinference [Xorbits Inference]](https://docs.litellm.ai/docs/providers/xinference) | | | | | ✅ |

300
cookbook/liteLLM_IBM_Watsonx.ipynb vendored Normal file

File diff suppressed because one or more lines are too long

View file

@ -23,6 +23,14 @@ response = completion(model="gpt-3.5-turbo", messages=messages)
response = completion("command-nightly", messages)
```
## JSON Logs
If you need to store the logs as JSON, set `litellm.json_logs = True`.
We currently just log the raw POST request from litellm as a JSON - [**See Code**].
[Share feedback here](https://github.com/BerriAI/litellm/issues)
## Logger Function
But sometimes all you care about is seeing exactly what's getting sent to your api call and what's being returned - e.g. if the api call is failing, why is that happening? what are the exact params being set?

View file

@ -213,3 +213,349 @@ asyncio.run(loadtest_fn())
```
## Multi-Instance TPM/RPM Load Test (Router)
Test if your defined tpm/rpm limits are respected across multiple instances of the Router object.
In our test:
- Max RPM per deployment is = 100 requests per minute
- Max Throughput / min on router = 200 requests per minute (2 deployments)
- Load we'll send through router = 600 requests per minute
:::info
If you don't want to call a real LLM API endpoint, you can set up a fake OpenAI server. [See code](#extra---setup-fake-openai-server)
:::
### Code
Let's hit the router with 600 requests per minute.
Copy this script 👇. Save it as `test_loadtest_router.py` AND run it with `python3 test_loadtest_router.py`
```python
from litellm import Router
import litellm
litellm.suppress_debug_info = True
litellm.set_verbose = False
import logging
logging.basicConfig(level=logging.CRITICAL)
import os, random, uuid, time, asyncio
# Model list for OpenAI and Anthropic models
model_list = [
{
"model_name": "fake-openai-endpoint",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "my-fake-key",
"api_base": "http://0.0.0.0:8080",
"rpm": 100
},
},
{
"model_name": "fake-openai-endpoint",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "my-fake-key",
"api_base": "http://0.0.0.0:8081",
"rpm": 100
},
},
]
router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
async def router_completion_non_streaming():
try:
client: Router = random.sample([router_1, router_2], 1)[0] # randomly pick b/w clients
# print(f"client={client}")
response = await client.acompletion(
model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
)
return response
except Exception as e:
# print(e)
return None
async def loadtest_fn():
start = time.time()
n = 600 # Number of concurrent tasks
tasks = [router_completion_non_streaming() for _ in range(n)]
chat_completions = await asyncio.gather(*tasks)
successful_completions = [c for c in chat_completions if c is not None]
print(n, time.time() - start, len(successful_completions))
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
# Run the event loop to execute the async function
async def parent_fn():
for _ in range(10):
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
print(f"triggered new batch - {current_minute}")
await loadtest_fn()
await asyncio.sleep(10)
asyncio.run(parent_fn())
```
## Multi-Instance TPM/RPM Load Test (Proxy)
Test if your defined tpm/rpm limits are respected across multiple instances.
The quickest way to do this is by testing the [proxy](./proxy/quick_start.md). The proxy uses the [router](./routing.md) under the hood, so if you're using either of them, this test should work for you.
In our test:
- Max RPM per deployment is = 100 requests per minute
- Max Throughput / min on proxy = 200 requests per minute (2 deployments)
- Load we'll send to proxy = 600 requests per minute
So we'll send 600 requests per minute, but expect only 200 requests per minute to succeed.
:::info
If you don't want to call a real LLM API endpoint, you can set up a fake OpenAI server. [See code](#extra---setup-fake-openai-server)
:::
### 1. Setup config
```yaml
model_list:
- litellm_params:
api_base: http://0.0.0.0:8080
api_key: my-fake-key
model: openai/my-fake-model
rpm: 100
model_name: fake-openai-endpoint
- litellm_params:
api_base: http://0.0.0.0:8081
api_key: my-fake-key
model: openai/my-fake-model-2
rpm: 100
model_name: fake-openai-endpoint
router_settings:
num_retries: 0
enable_pre_call_checks: true
redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
redis_password: os.environ/REDIS_PASSWORD
redis_port: os.environ/REDIS_PORT
routing_strategy: usage-based-routing-v2
```
### 2. Start 2 proxy instances
**Instance 1**
```bash
litellm --config /path/to/config.yaml --port 4000
## RUNNING on http://0.0.0.0:4000
```
**Instance 2**
```bash
litellm --config /path/to/config.yaml --port 4001
## RUNNING on http://0.0.0.0:4001
```
### 3. Run Test
Let's hit the proxy with 600 requests per minute.
Copy this script 👇. Save it as `test_loadtest_proxy.py` AND run it with `python3 test_loadtest_proxy.py`
```python
from openai import AsyncOpenAI, AsyncAzureOpenAI
import random, uuid
import time, asyncio, litellm
# import logging
# logging.basicConfig(level=logging.DEBUG)
#### LITELLM PROXY ####
litellm_client = AsyncOpenAI(
api_key="sk-1234", # [CHANGE THIS]
base_url="http://0.0.0.0:4000"
)
litellm_client_2 = AsyncOpenAI(
api_key="sk-1234", # [CHANGE THIS]
base_url="http://0.0.0.0:4001"
)
async def proxy_completion_non_streaming():
try:
client = random.sample([litellm_client, litellm_client_2], 1)[0] # randomly pick b/w clients
# print(f"client={client}")
response = await client.chat.completions.create(
model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
)
return response
except Exception as e:
# print(e)
return None
async def loadtest_fn():
start = time.time()
n = 600 # Number of concurrent tasks
tasks = [proxy_completion_non_streaming() for _ in range(n)]
chat_completions = await asyncio.gather(*tasks)
successful_completions = [c for c in chat_completions if c is not None]
print(n, time.time() - start, len(successful_completions))
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
# Run the event loop to execute the async function
async def parent_fn():
for _ in range(10):
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
print(f"triggered new batch - {current_minute}")
await loadtest_fn()
await asyncio.sleep(10)
asyncio.run(parent_fn())
```
### Extra - Setup Fake OpenAI Server
Let's set up a fake OpenAI server with an RPM limit of 100.
Let's call our file `fake_openai_server.py`.
```python
# import sys, os
# sys.path.insert(
# 0, os.path.abspath("../")
# ) # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi import FastAPI, Request, HTTPException, UploadFile, File
import httpx, os, json
from openai import AsyncOpenAI
from typing import Optional
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
class ProxyException(Exception):
# NOTE: DO NOT MODIFY THIS
# This is used to map exactly to OPENAI Exceptions
def __init__(
self,
message: str,
type: str,
param: Optional[str],
code: Optional[int],
):
self.message = message
self.type = type
self.param = param
self.code = code
def to_dict(self) -> dict:
"""Converts the ProxyException instance to a dictionary."""
return {
"message": self.message,
"type": self.type,
"param": self.param,
"code": self.code,
}
limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
app.state.limiter = limiter
@app.exception_handler(RateLimitExceeded)
async def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
return JSONResponse(status_code=429,
content={"detail": "Rate Limited!"})
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
@limiter.limit("100/minute")
async def completion(request: Request):
# raise HTTPException(status_code=429, detail="Rate Limited!")
return {
"id": "chatcmpl-123",
"object": "chat.completion",
"created": 1677652288,
"model": None,
"system_fingerprint": "fp_44709d6fcb",
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": "\n\nHello there, how may I assist you today?",
},
"logprobs": None,
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": 9,
"completion_tokens": 12,
"total_tokens": 21
}
}
if __name__ == "__main__":
import socket
import uvicorn
port = 8080
while True:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
result = sock.connect_ex(('0.0.0.0', port))
if result != 0:
print(f"Port {port} is available, starting server...")
break
else:
port += 1
uvicorn.run(app, host="0.0.0.0", port=port)
```
```bash
python3 fake_openai_server.py
```

View file

@ -331,49 +331,25 @@ response = litellm.completion(model="gpt-3.5-turbo", messages=messages, metadata
## Examples
### Custom Callback to track costs for Streaming + Non-Streaming
By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async)
```python
# Step 1. Write your custom callback function
def track_cost_callback(
kwargs, # kwargs to completion
completion_response, # response from completion
start_time, end_time # start/end time
):
try:
# init logging config
logging.basicConfig(
filename='cost.log',
level=logging.INFO,
format='%(asctime)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
# check if it has collected an entire stream response
if "complete_streaming_response" in kwargs:
# for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
completion_response=kwargs["complete_streaming_response"]
input_text = kwargs["messages"]
output_text = completion_response["choices"][0]["message"]["content"]
response_cost = litellm.completion_cost(
model = kwargs["model"],
messages = input_text,
completion=output_text
)
print("streaming response_cost", response_cost)
logging.info(f"Model {kwargs['model']} Cost: ${response_cost:.8f}")
# for non streaming responses
else:
# we pass the completion_response obj
if kwargs["stream"] != True:
response_cost = litellm.completion_cost(completion_response=completion_response)
response_cost = kwargs["response_cost"] # litellm calculates response cost for you
print("regular response_cost", response_cost)
logging.info(f"Model {completion_response.model} Cost: ${response_cost:.8f}")
except:
pass
# Assign the custom callback function
# Step 2. Assign the custom callback function
litellm.success_callback = [track_cost_callback]
# Step 3. Make litellm.completion call
response = completion(
model="gpt-3.5-turbo",
messages=[

View file

@ -121,10 +121,12 @@ response = completion(
metadata={
"generation_name": "ishaan-test-generation", # set langfuse Generation Name
"generation_id": "gen-id22", # set langfuse Generation ID
"trace_id": "trace-id22", # set langfuse Trace ID
"trace_user_id": "user-id2", # set langfuse Trace User ID
"session_id": "session-1", # set langfuse Session ID
"tags": ["tag1", "tag2"], # set langfuse Tags
"trace_id": "trace-id22", # set langfuse Trace ID
### OR ###
"existing_trace_id": "trace-id22", # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
},
)
@ -167,6 +169,9 @@ messages = [
chat(messages)
```
## Redacting Messages, Response Content from Langfuse Logging
Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to Langfuse, but request metadata will still be logged.
## Troubleshooting & Errors
### Data not getting logged to Langfuse ?

View file

@ -0,0 +1,97 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# OpenMeter - Usage-Based Billing
[OpenMeter](https://openmeter.io/) is an Open Source Usage-Based Billing solution for AI/Cloud applications. It integrates with Stripe for easy billing.
<Image img={require('../../img/openmeter.png')} />
:::info
We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
join our [discord](https://discord.gg/wuPM9dRgDw)
:::
## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with OpenMeter.
Get your OpenMeter API Key from https://openmeter.cloud/meters
```python
litellm.success_callback = ["openmeter"] # logs cost + usage of successful calls to openmeter
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
# pip install litellm
import litellm
import os
# from https://openmeter.cloud
os.environ["OPENMETER_API_ENDPOINT"] = ""
os.environ["OPENMETER_API_KEY"] = ""
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set openmeter as a callback, litellm will send the data to openmeter
litellm.success_callback = ["openmeter"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
success_callback: ["openmeter"] # 👈 KEY CHANGE
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
</Tabs>
<Image img={require('../../img/openmeter_img_2.png')} />

View file

@ -40,5 +40,9 @@ response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content
print(response)
```
## Redacting Messages, Response Content from Sentry Logging
Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to Sentry, but request metadata will still be logged.
[Let us know](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+) if you need any additional options from Sentry.

View file

@ -53,6 +53,50 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported.
| open-mixtral-8x22b | `completion(model="mistral/open-mixtral-8x22b", messages)` |
## Function Calling
```python
from litellm import completion
# set env
os.environ["MISTRAL_API_KEY"] = "your-api-key"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="mistral/mistral-large-latest",
messages=messages,
tools=tools,
tool_choice="auto",
)
# Add any assertions, here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
```
## Sample Usage - Embedding
```python
from litellm import embedding

View file

@ -4,6 +4,13 @@ LiteLLM supports all models on VLLM.
🚀[Code Tutorial](https://github.com/BerriAI/litellm/blob/main/cookbook/VLLM_Model_Testing.ipynb)
:::info
To call a HOSTED VLLM Endpoint use [these docs](./openai_compatible.md)
:::
### Quick Start
```
pip install litellm vllm

View file

@ -0,0 +1,284 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# IBM watsonx.ai
LiteLLM supports all IBM [watsonx.ai](https://watsonx.ai/) foundational models and embeddings.
## Environment Variables
```python
os.environ["WATSONX_URL"] = "" # (required) Base URL of your WatsonX instance
# (required) either one of the following:
os.environ["WATSONX_APIKEY"] = "" # IBM cloud API key
os.environ["WATSONX_TOKEN"] = "" # IAM auth token
# optional - can also be passed as params to completion() or embedding()
os.environ["WATSONX_PROJECT_ID"] = "" # Project ID of your WatsonX instance
os.environ["WATSONX_DEPLOYMENT_SPACE_ID"] = "" # ID of your deployment space to use deployed models
```
See [here](https://cloud.ibm.com/apidocs/watsonx-ai#api-authentication) for more information on how to get an access token to authenticate to watsonx.ai.
## Usage
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_IBM_Watsonx.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
```python
import os
from litellm import completion
os.environ["WATSONX_URL"] = ""
os.environ["WATSONX_APIKEY"] = ""
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
project_id="<my-project-id>" # or pass with os.environ["WATSONX_PROJECT_ID"]
)
response = completion(
model="watsonx/meta-llama/llama-3-8b-instruct",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
project_id="<my-project-id>"
)
```
## Usage - Streaming
```python
import os
from litellm import completion
os.environ["WATSONX_URL"] = ""
os.environ["WATSONX_APIKEY"] = ""
os.environ["WATSONX_PROJECT_ID"] = ""
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
stream=True
)
for chunk in response:
print(chunk)
```
#### Example Streaming Output Chunk
```json
{
"choices": [
{
"finish_reason": null,
"index": 0,
"delta": {
"content": "I don't have a favorite color, but I do like the color blue. What's your favorite color?"
}
}
],
"created": null,
"model": "watsonx/ibm/granite-13b-chat-v2",
"usage": {
"prompt_tokens": null,
"completion_tokens": null,
"total_tokens": null
}
}
```
## Usage - Models in deployment spaces
Models that have been deployed to a deployment space (e.g.: tuned models) can be called using the `deployment/<deployment_id>` format (where `<deployment_id>` is the ID of the deployed model in your deployment space).
The ID of your deployment space must also be set in the environment variable `WATSONX_DEPLOYMENT_SPACE_ID` or passed to the function as `space_id=<deployment_space_id>`.
```python
import litellm
response = litellm.completion(
model="watsonx/deployment/<deployment_id>",
messages=[{"content": "Hello, how are you?", "role": "user"}],
space_id="<deployment_space_id>"
)
```
## Usage - Embeddings
LiteLLM also supports making requests to IBM watsonx.ai embedding models. The credential needed for this is the same as for completion.
```python
from litellm import embedding
response = embedding(
model="watsonx/ibm/slate-30m-english-rtrvr",
input=["What is the capital of France?"],
project_id="<my-project-id>"
)
print(response)
# EmbeddingResponse(model='ibm/slate-30m-english-rtrvr', data=[{'object': 'embedding', 'index': 0, 'embedding': [-0.037463713, -0.02141933, -0.02851813, 0.015519324, ..., -0.0021367231, -0.01704561, -0.001425816, 0.0035238306]}], object='list', usage=Usage(prompt_tokens=8, total_tokens=8))
```
## OpenAI Proxy Usage
Here's how to call IBM watsonx.ai with the LiteLLM Proxy Server
### 1. Save keys in your environment
```bash
export WATSONX_URL=""
export WATSONX_APIKEY=""
export WATSONX_PROJECT_ID=""
```
### 2. Start the proxy
<Tabs>
<TabItem value="cli" label="CLI">
```bash
$ litellm --model watsonx/meta-llama/llama-3-8b-instruct
# Server running on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="config" label="config.yaml">
```yaml
model_list:
- model_name: llama-3-8b
litellm_params:
# all params accepted by litellm.completion()
model: watsonx/meta-llama/llama-3-8b-instruct
api_key: "os.environ/WATSONX_APIKEY" # does os.getenv("WATSONX_APIKEY")
```
</TabItem>
</Tabs>
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "llama-3-8b",
"messages": [
{
"role": "user",
"content": "what is your favorite colour?"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="llama-3-8b", messages=[
{
"role": "user",
"content": "what is your favorite colour?"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "llama-3-8b",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Authentication
### Passing credentials as parameters
You can also pass the credentials as parameters to the completion and embedding functions.
```python
import os
from litellm import completion
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "What is your favorite color?","role": "user"}],
url="",
api_key="",
project_id=""
)
```
## Supported IBM watsonx.ai Models
Here are some examples of models available in IBM watsonx.ai that you can use with LiteLLM:
| Model Name | Command |
| ---------- | --------- |
| Flan T5 XXL | `completion(model="watsonx/google/flan-t5-xxl", messages=messages)` |
| Flan Ul2 | `completion(model="watsonx/google/flan-ul2", messages=messages)` |
| Mt0 XXL | `completion(model="watsonx/bigscience/mt0-xxl", messages=messages)` |
| Gpt Neox | `completion(model="watsonx/eleutherai/gpt-neox-20b", messages=messages)` |
| Mpt 7B Instruct2 | `completion(model="watsonx/ibm/mpt-7b-instruct2", messages=messages)` |
| Starcoder | `completion(model="watsonx/bigcode/starcoder", messages=messages)` |
| Llama 2 70B Chat | `completion(model="watsonx/meta-llama/llama-2-70b-chat", messages=messages)` |
| Llama 2 13B Chat | `completion(model="watsonx/meta-llama/llama-2-13b-chat", messages=messages)` |
| Granite 13B Instruct | `completion(model="watsonx/ibm/granite-13b-instruct-v1", messages=messages)` |
| Granite 13B Chat | `completion(model="watsonx/ibm/granite-13b-chat-v1", messages=messages)` |
| Flan T5 XL | `completion(model="watsonx/google/flan-t5-xl", messages=messages)` |
| Granite 13B Chat V2 | `completion(model="watsonx/ibm/granite-13b-chat-v2", messages=messages)` |
| Granite 13B Instruct V2 | `completion(model="watsonx/ibm/granite-13b-instruct-v2", messages=messages)` |
| Elyza Japanese Llama 2 7B Instruct | `completion(model="watsonx/elyza/elyza-japanese-llama-2-7b-instruct", messages=messages)` |
| Mixtral 8X7B Instruct V01 Q | `completion(model="watsonx/ibm-mistralai/mixtral-8x7b-instruct-v01-q", messages=messages)` |
For a list of all available models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx&locale=en&audience=wdp).
## Supported IBM watsonx.ai Embedding Models
| Model Name | Function Call |
|----------------------|---------------------------------------------|
| Slate 30m | `embedding(model="watsonx/ibm/slate-30m-english-rtrvr", input=input)` |
| Slate 125m | `embedding(model="watsonx/ibm/slate-125m-english-rtrvr", input=input)` |
For a list of all available embedding models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx).

View file

@ -1,13 +1,13 @@
# Slack Alerting
# 🚨 Alerting
Get alerts for:
- hanging LLM api calls
- failed LLM api calls
- slow LLM api calls
- budget Tracking per key/user:
- Hanging LLM api calls
- Failed LLM api calls
- Slow LLM api calls
- Budget Tracking per key/user:
- When a User/Key crosses their Budget
- When a User/Key is 15% away from crossing their Budget
- failed db read/writes
- Failed db read/writes
## Quick Start

View file

@ -62,9 +62,11 @@ model_list:
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
drop_params: True
success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM Logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env
general_settings:
master_key: sk-1234 # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
alerting: ["slack"] # [OPTIONAL] If you want Slack Alerts for Hanging LLM requests, Slow llm responses, Budget Alerts. Make sure to set `SLACK_WEBHOOK_URL` in your env
```
:::info

View file

@ -11,40 +11,37 @@ You can find the Dockerfile to build litellm proxy [here](https://github.com/Ber
<TabItem value="basic" label="Basic">
**Step 1. Create a file called `litellm_config.yaml`**
### Step 1. CREATE config.yaml
Example `litellm_config.yaml` (the `os.environ/` prefix means litellm will read `AZURE_API_BASE` from the env)
```yaml
model_list:
Example `litellm_config.yaml`
```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/<your-azure-model-deployment>
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_base: os.environ/AZURE_API_BASE # runs os.getenv("AZURE_API_BASE")
api_key: os.environ/AZURE_API_KEY # runs os.getenv("AZURE_API_KEY")
api_version: "2023-07-01-preview"
```
```
**Step 2. Run litellm docker image**
See the latest available ghcr docker image here:
https://github.com/berriai/litellm/pkgs/container/litellm
Your litellm config.yaml should be called `litellm_config.yaml` in the directory you run this command.
The `-v` command will mount that file
### Step 2. RUN Docker Image
Pass `AZURE_API_KEY` and `AZURE_API_BASE` since we set them in step 1
```shell
docker run \
```shell
docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
-e AZURE_API_KEY=d6*********** \
-e AZURE_API_BASE=https://openai-***********/ \
-p 4000:4000 \
ghcr.io/berriai/litellm:main-latest \
--config /app/config.yaml --detailed_debug
```
```
**Step 3. Send a Test Request**
Get Latest Image 👉 [here](https://github.com/berriai/litellm/pkgs/container/litellm)
### Step 3. TEST Request
Pass `model=azure-gpt-3.5`; this was set in step 1.
@ -231,13 +228,16 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
| Docs | When to Use |
| --- | --- |
| [Quick Start](#quick-start) | call 100+ LLMs + Load Balancing |
| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend |
| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend (Note: when deploying with a database, `DATABASE_URL` and `LITELLM_MASTER_KEY` are required in your env) |
| [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
| [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |
## Deploy with Database
### Docker, Kubernetes, Helm Chart
Requirements:
- Need a Postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc). Set `DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>` in your env
- Set a `LITELLM_MASTER_KEY`, this is your Proxy Admin key - you can use this to create other keys (🚨 must start with `sk-`)
<Tabs>
@ -252,6 +252,8 @@ docker pull ghcr.io/berriai/litellm-database:main-latest
```shell
docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
-e LITELLM_MASTER_KEY=sk-1234 \
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-e AZURE_API_KEY=d6*********** \
-e AZURE_API_BASE=https://openai-***********/ \
-p 4000:4000 \
@ -267,12 +269,12 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
#### Step 1. Create deployment.yaml
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
apiVersion: apps/v1
kind: Deployment
metadata:
name: litellm-deployment
spec:
replicas: 1
spec:
replicas: 3
selector:
matchLabels:
app: litellm
@ -283,10 +285,47 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
spec:
containers:
- name: litellm-container
image: ghcr.io/berriai/litellm-database:main-latest
image: ghcr.io/berriai/litellm:main-latest
imagePullPolicy: Always
env:
- name: AZURE_API_KEY
value: "d6******"
- name: AZURE_API_BASE
value: "https://ope******"
- name: LITELLM_MASTER_KEY
value: "sk-1234"
- name: DATABASE_URL
value: postgresql://<user>:<password>@<host>:<port>/<dbname>
value: "po**********"
args:
- "--config"
- "/app/proxy_config.yaml" # Update the path to mount the config file
volumeMounts: # Define volume mount for proxy_config.yaml
- name: config-volume
mountPath: /app
readOnly: true
livenessProbe:
httpGet:
path: /health/liveliness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
readinessProbe:
httpGet:
path: /health/readiness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
volumes: # Define volume to mount proxy_config.yaml
- name: config-volume
configMap:
name: litellm-config
```
```bash

View file

@ -10,6 +10,7 @@ Log Proxy Input, Output, Exceptions using Custom Callbacks, Langfuse, OpenTeleme
- [Async Custom Callbacks](#custom-callback-class-async)
- [Async Custom Callback APIs](#custom-callback-apis-async)
- [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
- [Logging to OpenMeter](#logging-proxy-cost--usage---openmeter)
- [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
- [Logging to DynamoDB](#logging-proxy-inputoutput---dynamodb)
@ -401,7 +402,7 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
## Logging Proxy Input/Output - Langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successfull LLM calls to langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]`. This will log all successful LLM calls to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment.
**Step 1** Install langfuse
@ -419,7 +420,13 @@ litellm_settings:
success_callback: ["langfuse"]
```
**Step 3**: Start the proxy, make a test request
**Step 3**: Set required env variables for logging to langfuse
```shell
export LANGFUSE_PUBLIC_KEY="pk_kk"
export LANGFUSE_SECRET_KEY="sk_ss"
```
**Step 4**: Start the proxy, make a test request
Start proxy
```shell
@ -569,6 +576,75 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \
All requests made with these keys will log data to their team-specific logging.
### Redacting Messages, Response Content from Langfuse Logging
Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to Langfuse, but request metadata will still be logged.
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["langfuse"]
turn_off_message_logging: True
```
## Logging Proxy Cost + Usage - OpenMeter
Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md)
**Required Env Variables**
```bash
# from https://openmeter.cloud
export OPENMETER_API_ENDPOINT="" # defaults to https://openmeter.cloud
export OPENMETER_API_KEY=""
```
### Quick Start
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
success_callback: ["openmeter"] # 👈 KEY CHANGE
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
<Image img={require('../../img/openmeter_img_2.png')} />
## Logging Proxy Input/Output - DataDog
We will use the `--config` to set `litellm.success_callback = ["datadog"]`. This will log all successful LLM calls to DataDog.

View file

@ -95,7 +95,7 @@ print(response)
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
- `router.aimage_generation()` - async image generation calls
### Advanced - Routing Strategies
## Advanced - Routing Strategies
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
Router provides 4 strategies for routing your calls across multiple deployments:
@ -278,6 +278,36 @@ router_settings:
routing_strategy_args: {"ttl": 10}
```
### Set Lowest Latency Buffer
Set a buffer within which deployments are candidates for making calls to.
E.g.
if you have 5 deployments
```
https://litellm-prod-1.openai.azure.com/: 0.07s
https://litellm-prod-2.openai.azure.com/: 0.1s
https://litellm-prod-3.openai.azure.com/: 0.1s
https://litellm-prod-4.openai.azure.com/: 0.1s
https://litellm-prod-5.openai.azure.com/: 4.66s
```
To avoid initially overloading `prod-1` with all requests, we can set a buffer of 50%, so that deployments `prod-2`, `prod-3`, and `prod-4` are also considered.
**In Router**
```python
router = Router(..., routing_strategy_args={"lowest_latency_buffer": 0.5})
```
**In Proxy**
```yaml
router_settings:
routing_strategy_args: {"lowest_latency_buffer": 0.5}
```
</TabItem>
<TabItem value="simple-shuffle" label="(Default) Weighted Pick (Async)">
@ -443,6 +473,35 @@ asyncio.run(router_acompletion())
## Basic Reliability
### Max Parallel Requests (ASYNC)
Used in semaphore for async requests on router. Limit the max concurrent calls made to a deployment. Useful in high-traffic scenarios.
If tpm/rpm is set, and no max parallel request limit given, we use the RPM or calculated RPM (tpm/1000/6) as the max parallel request limit.
```python
from litellm import Router
model_list = [{
"model_name": "gpt-4",
"litellm_params": {
"model": "azure/gpt-4",
...
"max_parallel_requests": 10 # 👈 SET PER DEPLOYMENT
}
}]
### OR ###
router = Router(model_list=model_list, default_max_parallel_requests=20) # 👈 SET DEFAULT MAX PARALLEL REQUESTS
# deployment max parallel requests > default max parallel requests
```
[**See Code**](https://github.com/BerriAI/litellm/blob/a978f2d8813c04dad34802cb95e0a0e35a3324bc/litellm/utils.py#L5605)
### Timeouts
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.

View file

@ -5,6 +5,9 @@ LiteLLM allows you to specify the following:
* API Base
* API Version
* API Type
* Project
* Location
* Token
Useful Helper functions:
* [`check_valid_key()`](#check_valid_key)
@ -43,6 +46,24 @@ os.environ['AZURE_API_TYPE'] = "azure" # [OPTIONAL]
os.environ['OPENAI_API_BASE'] = "https://openai-gpt-4-test2-v-12.openai.azure.com/"
```
### Setting Project, Location, Token
For cloud providers:
- Azure
- Bedrock
- GCP
- Watson AI
you might need to set additional parameters. LiteLLM provides a common set of params, that we map across all providers.
| | LiteLLM param | Watson | Vertex AI | Azure | Bedrock |
|------|--------------|--------------|--------------|--------------|--------------|
| Project | project | watsonx_project | vertex_project | n/a | n/a |
| Region | region_name | watsonx_region_name | vertex_location | n/a | aws_region_name |
| Token | token | watsonx_token or token | n/a | azure_ad_token | n/a |
If you want, you can call them by their provider-specific params as well.
## litellm variables
### litellm.api_key

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 533 KiB

View file

@ -43,6 +43,12 @@ const sidebars = {
"proxy/user_keys",
"proxy/enterprise",
"proxy/virtual_keys",
"proxy/alerting",
{
type: "category",
label: "Logging",
items: ["proxy/logging", "proxy/streaming_logging"],
},
"proxy/team_based_routing",
"proxy/ui",
"proxy/cost_tracking",
@ -58,11 +64,6 @@ const sidebars = {
"proxy/pii_masking",
"proxy/prompt_injection",
"proxy/caching",
{
type: "category",
label: "Logging, Alerting",
items: ["proxy/logging", "proxy/alerting", "proxy/streaming_logging"],
},
"proxy/prometheus",
"proxy/call_hooks",
"proxy/rules",
@ -148,6 +149,7 @@ const sidebars = {
"providers/openrouter",
"providers/custom_openai_proxy",
"providers/petals",
"providers/watsonx",
],
},
"proxy/custom_pricing",
@ -168,6 +170,7 @@ const sidebars = {
"observability/custom_callback",
"observability/langfuse_integration",
"observability/sentry",
"observability/openmeter",
"observability/promptlayer_integration",
"observability/wandb_integration",
"observability/langsmith_integration",
@ -175,7 +178,6 @@ const sidebars = {
"observability/traceloop_integration",
"observability/athina_integration",
"observability/lunary_integration",
"observability/athina_integration",
"observability/helicone_integration",
"observability/supabase_integration",
`observability/telemetry`,

View file

@ -6,7 +6,7 @@
"": {
"dependencies": {
"@hono/node-server": "^1.9.0",
"hono": "^4.1.5"
"hono": "^4.2.7"
},
"devDependencies": {
"@types/node": "^20.11.17",
@ -463,9 +463,9 @@
}
},
"node_modules/hono": {
"version": "4.1.5",
"resolved": "https://registry.npmjs.org/hono/-/hono-4.1.5.tgz",
"integrity": "sha512-3ChJiIoeCxvkt6vnkxJagplrt1YZg3NyNob7ssVeK2PUqEINp4q1F94HzFnvY9QE8asVmbW5kkTDlyWylfg2vg==",
"version": "4.2.7",
"resolved": "https://registry.npmjs.org/hono/-/hono-4.2.7.tgz",
"integrity": "sha512-k1xHi86tJnRIVvqhFMBDGFKJ8r5O+bEsT4P59ZK59r0F300Xd910/r237inVfuT/VmE86RQQffX4OYNda6dLXw==",
"engines": {
"node": ">=16.0.0"
}

View file

@ -4,7 +4,7 @@
},
"dependencies": {
"@hono/node-server": "^1.9.0",
"hono": "^4.1.5"
"hono": "^4.2.7"
},
"devDependencies": {
"@types/node": "^20.11.17",

View file

@ -2,7 +2,7 @@
import threading, requests, os
from typing import Callable, List, Optional, Dict, Union, Any, Literal
from litellm.caching import Cache
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger, json_logs
from litellm.proxy._types import (
KeyManagementSystem,
KeyManagementSettings,
@ -22,6 +22,7 @@ success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
service_callback: List[Union[str, Callable]] = []
callbacks: List[Callable] = []
_custom_logger_compatible_callbacks: list = ["openmeter"]
_langfuse_default_tags: Optional[
List[
Literal[
@ -45,6 +46,7 @@ _async_failure_callback: List[Callable] = (
) # internal variable - async custom callbacks are routed here.
pre_call_rules: List[Callable] = []
post_call_rules: List[Callable] = []
turn_off_message_logging: Optional[bool] = False
## end of callbacks #############
email: Optional[str] = (
@ -58,6 +60,7 @@ max_tokens = 256 # OpenAI Defaults
drop_params = False
modify_params = False
retry = True
### AUTH ###
api_key: Optional[str] = None
openai_key: Optional[str] = None
azure_key: Optional[str] = None
@ -76,7 +79,12 @@ cloudflare_api_key: Optional[str] = None
baseten_key: Optional[str] = None
aleph_alpha_key: Optional[str] = None
nlp_cloud_key: Optional[str] = None
common_cloud_provider_auth_params: dict = {
"params": ["project", "region_name", "token"],
"providers": ["vertex_ai", "bedrock", "watsonx", "azure"],
}
use_client: bool = False
ssl_verify: bool = True
disable_streaming_logging: bool = False
### GUARDRAILS ###
llamaguard_model_name: Optional[str] = None
@ -298,6 +306,7 @@ aleph_alpha_models: List = []
bedrock_models: List = []
deepinfra_models: List = []
perplexity_models: List = []
watsonx_models: List = []
for key, value in model_cost.items():
if value.get("litellm_provider") == "openai":
open_ai_chat_completion_models.append(key)
@ -342,6 +351,8 @@ for key, value in model_cost.items():
deepinfra_models.append(key)
elif value.get("litellm_provider") == "perplexity":
perplexity_models.append(key)
elif value.get("litellm_provider") == "watsonx":
watsonx_models.append(key)
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
openai_compatible_endpoints: List = [
@ -478,6 +489,7 @@ model_list = (
+ perplexity_models
+ maritalk_models
+ vertex_language_models
+ watsonx_models
)
provider_list: List = [
@ -516,6 +528,7 @@ provider_list: List = [
"cloudflare",
"xinference",
"fireworks_ai",
"watsonx",
"custom", # custom apis
]
@ -537,6 +550,7 @@ models_by_provider: dict = {
"deepinfra": deepinfra_models,
"perplexity": perplexity_models,
"maritalk": maritalk_models,
"watsonx": watsonx_models,
}
# mapping for those models which have larger equivalents
@ -647,9 +661,11 @@ from .llms.bedrock import (
AmazonLlamaConfig,
AmazonStabilityConfig,
AmazonMistralConfig,
AmazonBedrockGlobalConfig,
)
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
from .llms.azure import AzureOpenAIConfig, AzureOpenAIError
from .llms.watsonx import IBMWatsonXAIConfig
from .main import * # type: ignore
from .integrations import *
from .exceptions import (

View file

@ -1,7 +1,7 @@
import logging
set_verbose = False
json_logs = False
# Create a handler for the logger (you may need to adapt this based on your needs)
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)

View file

@ -12,9 +12,12 @@ import litellm
class LangFuseLogger:
# Class variables or attributes
def __init__(self, langfuse_public_key=None, langfuse_secret=None):
def __init__(
self, langfuse_public_key=None, langfuse_secret=None, flush_interval=1
):
try:
from langfuse import Langfuse
import langfuse
except Exception as e:
raise Exception(
f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\n{traceback.format_exc()}\033[0m"
@ -25,14 +28,20 @@ class LangFuseLogger:
self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
self.Langfuse = Langfuse(
public_key=self.public_key,
secret_key=self.secret_key,
host=self.langfuse_host,
release=self.langfuse_release,
debug=self.langfuse_debug,
flush_interval=1, # flush interval in seconds
)
parameters = {
"public_key": self.public_key,
"secret_key": self.secret_key,
"host": self.langfuse_host,
"release": self.langfuse_release,
"debug": self.langfuse_debug,
"flush_interval": flush_interval, # flush interval in seconds
}
if Version(langfuse.version.__version__) >= Version("2.6.0"):
parameters["sdk_integration"] = "litellm"
self.Langfuse = Langfuse(**parameters)
# set the current langfuse project id in the environ
# this is used by Alerting to link to the correct project
@ -77,13 +86,14 @@ class LangFuseLogger:
print_verbose,
level="DEFAULT",
status_message=None,
):
) -> dict:
# Method definition
try:
print_verbose(
f"Langfuse Logging - Enters logging function for model {kwargs}"
)
litellm_params = kwargs.get("litellm_params", {})
metadata = (
litellm_params.get("metadata", {}) or {}
@ -137,8 +147,10 @@ class LangFuseLogger:
input = prompt
output = response_obj["data"]
print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
trace_id = None
generation_id = None
if self._is_langfuse_v2():
self._log_langfuse_v2(
trace_id, generation_id = self._log_langfuse_v2(
user_id,
metadata,
litellm_params,
@ -168,10 +180,12 @@ class LangFuseLogger:
f"Langfuse Layer Logging - final response object: {response_obj}"
)
verbose_logger.info(f"Langfuse Layer Logging - logging success")
return {"trace_id": trace_id, "generation_id": generation_id}
except:
traceback.print_exc()
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
pass
return {"trace_id": None, "generation_id": None}
async def _async_log_event(
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
@ -243,7 +257,7 @@ class LangFuseLogger:
response_obj,
level,
print_verbose,
):
) -> tuple:
import langfuse
try:
@ -262,15 +276,21 @@ class LangFuseLogger:
tags = metadata_tags
trace_name = metadata.get("trace_name", None)
if trace_name is None:
trace_id = metadata.get("trace_id", None)
existing_trace_id = metadata.get("existing_trace_id", None)
if trace_name is None and existing_trace_id is None:
# just log `litellm-{call_type}` as the trace name
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
if existing_trace_id is not None:
trace_params = {"id": existing_trace_id}
else: # don't overwrite an existing trace
trace_params = {
"name": trace_name,
"input": input,
"user_id": metadata.get("trace_user_id", user_id),
"id": metadata.get("trace_id", None),
"id": trace_id,
"session_id": metadata.get("session_id", None),
}
@ -335,6 +355,7 @@ class LangFuseLogger:
kwargs["cache_hit"] = False
tags.append(f"cache_hit:{kwargs['cache_hit']}")
clean_metadata["cache_hit"] = kwargs["cache_hit"]
if existing_trace_id is None:
trace_params.update({"tags": tags})
proxy_server_request = litellm_params.get("proxy_server_request", None)
@ -355,8 +376,6 @@ class LangFuseLogger:
"headers": clean_headers,
}
print_verbose(f"trace_params: {trace_params}")
trace = self.Langfuse.trace(**trace_params)
generation_id = None
@ -373,7 +392,11 @@ class LangFuseLogger:
# just log `litellm-{call_type}` as the generation name
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
if response_obj is not None and "system_fingerprint" in response_obj:
system_fingerprint = response_obj.get("system_fingerprint", None)
else:
system_fingerprint = None
if system_fingerprint is not None:
optional_params["system_fingerprint"] = system_fingerprint
@ -402,8 +425,9 @@ class LangFuseLogger:
"completion_start_time", None
)
print_verbose(f"generation_params: {generation_params}")
generation_client = trace.generation(**generation_params)
trace.generation(**generation_params)
return generation_client.trace_id, generation_id
except Exception as e:
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
return None, None

View file

@ -73,10 +73,6 @@ class LangsmithLogger:
elif type(value) != dict and is_serializable(value=value):
new_kwargs[key] = value
print(f"type of response: {type(response_obj)}")
for k, v in new_kwargs.items():
print(f"key={k}, type of arg: {type(v)}, value={v}")
if isinstance(response_obj, BaseModel):
try:
response_obj = response_obj.model_dump()

View file

@ -0,0 +1,123 @@
# What is this?
## On Success events log cost to OpenMeter - https://github.com/BerriAI/litellm/issues/1268
import dotenv, os, json
import requests
import litellm
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
import uuid
def get_utc_datetime():
    """Return the current time as a timezone-aware UTC ``datetime``.

    ``datetime.timezone.utc`` exists on every Python 3 version (``datetime.UTC``
    is only an alias added in 3.11), so the result is always aware. The
    previous fallback to ``datetime.utcnow()`` produced a naive value on older
    interpreters, whose ``isoformat()`` carries no UTC offset.
    """
    from datetime import datetime, timezone

    return datetime.now(timezone.utc)
class OpenMeterLogger(CustomLogger):
    """Report cost and token usage of successful LLM calls to OpenMeter.

    Each successful call is converted into a CloudEvents-style payload and
    POSTed to ``<OPENMETER_API_ENDPOINT>/api/v1/events``.
    """

    def __init__(self) -> None:
        super().__init__()
        # Fail at construction time if the API key is not configured.
        self.validate_environment()
        self.async_http_handler = AsyncHTTPHandler()
        self.sync_http_handler = HTTPHandler()

    def validate_environment(self):
        """
        Expects
        OPENMETER_API_KEY
        in the environment.

        OPENMETER_API_ENDPOINT is optional; when unset, events are sent to
        https://openmeter.cloud.

        Raises:
            Exception: if a required key is missing.
        """
        missing_keys = []
        if litellm.get_secret("OPENMETER_API_KEY", None) is None:
            missing_keys.append("OPENMETER_API_KEY")

        if len(missing_keys) > 0:
            raise Exception("Missing keys={} in environment.".format(missing_keys))

    def _get_events_url(self) -> str:
        """Build the events URL from the configured (or default) endpoint."""
        _url = litellm.get_secret(
            "OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud"
        )
        # Avoid a double slash when the endpoint already ends with one.
        if _url.endswith("/"):
            return _url + "api/v1/events"
        return _url + "/api/v1/events"

    def _get_headers(self) -> dict:
        """Build the request headers, including the bearer API key."""
        api_key = litellm.get_secret("OPENMETER_API_KEY")
        return {
            "Content-Type": "application/cloudevents+json",
            "Authorization": "Bearer {}".format(api_key),
        }

    def _common_logic(self, kwargs: dict, response_obj):
        """Build the event payload shared by the sync and async code paths.

        Args:
            kwargs: logging kwargs of the call; ``response_cost``, ``model``,
                ``user`` and ``litellm_call_id`` are read from it.
            response_obj: the response of the call; its ``id`` is used as the
                event id, falling back to ``litellm_call_id``.

        Returns:
            dict: the event, with model, cost and (when available) token
            usage under ``data``.
        """
        call_id = response_obj.get("id", kwargs.get("litellm_call_id"))
        dt = get_utc_datetime().isoformat()
        cost = kwargs.get("response_cost", None)
        model = kwargs.get("model")
        usage = {}
        # Token usage is only extracted from completion / embedding responses.
        if (
            isinstance(response_obj, litellm.ModelResponse)
            or isinstance(response_obj, litellm.EmbeddingResponse)
        ) and hasattr(response_obj, "usage"):
            usage = {
                "prompt_tokens": response_obj["usage"].get("prompt_tokens", 0),
                "completion_tokens": response_obj["usage"].get("completion_tokens", 0),
                "total_tokens": response_obj["usage"].get("total_tokens"),
            }

        return {
            "specversion": "1.0",
            "type": os.getenv("OPENMETER_EVENT_TYPE", "litellm_tokens"),
            "id": call_id,
            "time": dt,
            "subject": kwargs.get("user", ""),  # end-user passed in via 'user' param
            "source": "litellm-proxy",
            "data": {"model": model, "cost": cost, **usage},
        }

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        """Synchronously send the usage event for a successful call."""
        _data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
        self.sync_http_handler.post(
            url=self._get_events_url(),
            # Serialise to JSON so the body matches the declared content type
            # (and the async path) instead of being sent as a raw dict.
            data=json.dumps(_data),
            headers=self._get_headers(),
        )

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        """Asynchronously send the usage event for a successful call.

        Raises:
            Exception: re-raises whatever the POST or ``raise_for_status``
            raised, after printing the error details.
        """
        _data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
        # Stays None if the POST itself raises, so the handler below can
        # inspect it without hitting an unbound local.
        response = None
        try:
            response = await self.async_http_handler.post(
                url=self._get_events_url(),
                data=json.dumps(_data),
                headers=self._get_headers(),
            )
            response.raise_for_status()
        except Exception as e:
            print(f"\nAn Exception Occurred - {str(e)}")
            if hasattr(response, "text"):
                print(f"\nError Message: {response.text}")
            raise

View file

@ -7,11 +7,12 @@ import copy
import traceback
from litellm._logging import verbose_logger, verbose_proxy_logger
import litellm
from typing import List, Literal, Any, Union, Optional
from typing import List, Literal, Any, Union, Optional, Dict
from litellm.caching import DualCache
import asyncio
import aiohttp
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
import datetime
class SlackAlerting:
@ -37,12 +38,28 @@ class SlackAlerting:
"budget_alerts",
"db_exceptions",
],
alert_to_webhook_url: Optional[
Dict
] = None, # if user wants to separate alerts to diff channels
):
self.alerting_threshold = alerting_threshold
self.alerting = alerting
self.alert_types = alert_types
self.internal_usage_cache = DualCache()
self.async_http_handler = AsyncHTTPHandler()
self.alert_to_webhook_url = alert_to_webhook_url
self.langfuse_logger = None
try:
from litellm.integrations.langfuse import LangFuseLogger
self.langfuse_logger = LangFuseLogger(
os.getenv("LANGFUSE_PUBLIC_KEY"),
os.getenv("LANGFUSE_SECRET_KEY"),
flush_interval=1,
)
except:
pass
pass
@ -51,6 +68,7 @@ class SlackAlerting:
alerting: Optional[List] = None,
alerting_threshold: Optional[float] = None,
alert_types: Optional[List] = None,
alert_to_webhook_url: Optional[Dict] = None,
):
if alerting is not None:
self.alerting = alerting
@ -59,6 +77,13 @@ class SlackAlerting:
if alert_types is not None:
self.alert_types = alert_types
if alert_to_webhook_url is not None:
# update the dict
if self.alert_to_webhook_url is None:
self.alert_to_webhook_url = alert_to_webhook_url
else:
self.alert_to_webhook_url.update(alert_to_webhook_url)
async def deployment_in_cooldown(self):
pass
@ -81,39 +106,68 @@ class SlackAlerting:
request_info: str,
request_data: Optional[dict] = None,
kwargs: Optional[dict] = None,
type: Literal["hanging_request", "slow_response"] = "hanging_request",
start_time: Optional[datetime.datetime] = None,
end_time: Optional[datetime.datetime] = None,
):
import uuid
# For now: do nothing as we're debugging why this is not working as expected
if request_data is not None:
trace_id = request_data.get("metadata", {}).get(
"trace_id", None
) # get langfuse trace id
if trace_id is None:
trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
request_data["metadata"]["trace_id"] = trace_id
elif kwargs is not None:
_litellm_params = kwargs.get("litellm_params", {})
trace_id = _litellm_params.get("metadata", {}).get(
"trace_id", None
) # get langfuse trace id
if trace_id is None:
trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
_litellm_params["metadata"]["trace_id"] = trace_id
# Log hanging request as an error on langfuse
if type == "hanging_request":
if self.langfuse_logger is not None:
_logging_kwargs = copy.deepcopy(request_data)
if _logging_kwargs is None:
_logging_kwargs = {}
_logging_kwargs["litellm_params"] = {}
request_data = request_data or {}
_logging_kwargs["litellm_params"]["metadata"] = request_data.get(
"metadata", {}
)
# log to langfuse in a separate thread
import threading
threading.Thread(
target=self.langfuse_logger.log_event,
args=(
_logging_kwargs,
None,
start_time,
end_time,
None,
print,
"ERROR",
"Requests is hanging",
),
).start()
_langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com")
_langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID")
# langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************
_langfuse_url = (
f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}"
)
request_info += f"\n🪢 Langfuse Trace: {_langfuse_url}"
return request_info
# if request_data is not None:
# trace_id = request_data.get("metadata", {}).get(
# "trace_id", None
# ) # get langfuse trace id
# if trace_id is None:
# trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
# request_data["metadata"]["trace_id"] = trace_id
# elif kwargs is not None:
# _litellm_params = kwargs.get("litellm_params", {})
# trace_id = _litellm_params.get("metadata", {}).get(
# "trace_id", None
# ) # get langfuse trace id
# if trace_id is None:
# trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
# _litellm_params["metadata"]["trace_id"] = trace_id
# _langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com")
# _langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID")
# # langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************
# _langfuse_url = (
# f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}"
# )
# request_info += f"\n🪢 Langfuse Trace: {_langfuse_url}"
# return request_info
def _response_taking_too_long_callback(
self,
kwargs, # kwargs to completion
@ -140,7 +194,6 @@ class SlackAlerting:
raise e
def _get_deployment_latencies_to_alert(self, metadata=None):
if metadata is None:
return None
@ -156,6 +209,14 @@ class SlackAlerting:
_deployment_latencies = metadata["_latency_per_deployment"]
if len(_deployment_latencies) == 0:
return None
try:
# try sorting deployments by latency
_deployment_latencies = sorted(
_deployment_latencies.items(), key=lambda x: x[1]
)
_deployment_latencies = dict(_deployment_latencies)
except:
pass
for api_base, latency in _deployment_latencies.items():
_message_to_send += f"\n{api_base}: {round(latency,2)}s"
_message_to_send = "```" + _message_to_send + "```"
@ -171,8 +232,6 @@ class SlackAlerting:
if self.alerting is None or self.alert_types is None:
return
if "llm_too_slow" not in self.alert_types:
return
time_difference_float, model, api_base, messages = (
self._response_taking_too_long_callback(
kwargs=kwargs,
@ -185,7 +244,7 @@ class SlackAlerting:
if time_difference_float > self.alerting_threshold:
if "langfuse" in litellm.success_callback:
request_info = self._add_langfuse_trace_id_to_alert(
request_info=request_info, kwargs=kwargs
request_info=request_info, kwargs=kwargs, type="slow_response"
)
# add deployment latencies to alert
if (
@ -205,6 +264,7 @@ class SlackAlerting:
await self.send_alert(
message=slow_message + request_info,
level="Low",
alert_type="llm_too_slow",
)
async def log_failure_event(self, original_exception: Exception):
@ -212,8 +272,8 @@ class SlackAlerting:
async def response_taking_too_long(
self,
start_time: Optional[float] = None,
end_time: Optional[float] = None,
start_time: Optional[datetime.datetime] = None,
end_time: Optional[datetime.datetime] = None,
type: Literal["hanging_request", "slow_response"] = "hanging_request",
request_data: Optional[dict] = None,
):
@ -233,17 +293,10 @@ class SlackAlerting:
except:
messages = ""
request_info = f"\nRequest Model: `{model}`\nMessages: `{messages}`"
if "langfuse" in litellm.success_callback:
request_info = self._add_langfuse_trace_id_to_alert(
request_info=request_info, request_data=request_data
)
else:
request_info = ""
if type == "hanging_request":
# Simulate a long-running operation that could take more than 5 minutes
if "llm_requests_hanging" not in self.alert_types:
return
await asyncio.sleep(
self.alerting_threshold
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
@ -281,6 +334,15 @@ class SlackAlerting:
f"`Requests are hanging - {self.alerting_threshold}s+ request time`"
)
if "langfuse" in litellm.success_callback:
request_info = self._add_langfuse_trace_id_to_alert(
request_info=request_info,
request_data=request_data,
type="hanging_request",
start_time=start_time,
end_time=end_time,
)
# add deployment latencies to alert
_deployment_latency_map = self._get_deployment_latencies_to_alert(
metadata=request_data.get("metadata", {})
@ -291,6 +353,7 @@ class SlackAlerting:
await self.send_alert(
message=alerting_message + request_info,
level="Medium",
alert_type="llm_requests_hanging",
)
async def budget_alerts(
@ -336,8 +399,7 @@ class SlackAlerting:
user_info = f"\nUser ID: {user_id}\n Error {error_message}"
message = "Failed Tracking Cost for" + user_info
await self.send_alert(
message=message,
level="High",
message=message, level="High", alert_type="budget_alerts"
)
return
elif type == "projected_limit_exceeded" and user_info is not None:
@ -353,8 +415,7 @@ class SlackAlerting:
"""
message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` {user_info["key_alias"]} \n`Expected Day of Error`: {user_info["projected_exceeded_date"]} \n`Current Spend`: {user_current_spend} \n`Projected Spend at end of month`: {user_info["projected_spend"]} \n`Soft Limit`: {user_max_budget}"""
await self.send_alert(
message=message,
level="High",
message=message, level="High", alert_type="budget_alerts"
)
return
else:
@ -382,8 +443,7 @@ class SlackAlerting:
result = await _cache.async_get_cache(key=message)
if result is None:
await self.send_alert(
message=message,
level="High",
message=message, level="High", alert_type="budget_alerts"
)
await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
return
@ -395,8 +455,7 @@ class SlackAlerting:
result = await _cache.async_get_cache(key=cache_key)
if result is None:
await self.send_alert(
message=message,
level="Medium",
message=message, level="Medium", alert_type="budget_alerts"
)
await _cache.async_set_cache(key=cache_key, value="SENT", ttl=2419200)
@ -409,15 +468,25 @@ class SlackAlerting:
result = await _cache.async_get_cache(key=message)
if result is None:
await self.send_alert(
message=message,
level="Low",
message=message, level="Low", alert_type="budget_alerts"
)
await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
return
return
async def send_alert(self, message: str, level: Literal["Low", "Medium", "High"]):
async def send_alert(
self,
message: str,
level: Literal["Low", "Medium", "High"],
alert_type: Literal[
"llm_exceptions",
"llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
],
):
"""
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
@ -432,12 +501,6 @@ class SlackAlerting:
level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'.
message: str - what is the alert about
"""
print(
"inside send alert for slack, message: ",
message,
"self.alerting: ",
self.alerting,
)
if self.alerting is None:
return
@ -453,7 +516,15 @@ class SlackAlerting:
if _proxy_base_url is not None:
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
# check if we find the slack webhook url in self.alert_to_webhook_url
if (
self.alert_to_webhook_url is not None
and alert_type in self.alert_to_webhook_url
):
slack_webhook_url = self.alert_to_webhook_url[alert_type]
else:
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
if slack_webhook_url is None:
raise Exception("Missing SLACK_WEBHOOK_URL from environment")
payload = {"text": formatted_message}

View file

@ -96,6 +96,15 @@ class AzureOpenAIConfig(OpenAIConfig):
top_p,
)
def get_mapped_special_auth_params(self) -> dict:
    """
    Return the mapping from provider-agnostic auth parameter names to Azure ones.

    The only generic name mapped here is `token`, which corresponds to `azure_ad_token`.
    """
    azure_auth_aliases = {"token": "azure_ad_token"}
    return azure_auth_aliases
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
    """
    Copy the generic `token` auth parameter into `optional_params` as `azure_ad_token`.

    Args:
        non_default_params: parameters supplied by the caller, keyed by generic name.
        optional_params: dict that receives the Azure-specific key; updated in place.

    Returns:
        The same `optional_params` dict.
    """
    # dict keys are unique, so a direct lookup is equivalent to scanning every item
    if "token" in non_default_params:
        optional_params["azure_ad_token"] = non_default_params["token"]
    return optional_params
def select_azure_base_url_or_endpoint(azure_client_params: dict):
# azure_client_params = {

View file

@ -29,6 +29,24 @@ class BedrockError(Exception):
) # Call the base class constructor with the parameters it needs
class AmazonBedrockGlobalConfig:
    """Bedrock-wide helper that translates generic auth parameter names into AWS ones."""

    def __init__(self):
        # Stateless: there is nothing to initialise.
        pass

    def get_mapped_special_auth_params(self) -> dict:
        """
        Mapping of common auth params across bedrock/vertex/azure/watsonx
        """
        return dict(region_name="aws_region_name")

    def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
        """
        Copy every recognised generic auth parameter into `optional_params` under its AWS name.

        `optional_params` is updated in place and also returned; unrecognised keys are ignored.
        """
        alias_for = self.get_mapped_special_auth_params()
        optional_params.update(
            (alias_for[name], value)
            for name, value in non_default_params.items()
            if name in alias_for
        )
        return optional_params
class AmazonTitanConfig:
"""
Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-text-express-v1
@ -666,6 +684,10 @@ def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
elif provider == "meta":
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
else:
prompt = ""
for message in messages:
@ -945,7 +967,7 @@ def completion(
original_response=json.dumps(response_body),
additional_args={"complete_input_dict": data},
)
print_verbose(f"raw model_response: {response}")
print_verbose(f"raw model_response: {response_body}")
## RESPONSE OBJECT
outputText = "default"
if provider == "ai21":
@ -1058,6 +1080,7 @@ def completion(
outputText = response_body.get("results")[0].get("outputText")
response_metadata = response.get("ResponseMetadata", {})
if response_metadata.get("HTTPStatusCode", 500) >= 400:
raise BedrockError(
message=outputText,
@ -1093,11 +1116,13 @@ def completion(
prompt_tokens = response_metadata.get(
"x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
)
_text_response = model_response["choices"][0]["message"].get("content", "")
completion_tokens = response_metadata.get(
"x-amzn-bedrock-output-token-count",
len(
encoding.encode(
model_response["choices"][0]["message"].get("content", "")
_text_response,
disallowed_special=(),
)
),
)

View file

@ -213,12 +213,13 @@ def get_ollama_response(
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
if optional_params.get("format", "") == "json":
function_call = json.loads(response_json["response"])
message = litellm.Message(
content=None,
tool_calls=[
{
"id": f"call_{str(uuid.uuid4())}",
"function": {"arguments": response_json["response"], "name": ""},
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"type": "function",
}
],
@ -310,15 +311,13 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
if data.get("format", "") == "json":
function_call = json.loads(response_json["response"])
message = litellm.Message(
content=None,
tool_calls=[
{
"id": f"call_{str(uuid.uuid4())}",
"function": {
"arguments": response_json["response"],
"name": "",
},
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"type": "function",
}
],

View file

@ -285,15 +285,13 @@ def get_ollama_response(
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
if data.get("format", "") == "json":
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message(
content=None,
tool_calls=[
{
"id": f"call_{str(uuid.uuid4())}",
"function": {
"arguments": response_json["message"]["content"],
"name": "",
},
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"type": "function",
}
],
@ -415,15 +413,13 @@ async def ollama_acompletion(
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
if data.get("format", "") == "json":
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message(
content=None,
tool_calls=[
{
"id": f"call_{str(uuid.uuid4())}",
"function": {
"arguments": response_json["message"]["content"],
"name": function_name or "",
},
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"type": "function",
}
],

View file

@ -447,6 +447,7 @@ class OpenAIChatCompletion(BaseLLM):
)
else:
openai_aclient = client
## LOGGING
logging_obj.pre_call(
input=data["messages"],

View file

@ -3,8 +3,14 @@ import requests, traceback
import json, re, xml.etree.ElementTree as ET
from jinja2 import Template, exceptions, meta, BaseLoader
from jinja2.sandbox import ImmutableSandboxedEnvironment
from typing import Optional, Any
from typing import List
from typing import (
Any,
List,
Mapping,
MutableMapping,
Optional,
Sequence,
)
import litellm
@ -431,6 +437,35 @@ def format_prompt_togetherai(messages, prompt_format, chat_template):
return prompt
### IBM Granite
def ibm_granite_pt(messages: list):
    """
    Build the prompt string for IBM Granite chat models.

    Granite models use the template:
    <|system|> {system_message} <|user|> {user_message} <|assistant|> {assistant_message}
    See: https://www.ibm.com/docs/en/watsonx-as-a-service?topic=solutions-supported-foundation-models
    """
    # every role shares the same shape: a `<|role|>` header line, then the message, then a newline
    granite_roles = {
        role: {"pre_message": f"<|{role}|>\n", "post_message": "\n"}
        for role in ("system", "user", "assistant")
    }
    rendered = custom_prompt(messages=messages, role_dict=granite_roles)
    return rendered.strip()
### ANTHROPIC ###
@ -1017,6 +1052,30 @@ def get_system_prompt(messages):
return system_prompt, messages
def convert_to_documents(
    observations: Any,
) -> List[MutableMapping]:
    """
    Normalise tool observations into a list of 'document' mappings.

    - a string, or any value that is neither a Mapping nor a Sequence, becomes
      a single `{"output": value}` document;
    - a single Mapping becomes a one-element list;
    - a Sequence is walked element by element, wrapping every non-Mapping
      element as `{"output": element}`.
    """
    if isinstance(observations, str):
        # checked before Sequence: a str must not be iterated character by character
        candidates = [{"output": observations}]
    elif isinstance(observations, Mapping):
        candidates = [observations]
    elif isinstance(observations, Sequence):
        candidates = observations
    else:
        candidates = [{"output": observations}]

    return [
        item if isinstance(item, Mapping) else {"output": item}
        for item in candidates
    ]
def convert_openai_message_to_cohere_tool_result(message):
"""
OpenAI message with a tool result looks like:
@ -1058,7 +1117,7 @@ def convert_openai_message_to_cohere_tool_result(message):
"parameters": {"location": "San Francisco, CA"},
"generation_id": tool_call_id,
},
"outputs": [content],
"outputs": convert_to_documents(content),
}
return cohere_tool_result
@ -1071,7 +1130,7 @@ def cohere_message_pt(messages: list):
if message["role"] == "tool":
tool_result = convert_openai_message_to_cohere_tool_result(message)
tool_results.append(tool_result)
else:
elif message.get("content"):
prompt += message["content"] + "\n\n"
prompt = prompt.rstrip()
return prompt, tool_results
@ -1346,12 +1405,47 @@ def prompt_factory(
return anthropic_pt(messages=messages)
elif "mistral." in model:
return mistral_instruct_pt(messages=messages)
elif "llama2" in model and "chat" in model:
return llama_2_chat_pt(messages=messages)
elif "llama3" in model and "instruct" in model:
return hf_chat_template(
model="meta-llama/Meta-Llama-3-8B-Instruct",
messages=messages,
)
elif custom_llm_provider == "perplexity":
for message in messages:
message.pop("name", None)
return messages
elif custom_llm_provider == "azure_text":
return azure_text_pt(messages=messages)
elif custom_llm_provider == "watsonx":
if "granite" in model and "chat" in model:
# granite-13b-chat-v1 and granite-13b-chat-v2 use a specific prompt template
return ibm_granite_pt(messages=messages)
elif "ibm-mistral" in model and "instruct" in model:
# models like ibm-mistral/mixtral-8x7b-instruct-v01-q use the mistral instruct prompt template
return mistral_instruct_pt(messages=messages)
elif "meta-llama/llama-3" in model and "instruct" in model:
# https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
return custom_prompt(
role_dict={
"system": {
"pre_message": "<|start_header_id|>system<|end_header_id|>\n",
"post_message": "<|eot_id|>",
},
"user": {
"pre_message": "<|start_header_id|>user<|end_header_id|>\n",
"post_message": "<|eot_id|>",
},
"assistant": {
"pre_message": "<|start_header_id|>assistant<|end_header_id|>\n",
"post_message": "<|eot_id|>",
},
},
messages=messages,
initial_prompt_value="<|begin_of_text|>",
final_prompt_value="<|start_header_id|>assistant<|end_header_id|>\n",
)
try:
if "meta-llama/llama-2" in model and "chat" in model:
return llama_2_chat_pt(messages=messages)
@ -1359,11 +1453,8 @@ def prompt_factory(
"meta-llama/llama-3" in model or "meta-llama-3" in model
) and "instruct" in model:
return hf_chat_template(
model=model,
model="meta-llama/Meta-Llama-3-8B-Instruct",
messages=messages,
chat_template=known_tokenizer_config[ # type: ignore
"meta-llama/Meta-Llama-3-8B-Instruct"
]["tokenizer"]["chat_template"],
)
elif (
"tiiuae/falcon" in model

View file

@ -112,10 +112,16 @@ def start_prediction(
}
initial_prediction_data = {
"version": version_id,
"input": input_data,
}
if ":" in version_id and len(version_id) > 64:
model_parts = version_id.split(":")
if (
len(model_parts) > 1 and len(model_parts[1]) == 64
): ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
initial_prediction_data["version"] = model_parts[1]
## LOGGING
logging_obj.pre_call(
input=input_data["prompt"],

View file

@ -143,7 +143,9 @@ class VertexAIConfig:
optional_params["temperature"] = value
if param == "top_p":
optional_params["top_p"] = value
if param == "stream":
if (
param == "stream" and value == True
): # sending stream = False, can cause it to get passed unchecked and raise issues
optional_params["stream"] = value
if param == "n":
optional_params["candidate_count"] = value
@ -182,6 +184,20 @@ class VertexAIConfig:
pass
return optional_params
def get_mapped_special_auth_params(self) -> dict:
    """
    Common auth params across bedrock/vertex_ai/azure/watsonx

    Maps the generic `project` and `region_name` names to their Vertex AI equivalents.
    """
    return dict(project="vertex_project", region_name="vertex_location")
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
    """
    Copy every recognised generic auth parameter into `optional_params` under its Vertex AI name.

    The name translation comes from `get_mapped_special_auth_params`. `optional_params`
    is updated in place and also returned; unrecognised keys are ignored.
    """
    alias_for = self.get_mapped_special_auth_params()
    optional_params.update(
        (alias_for[name], value)
        for name, value in non_default_params.items()
        if name in alias_for
    )
    return optional_params
import asyncio
@ -527,6 +543,7 @@ def completion(
"instances": instances,
"vertex_location": vertex_location,
"vertex_project": vertex_project,
"safety_settings": safety_settings,
**optional_params,
}
if optional_params.get("stream", False) is True:
@ -541,8 +558,9 @@ def completion(
tools = optional_params.pop("tools", None)
prompt, images = _gemini_vision_convert_messages(messages=messages)
content = [prompt] + images
if "stream" in optional_params and optional_params["stream"] == True:
stream = optional_params.pop("stream")
stream = optional_params.pop("stream", False)
if stream == True:
request_str += f"response = llm_model.generate_content({content}, generation_config=GenerationConfig(**{optional_params}), safety_settings={safety_settings}, stream={stream})\n"
logging_obj.pre_call(
input=prompt,
@ -810,6 +828,7 @@ async def async_completion(
instances=None,
vertex_project=None,
vertex_location=None,
safety_settings=None,
**optional_params,
):
"""
@ -820,6 +839,7 @@ async def async_completion(
print_verbose("\nMaking VertexAI Gemini Pro/Vision Call")
print_verbose(f"\nProcessing input messages = {messages}")
tools = optional_params.pop("tools", None)
stream = optional_params.pop("stream", False)
prompt, images = _gemini_vision_convert_messages(messages=messages)
content = [prompt] + images
@ -840,6 +860,7 @@ async def async_completion(
response = await llm_model._generate_content_async(
contents=content,
generation_config=optional_params,
safety_settings=safety_settings,
tools=tools,
)
@ -1018,6 +1039,7 @@ async def async_streaming(
instances=None,
vertex_project=None,
vertex_location=None,
safety_settings=None,
**optional_params,
):
"""
@ -1044,6 +1066,7 @@ async def async_streaming(
response = await llm_model._generate_content_streaming_async(
contents=content,
generation_config=optional_params,
safety_settings=safety_settings,
tools=tools,
)

609
litellm/llms/watsonx.py Normal file
View file

@ -0,0 +1,609 @@
from enum import Enum
import json, types, time # noqa: E401
from contextlib import contextmanager
from typing import Callable, Dict, Optional, Any, Union, List
import httpx
import requests
import litellm
from litellm.utils import ModelResponse, get_secret, Usage
from .base import BaseLLM
from .prompt_templates import factory as ptf
class WatsonXAIError(Exception):
    """
    Error raised for failures while talking to IBM watsonx.ai.

    Attributes:
        status_code: HTTP-style status code describing the failure.
        message: human-readable error description (also the exception text).
        request: `httpx.Request` built from `url` with method POST.
        response: `httpx.Response` carrying `status_code` and `request`.
    """

    def __init__(self, status_code, message, url: Optional[str] = None):
        self.status_code = status_code
        self.message = message
        # Placeholder endpoint used when the failing URL is not known.
        # (The previous default repeated the scheme: "https://https://...".)
        url = url or "https://us-south.ml.cloud.ibm.com"
        self.request = httpx.Request(method="POST", url=url)
        self.response = httpx.Response(status_code=status_code, request=self.request)
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs
class IBMWatsonXAIConfig:
    """
    Reference: https://cloud.ibm.com/apidocs/watsonx-ai#text-generation
    (See ibm_watsonx_ai.metanames.GenTextParamsMetaNames for a list of all available params)

    Supported params for all available watsonx.ai foundational models.

    - `decoding_method` (str): One of "greedy" or "sample"
    - `temperature` (float): Sets the model temperature for sampling - not available when decoding_method='greedy'.
    - `max_new_tokens` (integer): Maximum number of tokens to generate.
    - `min_new_tokens` (integer): Minimum number of tokens to generate.
    - `length_penalty` (dict): A dictionary with keys "decay_factor" and "start_index".
    - `stop_sequences` (string[]): list of strings to use as stop sequences.
    - `top_k` (integer): top k for sampling - not available when decoding_method='greedy'.
    - `top_p` (float): top p for sampling - not available when decoding_method='greedy'.
    - `repetition_penalty` (float): token repetition penalty during text generation.
    - `truncate_input_tokens` (integer): Truncate input tokens to this length.
    - `include_stop_sequences` (bool): If True, the stop sequence will be included at the end of the generated text in the case of a match.
    - `return_options` (dict): A dictionary of options to return. Options include "input_text", "generated_tokens", "input_tokens", "token_ranks". Values are boolean.
    - `random_seed` (integer): Random seed for text generation.
    - `moderations` (dict): Dictionary of properties that control the moderations, for usages such as Hate and profanity (HAP) and PII filtering.
    - `stream` (bool): If True, the model will return a stream of responses.

    Note: the values live on the class. Instantiating the config overwrites the
    class-level defaults, and `get_config()` reads them back from the class.
    """

    decoding_method: Optional[str] = "sample"
    temperature: Optional[float] = None
    max_new_tokens: Optional[int] = None  # litellm.max_tokens
    min_new_tokens: Optional[int] = None
    length_penalty: Optional[dict] = None  # e.g {"decay_factor": 2.5, "start_index": 5}
    stop_sequences: Optional[List[str]] = None  # e.g ["}", ")", "."]
    top_k: Optional[int] = None
    top_p: Optional[float] = None
    repetition_penalty: Optional[float] = None
    truncate_input_tokens: Optional[int] = None
    include_stop_sequences: Optional[bool] = False
    return_options: Optional[Dict[str, bool]] = None
    random_seed: Optional[int] = None  # e.g 42
    moderations: Optional[dict] = None
    stream: Optional[bool] = False

    def __init__(
        self,
        decoding_method: Optional[str] = None,
        temperature: Optional[float] = None,
        max_new_tokens: Optional[int] = None,
        min_new_tokens: Optional[int] = None,
        length_penalty: Optional[dict] = None,
        stop_sequences: Optional[List[str]] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        repetition_penalty: Optional[float] = None,
        truncate_input_tokens: Optional[int] = None,
        include_stop_sequences: Optional[bool] = None,
        return_options: Optional[dict] = None,
        random_seed: Optional[int] = None,
        moderations: Optional[dict] = None,
        stream: Optional[bool] = None,
        **kwargs,
    ) -> None:
        """
        Store every explicitly provided (non-None) setting as a class-level default.

        Extra keyword arguments are accepted but ignored.
        """
        # Snapshot the arguments first so that binding the loop variables below
        # cannot change the mapping being iterated.
        provided = dict(locals())
        for key, value in provided.items():
            # `kwargs` is the catch-all dict, not a watsonx parameter. It is never
            # None, so without this guard it would be stored on the class and then
            # show up in `get_config()`.
            if key in ("self", "kwargs") or value is None:
                continue
            setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        """Return the class-level settings that are not None, as a plain dict."""
        return {
            k: v
            for k, v in cls.__dict__.items()
            if not k.startswith("__")
            and not isinstance(
                v,
                (
                    types.FunctionType,
                    types.BuiltinFunctionType,
                    classmethod,
                    staticmethod,
                ),
            )
            and v is not None
        }

    def get_supported_openai_params(self):
        """Return the OpenAI-style parameter names that have a watsonx.ai equivalent."""
        return [
            "temperature",  # equivalent to temperature
            "max_tokens",  # equivalent to max_new_tokens
            "top_p",  # equivalent to top_p
            "frequency_penalty",  # equivalent to repetition_penalty
            "stop",  # equivalent to stop_sequences
            "seed",  # equivalent to random_seed
            "stream",  # equivalent to stream
        ]

    def get_mapped_special_auth_params(self) -> dict:
        """
        Common auth params across bedrock/vertex_ai/azure/watsonx
        """
        return {
            "project": "watsonx_project",
            "region_name": "watsonx_region_name",
            "token": "watsonx_token",
        }

    def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
        """
        Copy every recognised generic auth parameter into `optional_params` under its watsonx name.

        `optional_params` is updated in place and also returned.
        """
        mapped_params = self.get_mapped_special_auth_params()
        for param, value in non_default_params.items():
            if param in mapped_params:
                optional_params[mapped_params[param]] = value
        return optional_params
def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
    """
    Render chat `messages` into the single prompt string sent to watsonx.ai.

    Resolution order:
    1. a custom prompt template registered for `model` in `custom_prompt_dict`;
    2. the mistral instruct template when `provider` is "ibm-mistralai";
    3. the watsonx branch of the shared prompt factory for everything else
       (including the "ibm" provider).
    """
    if model in custom_prompt_dict:
        # the model has a registered custom prompt template
        template = custom_prompt_dict[model]
        roles = template.get("role_dict", template.get("roles"))
        return ptf.custom_prompt(
            messages=messages,
            role_dict=roles,
            initial_prompt_value=template.get("initial_prompt_value", ""),
            final_prompt_value=template.get("final_prompt_value", ""),
            bos_token=template.get("bos_token", ""),
            eos_token=template.get("eos_token", ""),
        )

    if provider == "ibm-mistralai":
        return ptf.mistral_instruct_pt(messages=messages)

    return ptf.prompt_factory(
        model=model, messages=messages, custom_llm_provider="watsonx"
    )
class WatsonXAIEndpoint(str, Enum):
    """URL paths of watsonx.ai REST endpoints; appended to the configured base URL."""

    # text generation addressed by model id (plain / streaming)
    TEXT_GENERATION = "/ml/v1/text/generation"
    TEXT_GENERATION_STREAM = "/ml/v1/text/generation_stream"
    # text generation addressed by deployment; `{deployment_id}` is filled in via str.format
    DEPLOYMENT_TEXT_GENERATION = "/ml/v1/deployments/{deployment_id}/text/generation"
    DEPLOYMENT_TEXT_GENERATION_STREAM = (
        "/ml/v1/deployments/{deployment_id}/text/generation_stream"
    )
    EMBEDDINGS = "/ml/v1/text/embeddings"
    PROMPTS = "/ml/v1/prompts"
class IBMWatsonXAI(BaseLLM):
"""
Class to interface with IBM Watsonx.ai API for text generation and embeddings.
Reference: https://cloud.ibm.com/apidocs/watsonx-ai
"""
api_version = "2024-03-13"
def __init__(self) -> None:
    """Initialise the handler by delegating to the parent class constructor."""
    super().__init__()
def _prepare_text_generation_req(
    self,
    model_id: str,
    prompt: str,
    stream: bool,
    optional_params: dict,
    print_verbose: Optional[Callable] = None,
) -> dict:
    """
    Assemble the keyword arguments for a watsonx.ai text generation HTTP request.

    Args:
        model_id: a model id, or `deployment/<deployment_id>` to target a deployment.
        prompt: the rendered prompt text.
        stream: selects the streaming variant of the endpoint when truthy.
        optional_params: generation parameters; credential keys, `extra_body`
            and `moderations` are popped from it, and what remains is sent as
            the request `parameters`.
        print_verbose: optional logging callable forwarded to `_get_api_params`.

    Returns:
        dict with `method`, `url`, `headers`, `json` and `params` entries.

    Raises:
        WatsonXAIError: a deployment model was requested without a space_id
            (plus anything raised by `_get_api_params`).
    """
    credentials = self._get_api_params(optional_params, print_verbose=print_verbose)

    # build auth headers
    bearer = credentials.get("token")
    auth_headers = {
        "Authorization": f"Bearer {bearer}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }

    # entries under `extra_body` are merged into the generation parameters
    optional_params.update(optional_params.pop("extra_body", {}))

    # init the payload to the text generation call
    body = {
        "input": prompt,
        "moderations": optional_params.pop("moderations", {}),
        "parameters": optional_params,
    }
    query = dict(version=credentials["api_version"])

    # text generation endpoint: deployment or model / stream or not
    if not model_id.startswith("deployment/"):
        body["model_id"] = model_id
        body["project_id"] = credentials["project_id"]
        if stream:
            path = WatsonXAIEndpoint.TEXT_GENERATION_STREAM
        else:
            path = WatsonXAIEndpoint.TEXT_GENERATION
    else:
        # deployment models are passed in as 'deployment/<deployment_id>'
        if credentials.get("space_id") is None:
            raise WatsonXAIError(
                status_code=401,
                url=credentials["url"],
                message="Error: space_id is required for models called using the 'deployment/' endpoint. Pass in the space_id as a parameter or set it in the WX_SPACE_ID environment variable.",
            )
        # everything after the first '/' is the deployment id
        deployment_id = model_id.split("/", 1)[1]
        if stream:
            template = WatsonXAIEndpoint.DEPLOYMENT_TEXT_GENERATION_STREAM.value
        else:
            template = WatsonXAIEndpoint.DEPLOYMENT_TEXT_GENERATION.value
        path = template.format(deployment_id=deployment_id)

    full_url = credentials["url"].rstrip("/") + path
    return dict(
        method="POST", url=full_url, headers=auth_headers, json=body, params=query
    )
def _get_api_params(
    self, params: dict, print_verbose: Optional[Callable] = None
) -> dict:
    """
    Resolve the watsonx.ai connection settings.

    Values are taken from `params` first (recognised keys are popped from it),
    then from secrets / environment variables, and finally overridden by an
    explicit credentials dict (`wx_credentials` / `watsonx_credentials`).
    When only an API key is available, an IAM token is generated from it.

    Returns:
        dict with keys `url`, `api_key`, `token`, `project_id`, `space_id`,
        `region_name` and `api_version`.

    Raises:
        WatsonXAIError: (status 401) when the URL, the credentials (token or
            API key) or the project id cannot be resolved.
    """

    def first_secret(*names):
        # Same result as `get_secret(a) or get_secret(b) or ...`: stop at the
        # first truthy value, otherwise return whatever the last lookup gave.
        found = None
        for name in names:
            found = get_secret(name)
            if found:
                break
        return found

    # Load auth variables from params (all aliases are popped, innermost first)
    base_url_alias = params.pop("base_url", None)
    api_base_alias = params.pop("api_base", base_url_alias)
    url = params.pop("url", api_base_alias)
    api_key = params.pop("apikey", None)
    token = params.pop("token", None)
    # watsonx.ai project_id - allow 'watsonx_project' to be consistent with how
    # vertex project implementation works -> reduce provider-specific params
    watsonx_project = params.pop("watsonx_project", None)
    project_id = params.pop("project_id", watsonx_project)
    space_id = params.pop("space_id", None)  # watsonx.ai deployment space_id
    region_alias = params.pop("region", None)
    region_name = params.pop("region_name", region_alias)
    if region_name is None:
        # consistent with how vertex ai + aws regions are accepted
        watsonx_region = params.pop("watsonx_region", None)
        region_name = params.pop("watsonx_region_name", watsonx_region)
    # follow {provider}_credentials, same as vertex ai
    watsonx_credentials = params.pop("watsonx_credentials", None)
    wx_credentials = params.pop("wx_credentials", watsonx_credentials)
    api_version = params.pop("api_version", IBMWatsonXAI.api_version)

    # Load auth variables from environment variables
    if url is None:
        url = first_secret(
            "WATSONX_API_BASE",  # consistent with 'AZURE_API_BASE'
            "WATSONX_URL",
            "WX_URL",
            "WML_URL",
        )
    if api_key is None:
        api_key = first_secret("WATSONX_APIKEY", "WATSONX_API_KEY", "WX_API_KEY")
    if token is None:
        token = first_secret("WATSONX_TOKEN", "WX_TOKEN")
    if project_id is None:
        project_id = first_secret("WATSONX_PROJECT_ID", "WX_PROJECT_ID", "PROJECT_ID")
    if region_name is None:
        region_name = first_secret("WATSONX_REGION", "WX_REGION", "REGION")
    if space_id is None:
        space_id = first_secret(
            "WATSONX_DEPLOYMENT_SPACE_ID",
            "WATSONX_SPACE_ID",
            "WX_SPACE_ID",
            "SPACE_ID",
        )

    # credentials parsing - an explicit credentials dict overrides the values above
    if wx_credentials is not None:
        url = wx_credentials.get("url", url)
        fallback_key = wx_credentials.get("api_key", api_key)
        api_key = wx_credentials.get("apikey", fallback_key)
        # follow format of {provider}_token, same as azure - e.g. 'azure_ad_token=..'
        fallback_token = wx_credentials.get("watsonx_token", token)
        token = wx_credentials.get("token", fallback_token)

    # verify that all required credentials are present
    if url is None:
        raise WatsonXAIError(
            status_code=401,
            message="Error: Watsonx URL not set. Set WX_URL in environment variables or pass in as a parameter.",
        )
    if token is None:
        if api_key is None:
            raise WatsonXAIError(
                status_code=401,
                url=url,
                message="Error: API key or token not found. Set WX_API_KEY or WX_TOKEN in environment variables or pass in as a parameter.",
            )
        # generate the auth token
        if print_verbose:
            print_verbose("Generating IAM token for Watsonx.ai")
        token = self.generate_iam_token(api_key)
    if project_id is None:
        raise WatsonXAIError(
            status_code=401,
            url=url,
            message="Error: Watsonx project_id not set. Set WX_PROJECT_ID in environment variables or pass in as a parameter.",
        )

    return dict(
        url=url,
        api_key=api_key,
        token=token,
        project_id=project_id,
        space_id=space_id,
        region_name=region_name,
        api_version=api_version,
    )
def completion(
    self,
    model: str,
    messages: list,
    custom_prompt_dict: dict,
    model_response: ModelResponse,
    print_verbose: Callable,
    encoding,
    logging_obj,
    optional_params: dict,
    litellm_params: Optional[dict] = None,
    logger_fn=None,
    timeout: Optional[float] = None,
):
    """
    Send a text generation request to the IBM Watsonx.ai API.
    Reference: https://cloud.ibm.com/apidocs/watsonx-ai#text-generation

    Args:
        model: watsonx model id, or `deployment/<deployment_id>`.
        messages: chat messages, rendered into a single prompt string.
        custom_prompt_dict: per-model custom prompt templates.
        model_response: response object that is filled in and returned for
            non-streaming calls.
        print_verbose: logging callable used while resolving credentials.
        encoding, litellm_params, logger_fn: currently unused by this method.
        logging_obj: logging handler; receives pre/post call notifications.
        optional_params: generation parameters; `stream` is popped from it and
            missing entries are filled from `IBMWatsonXAIConfig`.
        timeout: optional request timeout forwarded to the HTTP call.

    Returns:
        The populated `model_response`, or a `litellm.CustomStreamWrapper`
        when streaming was requested.

    Raises:
        WatsonXAIError: on any failure; unexpected errors are wrapped with
            status code 500.
    """
    stream = optional_params.pop("stream", False)

    # Load default configs
    config = IBMWatsonXAIConfig.get_config()
    for k, v in config.items():
        if k not in optional_params:
            optional_params[k] = v

    # Make prompt to send to model
    provider = model.split("/")[0]
    prompt = convert_messages_to_prompt(
        model, messages, provider, custom_prompt_dict
    )

    def process_text_request(request_params: dict) -> ModelResponse:
        with self._manage_response(
            request_params, logging_obj=logging_obj, input=prompt, timeout=timeout
        ) as resp:
            json_resp = resp.json()

        result = json_resp["results"][0]
        prompt_tokens = result["input_token_count"]
        completion_tokens = result["generated_token_count"]
        model_response["choices"][0]["message"]["content"] = result["generated_text"]
        # The finish reason belongs to the choice in the OpenAI response shape.
        model_response["choices"][0]["finish_reason"] = result["stop_reason"]
        # Also kept at the top level, where earlier versions of this method put it.
        model_response["finish_reason"] = result["stop_reason"]
        model_response["created"] = int(time.time())
        model_response["model"] = model
        setattr(
            model_response,
            "usage",
            Usage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens + completion_tokens,
            ),
        )
        return model_response

    def process_stream_request(
        request_params: dict,
    ) -> litellm.CustomStreamWrapper:
        # stream the response - generated chunks will be handled
        # by litellm.utils.CustomStreamWrapper.handle_watsonx_stream
        with self._manage_response(
            request_params,
            logging_obj=logging_obj,
            stream=True,
            input=prompt,
            timeout=timeout,
        ) as resp:
            response = litellm.CustomStreamWrapper(
                resp.iter_lines(),
                model=model,
                custom_llm_provider="watsonx",
                logging_obj=logging_obj,
            )
        return response

    try:
        ## Get the response from the model
        req_params = self._prepare_text_generation_req(
            model_id=model,
            prompt=prompt,
            stream=stream,
            optional_params=optional_params,
            print_verbose=print_verbose,
        )
        if stream:
            return process_stream_request(req_params)
        return process_text_request(req_params)
    except WatsonXAIError:
        # already carries the right status code and message
        raise
    except Exception as e:
        # keep the original exception as the cause for debugging
        raise WatsonXAIError(status_code=500, message=str(e)) from e
def embedding(
    self,
    model: str,
    input: Union[list, str],
    api_key: Optional[str] = None,
    logging_obj=None,
    model_response=None,
    optional_params=None,
    encoding=None,
):
    """
    Send a text embedding request to the IBM Watsonx.ai API.

    Args:
        model: watsonx embedding model id.
        input: a single string or a list of strings to embed.
        api_key: optional API key used to authenticate this call.
        logging_obj: logging handler; receives pre/post call notifications.
        model_response: response object that is filled in and returned.
        optional_params: extra request parameters; credential keys are
            popped from it and missing entries are filled from
            `IBMWatsonXAIConfig`.
        encoding: currently unused by this method.

    Returns:
        `model_response` populated with `object`, `data`, `model` and `usage`.
    """
    if optional_params is None:
        optional_params = {}

    # Load default configs
    config = IBMWatsonXAIConfig.get_config()
    for k, v in config.items():
        if k not in optional_params:
            optional_params[k] = v

    if isinstance(input, str):
        input = [input]

    if api_key is not None:
        # `_get_api_params` pops the key under the name "apikey". Storing it as
        # "api_key" left it unused for authentication and sent it along inside
        # the request `parameters`.
        optional_params["apikey"] = api_key
    api_params = self._get_api_params(optional_params)

    # build auth headers
    api_token = api_params.get("token")
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }

    # init the payload to the text embeddings call
    payload = {
        "inputs": input,
        "model_id": model,
        "project_id": api_params["project_id"],
        "parameters": optional_params,
    }
    request_params = dict(version=api_params["api_version"])
    url = api_params["url"].rstrip("/") + WatsonXAIEndpoint.EMBEDDINGS
    req_params = {
        "method": "POST",
        "url": url,
        "headers": headers,
        "json": payload,
        "params": request_params,
    }

    with self._manage_response(
        req_params, logging_obj=logging_obj, input=input
    ) as resp:
        json_resp = resp.json()

    results = json_resp.get("results", [])
    embedding_response = [
        {"object": "embedding", "index": idx, "embedding": result["embedding"]}
        for idx, result in enumerate(results)
    ]
    model_response["object"] = "list"
    model_response["data"] = embedding_response
    model_response["model"] = model
    input_tokens = json_resp.get("input_token_count", 0)
    model_response.usage = Usage(
        prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
    )
    return model_response
def generate_iam_token(self, api_key=None, **params):
    """
    Exchange an IBM Cloud API key for an IAM access token.

    Args:
        api_key: the API key; when None it is looked up via `get_secret`
            under WX_API_KEY, WATSONX_API_KEY, then WATSONX_APIKEY.
        **params: accepted but currently unused.

    Returns:
        The `access_token` string from the IAM response. It is also stored
        on `self.token`.

    Raises:
        ValueError: no API key was given and none could be looked up.
        httpx.HTTPStatusError: the IAM endpoint answered with an error status.
    """
    if api_key is None:
        # WATSONX_APIKEY is checked last so existing lookups keep their
        # precedence; `_get_api_params` accepts the same variable name.
        api_key = (
            get_secret("WX_API_KEY")
            or get_secret("WATSONX_API_KEY")
            or get_secret("WATSONX_APIKEY")
        )
    if api_key is None:
        raise ValueError("API key is required")

    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "application/json",
    }
    data = {
        "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
        "apikey": api_key,
    }
    response = httpx.post(
        "https://iam.cloud.ibm.com/identity/token", data=data, headers=headers
    )
    response.raise_for_status()
    json_data = response.json()
    iam_access_token = json_data["access_token"]
    self.token = iam_access_token
    return iam_access_token
@contextmanager
def _manage_response(
    self,
    request_params: dict,
    logging_obj: Any,
    stream: bool = False,
    input: Optional[Any] = None,
    timeout: Optional[float] = None,
):
    """
    Context manager that performs the HTTP request and wraps it with logging.

    Calls `logging_obj.pre_call` before the request, yields the
    `requests.Response`, and for non-streaming calls invokes
    `logging_obj.post_call` once the `with` body has finished.

    Args:
        request_params: keyword arguments for `requests.request`; must contain
            `method`, `url`, `headers` and `json`. A `timeout` entry is added
            when `timeout` is truthy.
        logging_obj: logging handler exposing `pre_call` / `post_call`.
        stream: request a streamed response and skip the post-call logging.
        input: the original input, forwarded to the logging calls.
        timeout: optional request timeout.

    Raises:
        WatsonXAIError: for any failure of the request or of the `with` body.
            HTTP error responses keep their real status code; other failures
            are reported with status 500.
    """
    request_str = (
        f"response = {request_params['method']}(\n"
        f"\turl={request_params['url']},\n"
        f"\tjson={request_params['json']},\n"
        f")"
    )
    logging_obj.pre_call(
        input=input,
        api_key=request_params["headers"].get("Authorization"),
        additional_args={
            "complete_input_dict": request_params["json"],
            "request_str": request_str,
        },
    )
    if timeout:
        request_params["timeout"] = timeout
    try:
        if stream:
            resp = requests.request(
                **request_params,
                stream=True,
            )
        else:
            resp = requests.request(**request_params)
        resp.raise_for_status()
        yield resp
    except WatsonXAIError:
        # raised inside the `with` body - already typed, do not re-wrap as 500
        raise
    except requests.exceptions.HTTPError as e:
        # Preserve the server's status code (401, 404, 429, ...) instead of
        # reporting every HTTP failure as 500. Compare against None explicitly:
        # a requests.Response is falsy whenever its status is an error.
        failed = e.response
        status_code = failed.status_code if failed is not None else 500
        raise WatsonXAIError(status_code=status_code, message=str(e)) from e
    except Exception as e:
        raise WatsonXAIError(status_code=500, message=str(e)) from e
    if not stream:
        logging_obj.post_call(
            input=input,
            api_key=request_params["headers"].get("Authorization"),
            original_response=json.dumps(resp.json()),
            additional_args={
                "status_code": resp.status_code,
                "complete_input_dict": request_params["json"],
            },
        )

View file

@ -63,6 +63,7 @@ from .llms import (
vertex_ai,
vertex_ai_anthropic,
maritalk,
watsonx,
)
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
from .llms.azure import AzureChatCompletion
@ -360,7 +361,7 @@ def mock_completion(
model: str,
messages: List,
stream: Optional[bool] = False,
mock_response: str = "This is a mock request",
mock_response: Union[str, Exception] = "This is a mock request",
logging=None,
**kwargs,
):
@ -387,6 +388,20 @@ def mock_completion(
- If 'stream' is True, it returns a response that mimics the behavior of a streaming completion.
"""
try:
## LOGGING
if logging is not None:
logging.pre_call(
input=messages,
api_key="mock-key",
)
if isinstance(mock_response, Exception):
raise litellm.APIError(
status_code=500, # type: ignore
message=str(mock_response),
llm_provider="openai", # type: ignore
model=model, # type: ignore
request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
)
model_response = ModelResponse(stream=stream)
if stream is True:
# don't try to access stream object,
@ -1864,6 +1879,43 @@ def completion(
## RESPONSE OBJECT
response = response
elif custom_llm_provider == "watsonx":
custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
response = watsonx.IBMWatsonXAI().completion(
model=model,
messages=messages,
custom_prompt_dict=custom_prompt_dict,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params, # type: ignore
logger_fn=logger_fn,
encoding=encoding,
logging_obj=logging,
timeout=timeout,
)
if (
"stream" in optional_params
and optional_params["stream"] == True
and not isinstance(response, CustomStreamWrapper)
):
# don't try to access stream object,
response = CustomStreamWrapper(
iter(response),
model,
custom_llm_provider="watsonx",
logging_obj=logging,
)
if optional_params.get("stream", False):
## LOGGING
logging.post_call(
input=messages,
api_key=None,
original_response=response,
)
## RESPONSE OBJECT
response = response
elif custom_llm_provider == "vllm":
custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
model_response = vllm.completion(
@ -2943,6 +2995,15 @@ def embedding(
client=client,
aembedding=aembedding,
)
elif custom_llm_provider == "watsonx":
response = watsonx.IBMWatsonXAI().embedding(
model=model,
input=input,
encoding=encoding,
logging_obj=logging,
optional_params=optional_params,
model_response=EmbeddingResponse(),
)
else:
args = locals()
raise ValueError(f"No valid embedding model args passed in - {args}")

View file

@ -1418,6 +1418,123 @@
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-13b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-13b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-instruct-v0.2": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.000001,
"litellm_provider": "replicate",
"mode": "chat"
},
"openrouter/openai/gpt-3.5-turbo": {
"max_tokens": 4095,
"input_cost_per_token": 0.0000015,
@ -1455,6 +1572,17 @@
"litellm_provider": "openrouter",
"mode": "chat"
},
"openrouter/anthropic/claude-3-opus": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"litellm_provider": "openrouter",
"mode": "chat",
"supports_function_calling": true,
"tool_use_system_prompt_tokens": 395
},
"openrouter/google/palm-2-chat-bison": {
"max_tokens": 8000,
"input_cost_per_token": 0.0000005,
@ -2379,6 +2507,24 @@
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-8b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000006,
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-70b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000265,
"output_cost_per_token": 0.0000035,
"litellm_provider": "bedrock",
"mode": "chat"
},
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
"max_tokens": 77,
"max_input_tokens": 77,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{93553:function(n,e,t){Promise.resolve().then(t.t.bind(t,63385,23)),Promise.resolve().then(t.t.bind(t,99646,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=93553)}),_N_E=n.O()}]);

View file

@ -1 +0,0 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]);
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{70377:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(70377)}),_N_E=e.O()}]);

View file

@ -1 +0,0 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]);

View file

@ -1 +1 @@
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/60d9f441227ccc7e.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var 
l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/9f51f0573c6b0365.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var 
l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1,5 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-50c1dadc6557c101.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-50c1dadc6557c101.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/60d9f441227ccc7e.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[82332,[\"127\",\"static/chunks/127-efd0436630e294eb.js\",\"931\",\"static/chunks/app/page-525d83925fd5350b.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/60d9f441227ccc7e.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"Csz8BqWx6JEoKsgLqCeCt\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[
\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be 
found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<<<<<<< HEAD
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-202e312607f242a1.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-202e312607f242a1.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/00c2ddbcd01819c0.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[46414,[\"761\",\"static/chunks/761-05f8a8451296476c.js\",\"931\",\"static/chunks/app/page-5a4a198eefedc775.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/00c2ddbcd01819c0.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"c5rha8cqAah-saaczjn02\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[
\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be 
found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
=======
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-65a932b4e8bd8abb.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-096338c8e1915716.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-65a932b4e8bd8abb.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/9f51f0573c6b0365.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[46414,[\"386\",\"static/chunks/386-d811195b597a2122.js\",\"931\",\"static/chunks/app/page-e0ee34389254cdf2.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/9f51f0573c6b0365.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"dWGL92c5LzTMn7XX6utn2\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[
\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_12bbc4\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be 
found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
>>>>>>> 73a7b4f4 (refactor(main.py): trigger new build)

View file

@ -1,7 +1,14 @@
2:I[77831,[],""]
3:I[82332,["127","static/chunks/127-efd0436630e294eb.js","931","static/chunks/app/page-525d83925fd5350b.js"],""]
<<<<<<< HEAD
3:I[46414,["761","static/chunks/761-05f8a8451296476c.js","931","static/chunks/app/page-5a4a198eefedc775.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["Csz8BqWx6JEoKsgLqCeCt",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/60d9f441227ccc7e.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["c5rha8cqAah-saaczjn02",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00c2ddbcd01819c0.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
=======
3:I[46414,["386","static/chunks/386-d811195b597a2122.js","931","static/chunks/app/page-e0ee34389254cdf2.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["dWGL92c5LzTMn7XX6utn2",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/9f51f0573c6b0365.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
>>>>>>> 73a7b4f4 (refactor(main.py): trigger new build)
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -1,51 +1,15 @@
environment_variables:
SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
general_settings:
alerting:
- slack
alerting_threshold: 300
database_connection_pool_limit: 100
database_connection_timeout: 60
health_check_interval: 300
proxy_batch_write_at: 10
ui_access_mode: all
litellm_settings:
allowed_fails: 3
failure_callback:
- prometheus
fallbacks:
- gpt-3.5-turbo:
- fake-openai-endpoint
- gpt-4
num_retries: 3
service_callback:
- prometheus_system
success_callback:
- prometheus
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
- litellm_params:
model: gpt-3.5-turbo
model_name: gpt-3.5-turbo
- model_name: llama-3
litellm_params:
model: replicate/meta/meta-llama-3-8b-instruct
router_settings:
allowed_fails: 3
context_window_fallbacks: null
cooldown_time: 1
fallbacks:
- gpt-3.5-turbo:
- fake-openai-endpoint
- gpt-4
- gpt-3.5-turbo-3:
- fake-openai-endpoint
num_retries: 3
retry_after: 0
routing_strategy: simple-shuffle
routing_strategy_args: {}
timeout: 6000
num_retries: 0
enable_pre_call_checks: true
redis_host: os.environ/REDIS_HOST
redis_password: os.environ/REDIS_PASSWORD
redis_port: os.environ/REDIS_PORT
litellm_settings:
success_callback: ["openmeter"]

View file

@ -422,6 +422,9 @@ class LiteLLM_ModelTable(LiteLLMBase):
created_by: str
updated_by: str
class Config:
protected_namespaces = ()
class NewUserRequest(GenerateKeyRequest):
max_budget: Optional[float] = None
@ -485,6 +488,9 @@ class TeamBase(LiteLLMBase):
class NewTeamRequest(TeamBase):
model_aliases: Optional[dict] = None
class Config:
protected_namespaces = ()
class GlobalEndUsersSpend(LiteLLMBase):
api_key: Optional[str] = None
@ -534,6 +540,9 @@ class LiteLLM_TeamTable(TeamBase):
budget_reset_at: Optional[datetime] = None
model_id: Optional[int] = None
class Config:
protected_namespaces = ()
@root_validator(pre=True)
def set_model_info(cls, values):
dict_fields = [
@ -570,6 +579,9 @@ class LiteLLM_BudgetTable(LiteLLMBase):
model_max_budget: Optional[dict] = None
budget_duration: Optional[str] = None
class Config:
protected_namespaces = ()
class NewOrganizationRequest(LiteLLM_BudgetTable):
organization_id: Optional[str] = None
@ -720,6 +732,10 @@ class ConfigGeneralSettings(LiteLLMBase):
None,
description="List of alerting types. By default it is all alerts",
)
alert_to_webhook_url: Optional[Dict] = Field(
None,
description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`",
)
alerting_threshold: Optional[int] = Field(
None,
@ -896,5 +912,19 @@ class LiteLLM_SpendLogs(LiteLLMBase):
request_tags: Optional[Json] = None
class LiteLLM_ErrorLogs(LiteLLMBase):
    """Row written to the `LiteLLM_ErrorLogs` table for one failed LLM call.

    All string fields default to empty strings; `startTime` / `endTime`
    accept either a datetime or its string form.
    """

    # Generated per instance. A plain `= str(uuid.uuid4())` default would be
    # evaluated once at class-definition time, so every row created without an
    # explicit id would share the same primary key.
    request_id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
    api_base: Optional[str] = ""
    model_group: Optional[str] = ""  # public model_name / model_group
    litellm_model_name: Optional[str] = ""  # model passed to litellm
    model_id: Optional[str] = ""
    request_kwargs: Optional[dict] = {}
    exception_type: Optional[str] = ""
    status_code: Optional[str] = ""
    exception_string: Optional[str] = ""
    startTime: Union[str, datetime, None]
    endTime: Union[str, datetime, None]
class LiteLLM_SpendLogs_ResponseObject(LiteLLMBase):
response: Optional[List[Union[LiteLLM_SpendLogs, Any]]] = None

View file

@ -95,7 +95,15 @@ def common_checks(
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
)
# 7. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
if litellm.max_budget > 0 and global_proxy_spend is not None:
if (
litellm.max_budget > 0
and global_proxy_spend is not None
# only run global budget checks for OpenAI routes
# Reason - the Admin UI should continue working if the proxy crosses its global budget
and route in LiteLLMRoutes.openai_routes.value
and route != "/v1/models"
and route != "/models"
):
if global_proxy_spend > litellm.max_budget:
raise Exception(
f"ExceededBudget: LiteLLM Proxy has exceeded its budget. Current spend: {global_proxy_spend}; Max Budget: {litellm.max_budget}"

View file

@ -1059,8 +1059,18 @@ async def user_api_key_auth(
):
pass
else:
user_role = "unknown"
user_id = "unknown"
if user_id_information is not None and isinstance(
user_id_information, list
):
_user = user_id_information[0]
user_role = _user.get("user_role", {}).get(
"user_role", "unknown"
)
user_id = _user.get("user_id", "unknown")
raise Exception(
f"Only master key can be used to generate, delete, update info for new keys/users/teams. Route={route}"
f"Only proxy admin can be used to generate, delete, update info for new keys/users/teams. Route={route}. Your role={user_role}. Your user_id={user_id}"
)
# check if token is from litellm-ui, litellm ui makes keys to allow users to login with sso. These keys can only be used for LiteLLM UI functions
@ -1207,6 +1217,68 @@ def cost_tracking():
litellm.success_callback.append(_PROXY_track_cost_callback) # type: ignore
async def _PROXY_failure_handler(
    kwargs,  # kwargs to completion
    completion_response: litellm.ModelResponse,  # response from completion
    start_time=None,
    end_time=None,  # start/end time for completion
):
    """Persist one failed LLM call to the `LiteLLM_ErrorLogs` table.

    Registered as a litellm failure callback. Does nothing when no Prisma
    client is configured.

    Args:
        kwargs: The kwargs of the failed completion call. Reads `exception`,
            `model`, `optional_params`, `litellm_params`, `start_time` and
            `end_time` from it.
        completion_response: Unused; part of the callback signature.
        start_time: Used only when `kwargs` carries no `start_time`.
        end_time: Used only when `kwargs` carries no `end_time`.
    """
    global prisma_client
    if prisma_client is None:
        return

    verbose_proxy_logger.debug(
        "inside _PROXY_failure_handler kwargs=", extra=kwargs
    )

    _exception = kwargs.get("exception")
    _exception_type = _exception.__class__.__name__
    _model = kwargs.get("model", None)

    # Store a bounded, string-only copy of the request params. Building a new
    # dict (instead of rebinding the loop variable, which had no effect) makes
    # the 100-char cap actually apply and leaves the caller's dict untouched.
    _optional_params = {
        k: str(v)[:100] for k, v in (kwargs.get("optional_params") or {}).items()
    }

    _status_code = "500"
    try:
        _status_code = str(_exception.status_code)
    except Exception:
        # Don't let this fail logging the exception to the dB
        pass

    _litellm_params = kwargs.get("litellm_params", {}) or {}
    _metadata = _litellm_params.get("metadata", {}) or {}
    # `model_info` may be present but None; treat that like "missing".
    _model_id = (_metadata.get("model_info") or {}).get("id", "")
    _model_group = _metadata.get("model_group", "")
    api_base = litellm.get_api_base(model=_model, optional_params=_litellm_params)
    _exception_string = str(_exception)[:500]

    error_log = LiteLLM_ErrorLogs(
        request_id=str(uuid.uuid4()),
        model_group=_model_group,
        model_id=_model_id,
        litellm_model_name=_model,
        request_kwargs=_optional_params,
        api_base=api_base,
        exception_type=_exception_type,
        status_code=_status_code,
        exception_string=_exception_string,
        # Prefer the timestamps in kwargs; fall back to the callback arguments.
        startTime=kwargs.get("start_time") or start_time,
        endTime=kwargs.get("end_time") or end_time,
    )

    # helper function to convert to dict on pydantic v2 & v1
    error_log_dict = _get_pydantic_json_dict(error_log)
    error_log_dict["request_kwargs"] = json.dumps(error_log_dict["request_kwargs"])

    await prisma_client.db.litellm_errorlogs.create(
        data=error_log_dict  # type: ignore
    )
async def _PROXY_track_cost_callback(
kwargs, # kwargs to completion
completion_response: litellm.ModelResponse, # response from completion
@ -1292,6 +1364,15 @@ async def _PROXY_track_cost_callback(
verbose_proxy_logger.debug("error in tracking cost callback - %s", e)
def error_tracking():
    """Register `_PROXY_failure_handler` as a litellm failure callback.

    Only acts when a DB client (`prisma_client` or `custom_db_client`) is set
    and `litellm.failure_callback` is a list. The handler is appended at most
    once, so calling this repeatedly is safe.
    """
    global prisma_client, custom_db_client
    if prisma_client is None and custom_db_client is None:
        return
    if not isinstance(litellm.failure_callback, list):
        return

    # Message previously said "track cost" (copied from cost_tracking).
    verbose_proxy_logger.debug("setting litellm failure callback to track errors")
    if _PROXY_failure_handler not in litellm.failure_callback:  # type: ignore
        litellm.failure_callback.append(_PROXY_failure_handler)  # type: ignore
def _set_spend_logs_payload(
payload: dict, prisma_client: PrismaClient, spend_logs_url: Optional[str] = None
):
@ -2612,6 +2693,7 @@ class ProxyConfig:
environment_variables = config_data.get("environment_variables", {})
for k, v in environment_variables.items():
try:
if v is not None:
decoded_b64 = base64.b64decode(v)
value = decrypt_value(value=decoded_b64, master_key=master_key) # type: ignore
os.environ[k] = value
@ -2632,9 +2714,17 @@ class ProxyConfig:
if "alert_types" in _general_settings:
general_settings["alert_types"] = _general_settings["alert_types"]
proxy_logging_obj.alert_types = general_settings["alert_types"]
proxy_logging_obj.slack_alerting_instance.alert_types = general_settings[
"alert_types"
proxy_logging_obj.slack_alerting_instance.update_values(
alert_types=general_settings["alert_types"]
)
if "alert_to_webhook_url" in _general_settings:
general_settings["alert_to_webhook_url"] = _general_settings[
"alert_to_webhook_url"
]
proxy_logging_obj.slack_alerting_instance.update_values(
alert_to_webhook_url=general_settings["alert_to_webhook_url"]
)
# router settings
if llm_router is not None and prisma_client is not None:
@ -3176,6 +3266,9 @@ async def startup_event():
## COST TRACKING ##
cost_tracking()
## Error Tracking ##
error_tracking()
db_writer_client = HTTPHandler()
proxy_logging_obj._init_litellm_callbacks() # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made
@ -3655,6 +3748,17 @@ async def chat_completion(
if data["model"] in litellm.model_alias_map:
data["model"] = litellm.model_alias_map[data["model"]]
## LOGGING OBJECT ## - initialize logging object for logging success/failure events for call
data["litellm_call_id"] = str(uuid.uuid4())
logging_obj, data = litellm.utils.function_setup(
original_function="acompletion",
rules_obj=litellm.utils.Rules(),
start_time=datetime.now(),
**data,
)
data["litellm_logging_obj"] = logging_obj
### CALL HOOKS ### - modify incoming data before calling the model
data = await proxy_logging_obj.pre_call_hook(
user_api_key_dict=user_api_key_dict, data=data, call_type="completion"
@ -7421,9 +7525,9 @@ async def model_info_v2(
)
async def model_metrics(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
_selected_model_group: Optional[str] = None,
startTime: Optional[datetime] = datetime.now() - timedelta(days=30),
endTime: Optional[datetime] = datetime.now(),
_selected_model_group: Optional[str] = "gpt-4-32k",
startTime: Optional[datetime] = None,
endTime: Optional[datetime] = None,
):
global prisma_client, llm_router
if prisma_client is None:
@ -7433,65 +7537,214 @@ async def model_metrics(
param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
if _selected_model_group and llm_router is not None:
_model_list = llm_router.get_model_list()
_relevant_api_bases = []
for model in _model_list:
if model["model_name"] == _selected_model_group:
_litellm_params = model["litellm_params"]
_api_base = _litellm_params.get("api_base", "")
_relevant_api_bases.append(_api_base)
_relevant_api_bases.append(_api_base + "/openai/")
startTime = startTime or datetime.now() - timedelta(days=30)
endTime = endTime or datetime.now()
sql_query = """
SELECT
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END AS combined_model_api_base,
COUNT(*) AS num_requests,
AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) AS avg_latency_seconds
FROM "LiteLLM_SpendLogs"
WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
AND api_base = ANY($3)
GROUP BY CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END
ORDER BY num_requests DESC
LIMIT 50;
api_base,
model,
DATE_TRUNC('day', "startTime")::DATE AS day,
AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) / SUM(total_tokens) AS avg_latency_per_token
FROM
"LiteLLM_SpendLogs"
WHERE
"startTime" >= NOW() - INTERVAL '30 days'
AND "model" = $1 AND "cache_hit" != 'True'
GROUP BY
api_base,
model,
day
HAVING
SUM(total_tokens) > 0
ORDER BY
avg_latency_per_token DESC;
"""
_all_api_bases = set()
db_response = await prisma_client.db.query_raw(
sql_query, _selected_model_group, startTime, endTime
)
_daily_entries: dict = {} # {"Jun 23": {"model1": 0.002, "model2": 0.003}}
if db_response is not None:
for model_data in db_response:
_api_base = model_data["api_base"]
_model = model_data["model"]
_day = model_data["day"]
_avg_latency_per_token = model_data["avg_latency_per_token"]
if _day not in _daily_entries:
_daily_entries[_day] = {}
_combined_model_name = str(_model)
if "https://" in _api_base:
_combined_model_name = str(_api_base)
if "/openai/" in _combined_model_name:
_combined_model_name = _combined_model_name.split("/openai/")[0]
_all_api_bases.add(_combined_model_name)
_daily_entries[_day][_combined_model_name] = _avg_latency_per_token
"""
each entry needs to be like this:
{
date: 'Jun 23',
'gpt-4-https://api.openai.com/v1/': 0.002,
'gpt-43-https://api.openai.com-12/v1/': 0.002,
}
"""
# convert daily entries to list of dicts
response: List[dict] = []
# sort daily entries by date
_daily_entries = dict(sorted(_daily_entries.items(), key=lambda item: item[0]))
for day in _daily_entries:
entry = {"date": str(day)}
for model_key, latency in _daily_entries[day].items():
entry[model_key] = latency
response.append(entry)
return {
"data": response,
"all_api_bases": list(_all_api_bases),
}
@router.get(
"/model/metrics/slow_responses",
description="View number of hanging requests per model_group",
tags=["model management"],
include_in_schema=False,
dependencies=[Depends(user_api_key_auth)],
)
async def model_metrics_slow_responses(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
_selected_model_group: Optional[str] = "gpt-4-32k",
startTime: Optional[datetime] = None,
endTime: Optional[datetime] = None,
):
global prisma_client, llm_router, proxy_logging_obj
if prisma_client is None:
raise ProxyException(
message="Prisma Client is not initialized",
type="internal_error",
param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
startTime = startTime or datetime.now() - timedelta(days=30)
endTime = endTime or datetime.now()
alerting_threshold = (
proxy_logging_obj.slack_alerting_instance.alerting_threshold or 300
)
alerting_threshold = int(alerting_threshold)
sql_query = """
SELECT
api_base,
COUNT(*) AS total_count,
SUM(CASE
WHEN ("endTime" - "startTime") >= (INTERVAL '1 SECOND' * CAST($1 AS INTEGER)) THEN 1
ELSE 0
END) AS slow_count
FROM
"LiteLLM_SpendLogs"
WHERE
"model" = $2
AND "cache_hit" != 'True'
GROUP BY
api_base
ORDER BY
slow_count DESC;
"""
db_response = await prisma_client.db.query_raw(
sql_query, startTime, endTime, _relevant_api_bases
sql_query, alerting_threshold, _selected_model_group
)
else:
sql_query = """
SELECT
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END AS combined_model_api_base,
COUNT(*) AS num_requests,
AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) AS avg_latency_seconds
FROM
"LiteLLM_SpendLogs"
WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
GROUP BY
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END
ORDER BY
num_requests DESC
LIMIT 50;
if db_response is not None:
for row in db_response:
_api_base = row.get("api_base") or ""
if "/openai/" in _api_base:
_api_base = _api_base.split("/openai/")[0]
row["api_base"] = _api_base
return db_response
@router.get(
"/model/metrics/exceptions",
description="View number of failed requests per model on config.yaml",
tags=["model management"],
include_in_schema=False,
dependencies=[Depends(user_api_key_auth)],
)
async def model_metrics_exceptions(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
_selected_model_group: Optional[str] = None,
startTime: Optional[datetime] = None,
endTime: Optional[datetime] = None,
):
global prisma_client, llm_router
if prisma_client is None:
raise ProxyException(
message="Prisma Client is not initialized",
type="internal_error",
param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
startTime = startTime or datetime.now() - timedelta(days=30)
endTime = endTime or datetime.now()
"""
"""
sql_query = """
WITH cte AS (
SELECT
CASE WHEN api_base = '' THEN litellm_model_name ELSE CONCAT(litellm_model_name, '-', api_base) END AS combined_model_api_base,
exception_type,
COUNT(*) AS num_exceptions
FROM "LiteLLM_ErrorLogs"
WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
GROUP BY combined_model_api_base, exception_type
)
SELECT
combined_model_api_base,
COUNT(*) AS total_exceptions,
json_object_agg(exception_type, num_exceptions) AS exception_counts
FROM cte
GROUP BY combined_model_api_base
ORDER BY total_exceptions DESC
LIMIT 200;
"""
db_response = await prisma_client.db.query_raw(sql_query, startTime, endTime)
response: List[dict] = []
if response is not None:
exception_types = set()
"""
Return Data
{
"combined_model_api_base": "gpt-3.5-turbo-https://api.openai.com/v1/,
"total_exceptions": 5,
"BadRequestException": 5,
"TimeoutException": 2
}
"""
if db_response is not None:
# loop through all models
for model_data in db_response:
model = model_data.get("combined_model_api_base", "")
num_requests = model_data.get("num_requests", 0)
avg_latency_seconds = model_data.get("avg_latency_seconds", 0)
response.append(
{
total_exceptions = model_data.get("total_exceptions", 0)
exception_counts = model_data.get("exception_counts", {})
curr_row = {
"model": model,
"num_requests": num_requests,
"avg_latency_seconds": avg_latency_seconds,
"total_exceptions": total_exceptions,
}
)
return response
curr_row.update(exception_counts)
response.append(curr_row)
for k, v in exception_counts.items():
exception_types.add(k)
return {"data": response, "exception_types": list(exception_types)}
@router.get(
@ -8453,6 +8706,13 @@ async def update_config(config_info: ConfigYAML):
_existing_settings = config["general_settings"]
for k, v in updated_general_settings.items():
# overwrite existing settings with updated values
if k == "alert_to_webhook_url":
# check if slack is already enabled. if not, enable it
if "slack" not in _existing_settings:
if "alerting" not in _existing_settings:
_existing_settings["alerting"] = ["slack"]
elif isinstance(_existing_settings["alerting"], list):
_existing_settings["alerting"].append("slack")
_existing_settings[k] = v
config["general_settings"] = _existing_settings
@ -8567,7 +8827,25 @@ async def get_config():
"""
for _callback in _success_callbacks:
if _callback == "langfuse":
if _callback == "openmeter":
env_vars = [
"OPENMETER_API_KEY",
]
env_vars_dict = {}
for _var in env_vars:
env_variable = environment_variables.get(_var, None)
if env_variable is None:
env_vars_dict[_var] = None
else:
# decode + decrypt the value
decoded_b64 = base64.b64decode(env_variable)
_decrypted_value = decrypt_value(
value=decoded_b64, master_key=master_key
)
env_vars_dict[_var] = _decrypted_value
_data_to_return.append({"name": _callback, "variables": env_vars_dict})
elif _callback == "langfuse":
_langfuse_vars = [
"LANGFUSE_PUBLIC_KEY",
"LANGFUSE_SECRET_KEY",
@ -8592,6 +8870,7 @@ async def get_config():
# Check if slack alerting is on
_alerting = _general_settings.get("alerting", [])
alerting_data = []
if "slack" in _alerting:
_slack_vars = [
"SLACK_WEBHOOK_URL",
@ -8600,7 +8879,8 @@ async def get_config():
for _var in _slack_vars:
env_variable = environment_variables.get(_var, None)
if env_variable is None:
_slack_env_vars[_var] = None
_value = os.getenv("SLACK_WEBHOOK_URL", None)
_slack_env_vars[_var] = _value
else:
# decode + decrypt the value
decoded_b64 = base64.b64decode(env_variable)
@ -8613,19 +8893,23 @@ async def get_config():
_all_alert_types = (
proxy_logging_obj.slack_alerting_instance._all_possible_alert_types()
)
_data_to_return.append(
_alerts_to_webhook = (
proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url
)
alerting_data.append(
{
"name": "slack",
"variables": _slack_env_vars,
"alerting_types": _alerting_types,
"all_alert_types": _all_alert_types,
"active_alerts": _alerting_types,
"alerts_to_webhook": _alerts_to_webhook,
}
)
_router_settings = llm_router.get_settings()
return {
"status": "success",
"data": _data_to_return,
"callbacks": _data_to_return,
"alerts": alerting_data,
"router_settings": _router_settings,
}
except Exception as e:
@ -8701,9 +8985,9 @@ async def test_endpoint(request: Request):
)
async def health_services_endpoint(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
service: Literal["slack_budget_alerts", "langfuse", "slack"] = fastapi.Query(
description="Specify the service being hit."
),
service: Literal[
"slack_budget_alerts", "langfuse", "slack", "openmeter"
] = fastapi.Query(description="Specify the service being hit."),
):
"""
Hidden endpoint.
@ -8717,7 +9001,7 @@ async def health_services_endpoint(
raise HTTPException(
status_code=400, detail={"error": "Service must be specified."}
)
if service not in ["slack_budget_alerts", "langfuse", "slack"]:
if service not in ["slack_budget_alerts", "langfuse", "slack", "openmeter"]:
raise HTTPException(
status_code=400,
detail={
@ -8725,6 +9009,18 @@ async def health_services_endpoint(
},
)
if service == "openmeter":
_ = await litellm.acompletion(
model="openai/litellm-mock-response-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
user="litellm:/health/services",
mock_response="This is a mock response",
)
return {
"status": "success",
"message": "Mock LLM request made - check openmeter.",
}
if service == "langfuse":
from litellm.integrations.langfuse import LangFuseLogger
@ -8741,9 +9037,53 @@ async def health_services_endpoint(
"message": "Mock LLM request made - check langfuse.",
}
if service == "slack" or service == "slack_budget_alerts":
if "slack" in general_settings.get("alerting", []):
test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` litellm-ui-test-alert \n`Expected Day of Error`: 28th March \n`Current Spend`: $100.00 \n`Projected Spend at end of month`: $1000.00 \n`Soft Limit`: $700"""
await proxy_logging_obj.alerting_handler(message=test_message, level="Low")
# test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` litellm-ui-test-alert \n`Expected Day of Error`: 28th March \n`Current Spend`: $100.00 \n`Projected Spend at end of month`: $1000.00 \n`Soft Limit`: $700"""
# check if user has opted into unique_alert_webhooks
if (
proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url
is not None
):
for (
alert_type
) in proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url:
"""
"llm_exceptions",
"llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
"""
# only test alert if it's in active alert types
if (
proxy_logging_obj.slack_alerting_instance.alert_types
is not None
and alert_type
not in proxy_logging_obj.slack_alerting_instance.alert_types
):
continue
test_message = "default test message"
if alert_type == "llm_exceptions":
test_message = f"LLM Exception test alert"
elif alert_type == "llm_too_slow":
test_message = f"LLM Too Slow test alert"
elif alert_type == "llm_requests_hanging":
test_message = f"LLM Requests Hanging test alert"
elif alert_type == "budget_alerts":
test_message = f"Budget Alert test alert"
elif alert_type == "db_exceptions":
test_message = f"DB Exception test alert"
await proxy_logging_obj.alerting_handler(
message=test_message, level="Low", alert_type=alert_type
)
else:
await proxy_logging_obj.alerting_handler(
message="This is a test slack alert message",
level="Low",
alert_type="budget_alerts",
)
return {
"status": "success",
"message": "Mock Slack Alert sent, verify Slack Alert Received on your channel",
@ -8752,7 +9092,9 @@ async def health_services_endpoint(
raise HTTPException(
status_code=422,
detail={
"error": '"slack" not in proxy config: general_settings. Unable to test this.'
"error": '"{}" not in proxy config: general_settings. Unable to test this.'.format(
service
)
},
)
except Exception as e:
@ -8761,7 +9103,7 @@ async def health_services_endpoint(
message=getattr(e, "detail", f"Authentication Error({str(e)})"),
type="auth_error",
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_401_UNAUTHORIZED),
code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
)
elif isinstance(e, ProxyException):
raise e
@ -8769,7 +9111,7 @@ async def health_services_endpoint(
message="Authentication Error, " + str(e),
type="auth_error",
param=getattr(e, "param", "None"),
code=status.HTTP_401_UNAUTHORIZED,
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)

View file

@ -183,6 +183,21 @@ model LiteLLM_SpendLogs {
end_user String?
}
// One row per failed LLM request: exception type/string, status code, and the model / api_base it was sent to
model LiteLLM_ErrorLogs {
request_id String @id @default(uuid())
startTime DateTime // start time of the failed request
endTime DateTime // end time of the failed request
api_base String @default("")
model_group String @default("") // public model_name / model_group
litellm_model_name String @default("") // model passed to litellm
model_id String @default("") // ID of model in ProxyModelTable
request_kwargs Json @default("{}")
exception_type String @default("")
exception_string String @default("")
status_code String @default("")
}
// Beta - allow team members to request access to a model
model LiteLLM_UserNotifications {
request_id String @id

View file

@ -1,6 +1,6 @@
from typing import Optional, List, Any, Literal, Union
import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx, time
import litellm, backoff
import litellm, backoff, traceback
from litellm.proxy._types import (
UserAPIKeyAuth,
DynamoDBArgs,
@ -199,6 +199,33 @@ class ProxyLogging:
print_verbose(f"final data being sent to {call_type} call: {data}")
return data
except Exception as e:
if "litellm_logging_obj" in data:
logging_obj: litellm.utils.Logging = data["litellm_logging_obj"]
## ASYNC FAILURE HANDLER ##
error_message = ""
if isinstance(e, HTTPException):
if isinstance(e.detail, str):
error_message = e.detail
elif isinstance(e.detail, dict):
error_message = json.dumps(e.detail)
else:
error_message = str(e)
else:
error_message = str(e)
error_raised = Exception(f"{error_message}")
await logging_obj.async_failure_handler(
exception=error_raised,
traceback_exception=traceback.format_exc(),
)
## SYNC FAILURE HANDLER ##
try:
logging_obj.failure_handler(
error_raised, traceback.format_exc()
) # DO NOT MAKE THREADED - router retry fallback relies on this!
except Exception as error_val:
pass
raise e
async def during_call_hook(
@ -256,7 +283,16 @@ class ProxyLogging:
)
async def alerting_handler(
self, message: str, level: Literal["Low", "Medium", "High"]
self,
message: str,
level: Literal["Low", "Medium", "High"],
alert_type: Literal[
"llm_exceptions",
"llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
],
):
"""
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
@ -289,7 +325,7 @@ class ProxyLogging:
for client in self.alerting:
if client == "slack":
await self.slack_alerting_instance.send_alert(
message=message, level=level
message=message, level=level, alert_type=alert_type
)
elif client == "sentry":
if litellm.utils.sentry_sdk_instance is not None:
@ -323,6 +359,7 @@ class ProxyLogging:
self.alerting_handler(
message=f"DB read/write call failed: {error_message}",
level="High",
alert_type="db_exceptions",
)
)
@ -354,7 +391,9 @@ class ProxyLogging:
return
asyncio.create_task(
self.alerting_handler(
message=f"LLM API call failed: {str(original_exception)}", level="High"
message=f"LLM API call failed: {str(original_exception)}",
level="High",
alert_type="llm_exceptions",
)
)
@ -1738,7 +1777,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
usage = response_obj["usage"]
if type(usage) == litellm.Usage:
usage = dict(usage)
id = response_obj.get("id", str(uuid.uuid4()))
id = response_obj.get("id", kwargs.get("litellm_call_id"))
api_key = metadata.get("user_api_key", "")
if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
# hash the api_key
@ -2010,6 +2049,11 @@ async def update_spend(
raise e
### UPDATE KEY TABLE ###
verbose_proxy_logger.debug(
"KEY Spend transactions: {}".format(
len(prisma_client.key_list_transactons.keys())
)
)
if len(prisma_client.key_list_transactons.keys()) > 0:
for i in range(n_retry_times + 1):
start_time = time.time()

View file

@ -50,7 +50,6 @@ class Router:
model_names: List = []
cache_responses: Optional[bool] = False
default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour
num_retries: int = 0
tenacity = None
leastbusy_logger: Optional[LeastBusyLoggingHandler] = None
lowesttpm_logger: Optional[LowestTPMLoggingHandler] = None
@ -70,9 +69,11 @@ class Router:
] = None, # if you want to cache across model groups
client_ttl: int = 3600, # ttl for cached clients - will re-initialize after this time in seconds
## RELIABILITY ##
num_retries: int = 0,
num_retries: Optional[int] = None,
timeout: Optional[float] = None,
default_litellm_params={}, # default params for Router.chat.completion.create
default_litellm_params: Optional[
dict
] = None, # default params for Router.chat.completion.create
default_max_parallel_requests: Optional[int] = None,
set_verbose: bool = False,
debug_level: Literal["DEBUG", "INFO"] = "INFO",
@ -158,6 +159,7 @@ class Router:
router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}])
```
"""
if semaphore:
self.semaphore = semaphore
self.set_verbose = set_verbose
@ -229,7 +231,14 @@ class Router:
self.failed_calls = (
InMemoryCache()
) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
self.num_retries = num_retries or litellm.num_retries or 0
if num_retries is not None:
self.num_retries = num_retries
elif litellm.num_retries is not None:
self.num_retries = litellm.num_retries
else:
self.num_retries = openai.DEFAULT_MAX_RETRIES
self.timeout = timeout or litellm.request_timeout
self.retry_after = retry_after
@ -255,6 +264,7 @@ class Router:
) # dict to store aliases for router, ex. {"gpt-4": "gpt-3.5-turbo"}, all requests with gpt-4 -> get routed to gpt-3.5-turbo group
# make Router.chat.completions.create compatible for openai.chat.completions.create
default_litellm_params = default_litellm_params or {}
self.chat = litellm.Chat(params=default_litellm_params, router_obj=self)
# default litellm args
@ -280,6 +290,21 @@ class Router:
}
"""
### ROUTING SETUP ###
self.routing_strategy_init(
routing_strategy=routing_strategy,
routing_strategy_args=routing_strategy_args,
)
## COOLDOWNS ##
if isinstance(litellm.failure_callback, list):
litellm.failure_callback.append(self.deployment_callback_on_failure)
else:
litellm.failure_callback = [self.deployment_callback_on_failure]
print( # noqa
f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
) # noqa
self.routing_strategy_args = routing_strategy_args
def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict):
if routing_strategy == "least-busy":
self.leastbusy_logger = LeastBusyLoggingHandler(
router_cache=self.cache, model_list=self.model_list
@ -311,15 +336,6 @@ class Router:
)
if isinstance(litellm.callbacks, list):
litellm.callbacks.append(self.lowestlatency_logger) # type: ignore
## COOLDOWNS ##
if isinstance(litellm.failure_callback, list):
litellm.failure_callback.append(self.deployment_callback_on_failure)
else:
litellm.failure_callback = [self.deployment_callback_on_failure]
verbose_router_logger.info(
f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
)
self.routing_strategy_args = routing_strategy_args
def print_deployment(self, deployment: dict):
"""
@ -428,6 +444,7 @@ class Router:
kwargs["messages"] = messages
kwargs["original_function"] = self._acompletion
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
timeout = kwargs.get("request_timeout", self.timeout)
kwargs.setdefault("metadata", {}).update({"model_group": model})
@ -469,6 +486,7 @@ class Router:
)
kwargs["model_info"] = deployment.get("model_info", {})
data = deployment["litellm_params"].copy()
model_name = data["model"]
for k, v in self.default_litellm_params.items():
if (
@ -1415,10 +1433,12 @@ class Router:
context_window_fallbacks = kwargs.pop(
"context_window_fallbacks", self.context_window_fallbacks
)
verbose_router_logger.debug(
f"async function w/ retries: original_function - {original_function}"
)
num_retries = kwargs.pop("num_retries")
verbose_router_logger.debug(
f"async function w/ retries: original_function - {original_function}, num_retries - {num_retries}"
)
try:
# if the function call is successful, no exception will be raised and we'll break out of the loop
response = await original_function(*args, **kwargs)
@ -1436,37 +1456,47 @@ class Router:
raise original_exception
### RETRY
#### check if it should retry + back-off if required
if "No models available" in str(e):
timeout = litellm._calculate_retry_after(
remaining_retries=num_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
await asyncio.sleep(timeout)
elif RouterErrors.user_defined_ratelimit_error.value in str(e):
raise e # don't wait to retry if deployment hits user-defined rate-limit
elif hasattr(original_exception, "status_code") and litellm._should_retry(
status_code=original_exception.status_code
):
if hasattr(original_exception, "response") and hasattr(
original_exception.response, "headers"
):
timeout = litellm._calculate_retry_after(
remaining_retries=num_retries,
max_retries=num_retries,
response_headers=original_exception.response.headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=num_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
await asyncio.sleep(timeout)
else:
raise original_exception
# if "No models available" in str(
# e
# ) or RouterErrors.no_deployments_available.value in str(e):
# timeout = litellm._calculate_retry_after(
# remaining_retries=num_retries,
# max_retries=num_retries,
# min_timeout=self.retry_after,
# )
# await asyncio.sleep(timeout)
# elif RouterErrors.user_defined_ratelimit_error.value in str(e):
# raise e # don't wait to retry if deployment hits user-defined rate-limit
# elif hasattr(original_exception, "status_code") and litellm._should_retry(
# status_code=original_exception.status_code
# ):
# if hasattr(original_exception, "response") and hasattr(
# original_exception.response, "headers"
# ):
# timeout = litellm._calculate_retry_after(
# remaining_retries=num_retries,
# max_retries=num_retries,
# response_headers=original_exception.response.headers,
# min_timeout=self.retry_after,
# )
# else:
# timeout = litellm._calculate_retry_after(
# remaining_retries=num_retries,
# max_retries=num_retries,
# min_timeout=self.retry_after,
# )
# await asyncio.sleep(timeout)
# else:
# raise original_exception
### RETRY
_timeout = self._router_should_retry(
e=original_exception,
remaining_retries=num_retries,
num_retries=num_retries,
)
await asyncio.sleep(_timeout)
## LOGGING
if num_retries > 0:
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
@ -1488,34 +1518,12 @@ class Router:
## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=e)
remaining_retries = num_retries - current_attempt
if "No models available" in str(e):
timeout = litellm._calculate_retry_after(
_timeout = self._router_should_retry(
e=original_exception,
remaining_retries=remaining_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
num_retries=num_retries,
)
await asyncio.sleep(timeout)
elif (
hasattr(e, "status_code")
and hasattr(e, "response")
and litellm._should_retry(status_code=e.status_code)
):
if hasattr(e.response, "headers"):
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
response_headers=e.response.headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
await asyncio.sleep(timeout)
else:
raise e
await asyncio.sleep(_timeout)
raise original_exception
def function_with_fallbacks(self, *args, **kwargs):
@ -1606,6 +1614,27 @@ class Router:
raise e
raise original_exception
def _router_should_retry(
    self, e: Exception, remaining_retries: int, num_retries: int
) -> Union[int, float]:
    """
    Compute how long to back off before the next retry attempt.

    Despite the name, this does not decide whether to retry; it only
    returns the timeout produced by `litellm._calculate_retry_after`.

    Args:
        e: the exception raised by the failed call. If it carries a
            `response` with `headers`, those headers are forwarded to
            the back-off calculation.
        remaining_retries: retries still available for this request.
        num_retries: total retries configured (passed as `max_retries`).

    Returns:
        The timeout to sleep for, using `self.retry_after` as the minimum.
    """
    backoff_kwargs = {
        "remaining_retries": remaining_retries,
        "max_retries": num_retries,
        "min_timeout": self.retry_after,
    }
    # only forward headers when the exception actually exposes them
    carries_headers = hasattr(e, "response") and hasattr(e.response, "headers")
    if carries_headers:
        backoff_kwargs["response_headers"] = e.response.headers
    return litellm._calculate_retry_after(**backoff_kwargs)
def function_with_retries(self, *args, **kwargs):
"""
Try calling the model 3 times. Shuffle between available deployments.
@ -1619,15 +1648,13 @@ class Router:
context_window_fallbacks = kwargs.pop(
"context_window_fallbacks", self.context_window_fallbacks
)
try:
# if the function call is successful, no exception will be raised and we'll break out of the loop
response = original_function(*args, **kwargs)
return response
except Exception as e:
original_exception = e
verbose_router_logger.debug(
f"num retries in function with retries: {num_retries}"
)
### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR
if (
isinstance(original_exception, litellm.ContextWindowExceededError)
@ -1641,6 +1668,12 @@ class Router:
if num_retries > 0:
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
### RETRY
_timeout = self._router_should_retry(
e=original_exception,
remaining_retries=num_retries,
num_retries=num_retries,
)
time.sleep(_timeout)
for current_attempt in range(num_retries):
verbose_router_logger.debug(
f"retrying request. Current attempt - {current_attempt}; retries left: {num_retries}"
@ -1654,34 +1687,12 @@ class Router:
## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=e)
remaining_retries = num_retries - current_attempt
if "No models available" in str(e):
timeout = litellm._calculate_retry_after(
_timeout = self._router_should_retry(
e=e,
remaining_retries=remaining_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
num_retries=num_retries,
)
time.sleep(timeout)
elif (
hasattr(e, "status_code")
and hasattr(e, "response")
and litellm._should_retry(status_code=e.status_code)
):
if hasattr(e.response, "headers"):
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
response_headers=e.response.headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
time.sleep(timeout)
else:
raise e
time.sleep(_timeout)
raise original_exception
### HELPER FUNCTIONS
@ -1715,10 +1726,11 @@ class Router:
) # i.e. azure
metadata = kwargs.get("litellm_params", {}).get("metadata", None)
_model_info = kwargs.get("litellm_params", {}).get("model_info", {})
if isinstance(_model_info, dict):
deployment_id = _model_info.get("id", None)
self._set_cooldown_deployments(
deployment_id
exception_status=exception_status, deployment=deployment_id
) # setting deployment_id in cooldown deployments
if custom_llm_provider:
model_name = f"{custom_llm_provider}/{model_name}"
@ -1778,9 +1790,15 @@ class Router:
key=rpm_key, value=request_count, local_only=True
) # don't change existing ttl
def _set_cooldown_deployments(self, deployment: Optional[str] = None):
def _set_cooldown_deployments(
self, exception_status: Union[str, int], deployment: Optional[str] = None
):
"""
Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
or
the exception is not one that should be immediately retried (e.g. 401)
"""
if deployment is None:
return
@ -1797,7 +1815,20 @@ class Router:
f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
)
cooldown_time = self.cooldown_time or 1
if updated_fails > self.allowed_fails:
if isinstance(exception_status, str):
try:
exception_status = int(exception_status)
except Exception as e:
verbose_router_logger.debug(
"Unable to cast exception status to int {}. Defaulting to status=500.".format(
exception_status
)
)
exception_status = 500
_should_retry = litellm._should_retry(status_code=exception_status)
if updated_fails > self.allowed_fails or _should_retry == False:
# get the current cooldown list for that minute
cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
cached_value = self.cache.get_cache(key=cooldown_key)
@ -1929,6 +1960,7 @@ class Router:
)
default_api_base = api_base
default_api_key = api_key
if (
model_name in litellm.open_ai_chat_completion_models
or custom_llm_provider in litellm.openai_compatible_providers
@ -1940,8 +1972,10 @@ class Router:
or "ft:gpt-3.5-turbo" in model_name
or model_name in litellm.open_ai_embedding_models
):
is_azure_ai_studio_model: bool = False
if custom_llm_provider == "azure":
if litellm.utils._is_non_openai_azure_model(model_name):
is_azure_ai_studio_model = True
custom_llm_provider = "openai"
# remove azure prefix from model_name
model_name = model_name.replace("azure/", "")
@ -1964,6 +1998,25 @@ class Router:
api_base = litellm.get_secret(api_base_env_name)
litellm_params["api_base"] = api_base
## AZURE AI STUDIO MISTRAL CHECK ##
"""
Make sure api base ends in /v1/
if not, add it - https://github.com/BerriAI/litellm/issues/2279
"""
if (
is_azure_ai_studio_model == True
and api_base is not None
and not api_base.endswith("/v1/")
):
# check if it ends with a trailing slash
if api_base.endswith("/"):
api_base += "v1/"
elif api_base.endswith("/v1"):
api_base += "/"
else:
api_base += "/v1/"
api_version = litellm_params.get("api_version")
if api_version and api_version.startswith("os.environ/"):
api_version_env_name = api_version.replace("os.environ/", "")
@ -1986,7 +2039,9 @@ class Router:
stream_timeout = litellm.get_secret(stream_timeout_env_name)
litellm_params["stream_timeout"] = stream_timeout
max_retries = litellm_params.pop("max_retries", 2)
max_retries = litellm_params.pop(
"max_retries", 0
) # router handles retry logic
if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
max_retries_env_name = max_retries.replace("os.environ/", "")
max_retries = litellm.get_secret(max_retries_env_name)
@ -2052,10 +2107,12 @@ class Router:
timeout=timeout,
max_retries=max_retries,
http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(),
transport=AsyncCustomHTTPTransport(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
mounts=async_proxy_mounts,
), # type: ignore
)
@ -2074,10 +2131,12 @@ class Router:
timeout=timeout,
max_retries=max_retries,
http_client=httpx.Client(
transport=CustomHTTPTransport(),
transport=CustomHTTPTransport(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
mounts=sync_proxy_mounts,
), # type: ignore
)
@ -2096,10 +2155,12 @@ class Router:
timeout=stream_timeout,
max_retries=max_retries,
http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(),
transport=AsyncCustomHTTPTransport(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
mounts=async_proxy_mounts,
), # type: ignore
)
@ -2118,10 +2179,12 @@ class Router:
timeout=stream_timeout,
max_retries=max_retries,
http_client=httpx.Client(
transport=CustomHTTPTransport(),
transport=CustomHTTPTransport(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
mounts=sync_proxy_mounts,
), # type: ignore
)
@ -2158,10 +2221,12 @@ class Router:
timeout=timeout,
max_retries=max_retries,
http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(),
transport=AsyncCustomHTTPTransport(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
mounts=async_proxy_mounts,
), # type: ignore
)
@ -2178,10 +2243,12 @@ class Router:
timeout=timeout,
max_retries=max_retries,
http_client=httpx.Client(
transport=CustomHTTPTransport(),
transport=CustomHTTPTransport(
verify=litellm.ssl_verify,
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
),
mounts=sync_proxy_mounts,
), # type: ignore
)
@ -2199,10 +2266,12 @@ class Router:
timeout=stream_timeout,
max_retries=max_retries,
http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(),
transport=AsyncCustomHTTPTransport(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
mounts=async_proxy_mounts,
),
)
@ -2219,10 +2288,12 @@ class Router:
timeout=stream_timeout,
max_retries=max_retries,
http_client=httpx.Client(
transport=CustomHTTPTransport(),
transport=CustomHTTPTransport(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
mounts=sync_proxy_mounts,
),
)
@ -2249,10 +2320,12 @@ class Router:
max_retries=max_retries,
organization=organization,
http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(),
transport=AsyncCustomHTTPTransport(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
mounts=async_proxy_mounts,
), # type: ignore
)
@ -2271,10 +2344,12 @@ class Router:
max_retries=max_retries,
organization=organization,
http_client=httpx.Client(
transport=CustomHTTPTransport(),
transport=CustomHTTPTransport(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
mounts=sync_proxy_mounts,
), # type: ignore
)
@ -2294,10 +2369,12 @@ class Router:
max_retries=max_retries,
organization=organization,
http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(),
transport=AsyncCustomHTTPTransport(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
mounts=async_proxy_mounts,
), # type: ignore
)
@ -2317,10 +2394,12 @@ class Router:
max_retries=max_retries,
organization=organization,
http_client=httpx.Client(
transport=CustomHTTPTransport(),
transport=CustomHTTPTransport(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
mounts=sync_proxy_mounts,
), # type: ignore
)
@ -2550,6 +2629,11 @@ class Router:
for var in vars_to_include:
if var in _all_vars:
_settings_to_return[var] = _all_vars[var]
if (
var == "routing_strategy_args"
and self.routing_strategy == "latency-based-routing"
):
_settings_to_return[var] = self.lowestlatency_logger.routing_args.json()
return _settings_to_return
def update_settings(self, **kwargs):
@ -2581,6 +2665,13 @@ class Router:
_casted_value = int(kwargs[var])
setattr(self, var, _casted_value)
else:
if var == "routing_strategy":
self.routing_strategy_init(
routing_strategy=kwargs[var],
routing_strategy_args=kwargs.get(
"routing_strategy_args", {}
),
)
setattr(self, var, kwargs[var])
else:
verbose_router_logger.debug("Setting {} is not allowed".format(var))
@ -2717,7 +2808,10 @@ class Router:
self.cache.get_cache(key=model_id, local_only=True) or 0
)
### get usage based cache ###
if isinstance(model_group_cache, dict):
if (
isinstance(model_group_cache, dict)
and self.routing_strategy != "usage-based-routing-v2"
):
model_group_cache[model_id] = model_group_cache.get(model_id, 0)
current_request = max(
@ -2745,7 +2839,7 @@ class Router:
if _rate_limit_error == True: # allow generic fallback logic to take place
raise ValueError(
f"No deployments available for selected model, passed model={model}"
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
)
elif _context_window_error == True:
raise litellm.ContextWindowExceededError(
@ -2883,6 +2977,11 @@ class Router:
model=model, healthy_deployments=healthy_deployments, messages=messages
)
if len(healthy_deployments) == 0:
raise ValueError(
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
)
if (
self.routing_strategy == "usage-based-routing-v2"
and self.lowesttpm_logger_v2 is not None
@ -2938,7 +3037,7 @@ class Router:
f"get_available_deployment for model: {model}, No deployment available"
)
raise ValueError(
f"No deployments available for selected model, passed model={model}"
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
)
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
@ -3068,7 +3167,7 @@ class Router:
f"get_available_deployment for model: {model}, No deployment available"
)
raise ValueError(
f"No deployments available for selected model, passed model={model}"
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
)
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"

View file

@ -4,6 +4,7 @@ from pydantic import BaseModel, Extra, Field, root_validator
import dotenv, os, requests, random
from typing import Optional, Union, List, Dict
from datetime import datetime, timedelta
import random
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
@ -29,6 +30,7 @@ class LiteLLMBase(BaseModel):
class RoutingArgs(LiteLLMBase):
ttl: int = 1 * 60 * 60 # 1 hour
lowest_latency_buffer: float = 0
class LowestLatencyLoggingHandler(CustomLogger):
@ -312,6 +314,14 @@ class LowestLatencyLoggingHandler(CustomLogger):
except:
input_tokens = 0
# randomly sample from all_deployments, incase all deployments have latency=0.0
_items = all_deployments.items()
all_deployments = random.sample(list(_items), len(_items))
all_deployments = dict(all_deployments)
### GET AVAILABLE DEPLOYMENTS ### filter out any deployments > tpm/rpm limits
potential_deployments = []
for item, item_map in all_deployments.items():
## get the item from model list
_deployment = None
@ -345,23 +355,48 @@ class LowestLatencyLoggingHandler(CustomLogger):
if isinstance(_call_latency, float):
total += _call_latency
item_latency = total / len(item_latency)
if item_latency == 0:
deployment = _deployment
break
elif (
# -------------- #
# Debugging Logic
# -------------- #
# We use _latency_per_deployment to log to langfuse, slack - this is not used to make a decision on routing
# this helps a user to debug why the router picked a specific deployment #
_deployment_api_base = _deployment.get("litellm_params", {}).get(
"api_base", ""
)
if _deployment_api_base is not None:
_latency_per_deployment[_deployment_api_base] = item_latency
# -------------- #
# End of Debugging Logic
# -------------- #
if (
item_tpm + input_tokens > _deployment_tpm
or item_rpm + 1 > _deployment_rpm
): # if user passed in tpm / rpm in the model_list
continue
elif item_latency < lowest_latency:
lowest_latency = item_latency
deployment = _deployment
else:
potential_deployments.append((_deployment, item_latency))
if len(potential_deployments) == 0:
return None
# Sort potential deployments by latency
sorted_deployments = sorted(potential_deployments, key=lambda x: x[1])
# Find lowest latency deployment
lowest_latency = sorted_deployments[0][1]
# Find deployments within buffer of lowest latency
buffer = self.routing_args.lowest_latency_buffer * lowest_latency
valid_deployments = [
x for x in sorted_deployments if x[1] <= lowest_latency + buffer
]
# Pick a random deployment from valid deployments
random_valid_deployment = random.choice(valid_deployments)
deployment = random_valid_deployment[0]
# _latency_per_deployment is used for debugging
_deployment_api_base = _deployment.get("litellm_params", {}).get(
"api_base", ""
)
_latency_per_deployment[_deployment_api_base] = item_latency
if request_kwargs is not None and "metadata" in request_kwargs:
request_kwargs["metadata"][
"_latency_per_deployment"

View file

@ -206,7 +206,7 @@ class LowestTPMLoggingHandler(CustomLogger):
if item_tpm + input_tokens > _deployment_tpm:
continue
elif (rpm_dict is not None and item in rpm_dict) and (
rpm_dict[item] + 1 > _deployment_rpm
rpm_dict[item] + 1 >= _deployment_rpm
):
continue
elif item_tpm < lowest_tpm:

View file

@ -333,7 +333,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
tpm_dict[tpm_key] = 0
all_deployments = tpm_dict
deployment = None
potential_deployments = [] # if multiple deployments have the same low value
for item, item_tpm in all_deployments.items():
## get the item from model list
_deployment = None
@ -343,6 +343,8 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
_deployment = m
if _deployment is None:
continue # skip to next one
elif item_tpm is None:
continue # skip if unhealthy deployment
_deployment_tpm = None
if _deployment_tpm is None:
@ -366,14 +368,20 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
if item_tpm + input_tokens > _deployment_tpm:
continue
elif (rpm_dict is not None and item in rpm_dict) and (
rpm_dict[item] + 1 > _deployment_rpm
rpm_dict[item] + 1 >= _deployment_rpm
):
continue
elif item_tpm == lowest_tpm:
potential_deployments.append(_deployment)
elif item_tpm < lowest_tpm:
lowest_tpm = item_tpm
deployment = _deployment
potential_deployments = [_deployment]
print_verbose("returning picked lowest tpm/rpm deployment.")
return deployment
if len(potential_deployments) > 0:
return random.choice(potential_deployments)
else:
return None
async def async_get_available_deployments(
self,
@ -394,6 +402,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
tpm_keys = []
rpm_keys = []
for m in healthy_deployments:
@ -416,7 +425,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
tpm_values = combined_tpm_rpm_values[: len(tpm_keys)]
rpm_values = combined_tpm_rpm_values[len(tpm_keys) :]
return self._common_checks_available_deployment(
deployment = self._common_checks_available_deployment(
model_group=model_group,
healthy_deployments=healthy_deployments,
tpm_keys=tpm_keys,
@ -427,6 +436,61 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
input=input,
)
try:
assert deployment is not None
return deployment
except Exception as e:
### GET THE DICT OF TPM / RPM + LIMITS PER DEPLOYMENT ###
deployment_dict = {}
for index, _deployment in enumerate(healthy_deployments):
if isinstance(_deployment, dict):
id = _deployment.get("model_info", {}).get("id")
### GET DEPLOYMENT TPM LIMIT ###
_deployment_tpm = None
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("tpm", None)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("litellm_params", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("model_info", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = float("inf")
### GET CURRENT TPM ###
current_tpm = tpm_values[index]
### GET DEPLOYMENT RPM LIMIT ###
_deployment_rpm = None
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("rpm", None)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("litellm_params", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("model_info", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = float("inf")
### GET CURRENT RPM ###
current_rpm = rpm_values[index]
deployment_dict[id] = {
"current_tpm": current_tpm,
"tpm_limit": _deployment_tpm,
"current_rpm": current_rpm,
"rpm_limit": _deployment_rpm,
}
raise ValueError(
f"{RouterErrors.no_deployments_available.value}. Passed model={model_group}. Deployments={deployment_dict}"
)
def get_available_deployments(
self,
model_group: str,
@ -464,7 +528,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
keys=rpm_keys
) # [1, 2, None, ..]
return self._common_checks_available_deployment(
deployment = self._common_checks_available_deployment(
model_group=model_group,
healthy_deployments=healthy_deployments,
tpm_keys=tpm_keys,
@ -474,3 +538,58 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
messages=messages,
input=input,
)
try:
assert deployment is not None
return deployment
except Exception as e:
### GET THE DICT OF TPM / RPM + LIMITS PER DEPLOYMENT ###
deployment_dict = {}
for index, _deployment in enumerate(healthy_deployments):
if isinstance(_deployment, dict):
id = _deployment.get("model_info", {}).get("id")
### GET DEPLOYMENT TPM LIMIT ###
_deployment_tpm = None
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("tpm", None)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("litellm_params", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("model_info", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = float("inf")
### GET CURRENT TPM ###
current_tpm = tpm_values[index]
### GET DEPLOYMENT RPM LIMIT ###
_deployment_rpm = None
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("rpm", None)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("litellm_params", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("model_info", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = float("inf")
### GET CURRENT RPM ###
current_rpm = rpm_values[index]
deployment_dict[id] = {
"current_tpm": current_tpm,
"tpm_limit": _deployment_tpm,
"current_rpm": current_rpm,
"rpm_limit": _deployment_rpm,
}
raise ValueError(
f"{RouterErrors.no_deployments_available.value}. Passed model={model_group}. Deployments={deployment_dict}"
)

View file

@ -19,6 +19,7 @@ def setup_and_teardown():
0, os.path.abspath("../..")
) # Adds the project directory to the system path
import litellm
from litellm import Router
importlib.reload(litellm)
import asyncio

View file

@ -119,7 +119,9 @@ def test_multiple_deployments_parallel():
# test_multiple_deployments_parallel()
def test_cooldown_same_model_name():
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_cooldown_same_model_name(sync_mode):
# users could have the same model with different api_base
# example
# azure/chatgpt, api_base: 1234
@ -161,6 +163,7 @@ def test_cooldown_same_model_name():
num_retries=3,
) # type: ignore
if sync_mode:
response = router.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "hello this request will pass"}],
@ -176,6 +179,23 @@ def test_cooldown_same_model_name():
model_ids[0] != model_ids[1]
) # ensure both models have a uuid added, and they have different names
print("\ngot response\n", response)
else:
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "hello this request will pass"}],
)
print(router.model_list)
model_ids = []
for model in router.model_list:
model_ids.append(model["model_info"]["id"])
print("\n litellm model ids ", model_ids)
# example litellm_model_names ['azure/chatgpt-v-2-ModelID-64321', 'azure/chatgpt-v-2-ModelID-63960']
assert (
model_ids[0] != model_ids[1]
) # ensure both models have a uuid added, and they have different names
print("\ngot response\n", response)
except Exception as e:
pytest.fail(f"Got unexpected exception on router! - {e}")

View file

@ -161,40 +161,56 @@ async def make_async_calls():
return total_time
# def test_langfuse_logging_async_text_completion():
# try:
# pre_langfuse_setup()
# litellm.set_verbose = False
# litellm.success_callback = ["langfuse"]
@pytest.mark.asyncio
@pytest.mark.parametrize("stream", [False, True])
async def test_langfuse_logging_without_request_response(stream):
try:
import uuid
# async def _test_langfuse():
# response = await litellm.atext_completion(
# model="gpt-3.5-turbo-instruct",
# prompt="this is a test",
# max_tokens=5,
# temperature=0.7,
# timeout=5,
# user="test_user",
# stream=True
# )
# async for chunk in response:
# print()
# print(chunk)
# await asyncio.sleep(1)
# return response
_unique_trace_name = f"litellm-test-{str(uuid.uuid4())}"
litellm.set_verbose = True
litellm.turn_off_message_logging = True
litellm.success_callback = ["langfuse"]
response = await litellm.acompletion(
model="gpt-3.5-turbo",
mock_response="It's simple to use and easy to get started",
messages=[{"role": "user", "content": "Hi 👋 - i'm claude"}],
max_tokens=10,
temperature=0.2,
stream=stream,
metadata={"trace_id": _unique_trace_name},
)
print(response)
if stream:
async for chunk in response:
print(chunk)
# response = asyncio.run(_test_langfuse())
# print(f"response: {response}")
await asyncio.sleep(3)
# # # check langfuse.log to see if there was a failed response
# search_logs("langfuse.log")
# except litellm.Timeout as e:
# pass
# except Exception as e:
# pytest.fail(f"An exception occurred - {e}")
import langfuse
langfuse_client = langfuse.Langfuse(
public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
secret_key=os.environ["LANGFUSE_SECRET_KEY"],
)
# test_langfuse_logging_async_text_completion()
# get trace with _unique_trace_name
trace = langfuse_client.get_generations(trace_id=_unique_trace_name)
print("trace_from_langfuse", trace)
_trace_data = trace.data
assert _trace_data[0].input == {"messages": "redacted-by-litellm"}
assert _trace_data[0].output == {
"role": "assistant",
"content": "redacted-by-litellm",
"function_call": None,
"tool_calls": None,
}
except Exception as e:
pytest.fail(f"An exception occurred - {e}")
@pytest.mark.skip(reason="beta test - checking langfuse output")
@ -334,6 +350,220 @@ def test_langfuse_logging_function_calling():
# test_langfuse_logging_function_calling()
def test_langfuse_existing_trace_id():
    """
    When existing trace id is passed, don't set trace params -> prevents overwriting the trace

    - Log a 1st event without metadata (creates a new trace)
    - Log a 2nd event whose metadata carries `existing_trace_id`
    - Assert the 2nd event reuses the trace id and the stored trace is unchanged
    """
    import datetime

    import langfuse
    import litellm
    from litellm.integrations.langfuse import LangFuseLogger

    # Events are logged with the PROJECT2 langfuse credentials
    langfuse_Logger = LangFuseLogger(
        langfuse_public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
        langfuse_secret=os.getenv("LANGFUSE_PROJECT2_SECRET"),
    )
    litellm.success_callback = ["langfuse"]

    def _make_response(content):
        """Build the canned chat completion response; only the assistant text varies."""
        return litellm.ModelResponse(
            id="chatcmpl-9K5HUAbVRqFrMZKXL0WoC295xhguY",
            choices=[
                litellm.Choices(
                    finish_reason="stop",
                    index=0,
                    message=litellm.Message(content=content, role="assistant"),
                )
            ],
            created=1714573888,
            model="gpt-3.5-turbo-0125",
            object="chat.completion",
            system_fingerprint="fp_3b956da36b",
            usage=litellm.Usage(
                completion_tokens=37, prompt_tokens=14, total_tokens=51
            ),
        )

    def _make_log_event_args(response_obj, messages, metadata):
        """Build the keyword arguments for `LangFuseLogger.log_event`.

        Both events share every field except the response object, the
        `messages` entry and the `litellm_params.metadata` entry.
        """
        return {
            "response_obj": response_obj,
            "kwargs": {
                "model": "gpt-3.5-turbo",
                "litellm_params": {
                    "acompletion": False,
                    "api_key": None,
                    "force_timeout": 600,
                    "logger_fn": None,
                    "verbose": False,
                    "custom_llm_provider": "openai",
                    "api_base": "https://api.openai.com/v1/",
                    "litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
                    "model_alias_map": {},
                    "completion_call_id": None,
                    "metadata": metadata,
                    "model_info": None,
                    "proxy_server_request": None,
                    "preset_cache_key": None,
                    "no-log": False,
                    "stream_response": {},
                },
                "messages": messages,
                "optional_params": {"temperature": 0.1, "extra_body": {}},
                "start_time": "2024-05-01 07:31:27.986164",
                "stream": False,
                "user": None,
                "call_type": "completion",
                "litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
                "completion_start_time": "2024-05-01 07:31:29.903685",
                "temperature": 0.1,
                "extra_body": {},
                "input": [{"role": "user", "content": "what's the weather in boston"}],
                "api_key": "my-api-key",
                "additional_args": {
                    "complete_input_dict": {
                        "model": "gpt-3.5-turbo",
                        "messages": [
                            {"role": "user", "content": "what's the weather in boston"}
                        ],
                        "temperature": 0.1,
                        "extra_body": {},
                    }
                },
                "log_event_type": "successful_api_call",
                "end_time": "2024-05-01 07:31:29.903685",
                "cache_hit": None,
                "response_cost": 6.25e-05,
            },
            "start_time": datetime.datetime(2024, 5, 1, 7, 31, 27, 986164),
            "end_time": datetime.datetime(2024, 5, 1, 7, 31, 29, 903685),
            "user_id": None,
            "print_verbose": litellm.print_verbose,
            "level": "DEFAULT",
            "status_message": None,
        }

    ### NEW TRACE ###
    langfuse_args = _make_log_event_args(
        response_obj=_make_response(
            "I'm sorry, I am an AI assistant and do not have real-time information. I recommend checking a reliable weather website or app for the most up-to-date weather information in Boston."
        ),
        messages=[{"role": "user", "content": "what's the weather in boston"}],
        metadata=None,
    )
    langfuse_response_object = langfuse_Logger.log_event(**langfuse_args)

    langfuse_client = langfuse.Langfuse(
        public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
        secret_key=os.getenv("LANGFUSE_PROJECT2_SECRET"),
    )

    trace_id = langfuse_response_object["trace_id"]

    langfuse_client.flush()
    time.sleep(2)  # give langfuse time to ingest the event before reading it back

    initial_langfuse_trace = langfuse_client.get_trace(id=trace_id)
    print(initial_langfuse_trace)

    ### EXISTING TRACE ###
    langfuse_args = _make_log_event_args(
        response_obj=_make_response("What do I know?"),
        messages=[{"role": "user", "content": "What do you know?"}],
        metadata={"existing_trace_id": trace_id},
    )
    langfuse_response_object = langfuse_Logger.log_event(**langfuse_args)

    new_trace_id = langfuse_response_object["trace_id"]
    assert new_trace_id == trace_id

    langfuse_client.flush()
    time.sleep(2)

    new_langfuse_trace = langfuse_client.get_trace(id=trace_id)
    print(new_langfuse_trace)

    # the second event must not have overwritten any trace-level field
    assert dict(initial_langfuse_trace) == dict(new_langfuse_trace)
def test_langfuse_logging_tool_calling():
litellm.set_verbose = True

View file

@ -68,6 +68,7 @@ async def test_get_api_base():
await _pl.alerting_handler(
message=slow_message + request_info,
level="Low",
alert_type="llm_too_slow",
)
print("passed test_get_api_base")

View file

@ -394,6 +394,8 @@ async def test_async_vertexai_response():
pass
except litellm.Timeout as e:
pass
except litellm.APIError as e:
pass
except Exception as e:
pytest.fail(f"An exception occurred: {e}")
@ -636,7 +638,10 @@ def test_gemini_pro_function_calling():
# gemini_pro_function_calling()
def test_gemini_pro_function_calling_streaming():
@pytest.mark.parametrize("stream", [False, True])
@pytest.mark.parametrize("sync_mode", [False, True])
@pytest.mark.asyncio
async def test_gemini_pro_function_calling_streaming(stream, sync_mode):
load_vertex_ai_credentials()
litellm.set_verbose = True
tools = [
@ -665,19 +670,41 @@ def test_gemini_pro_function_calling_streaming():
"content": "What's the weather like in Boston today in fahrenheit?",
}
]
optional_params = {
"tools": tools,
"tool_choice": "auto",
"n": 1,
"stream": stream,
"temperature": 0.1,
}
try:
completion = litellm.completion(
model="gemini-pro",
messages=messages,
tools=tools,
tool_choice="auto",
stream=True,
if sync_mode == True:
response = litellm.completion(
model="gemini-pro", messages=messages, **optional_params
)
print(f"completion: {completion}")
print(f"completion: {response}")
if stream == True:
# assert completion.choices[0].message.content is None
# assert len(completion.choices[0].message.tool_calls) == 1
for chunk in completion:
for chunk in response:
assert isinstance(chunk, litellm.ModelResponse)
else:
assert isinstance(response, litellm.ModelResponse)
else:
response = await litellm.acompletion(
model="gemini-pro", messages=messages, **optional_params
)
print(f"completion: {response}")
if stream == True:
# assert completion.choices[0].message.content is None
# assert len(completion.choices[0].message.tool_calls) == 1
async for chunk in response:
print(f"chunk: {chunk}")
assert isinstance(chunk, litellm.ModelResponse)
else:
assert isinstance(response, litellm.ModelResponse)
except litellm.APIError as e:
pass
except litellm.RateLimitError as e:

View file

@ -57,7 +57,7 @@ def test_completion_custom_provider_model_name():
messages=messages,
logger_fn=logger_fn,
)
# Add any assertions here to, check the response
# Add any assertions here to,check the response
print(response)
print(response["choices"][0]["finish_reason"])
except litellm.Timeout as e:
@ -231,6 +231,76 @@ def test_completion_claude_3_function_call():
pytest.fail(f"Error occurred: {e}")
def test_completion_cohere_command_r_plus_function_call():
    """Run a two-turn tool-calling conversation against `command-r-plus`.

    The first call must return a tool call whose name and arguments are
    strings; the tool result is then appended in OpenAI format and a second
    call is made. Any exception, including a failed assertion, fails the test.
    """
    litellm.set_verbose = True
    weather_tool = {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        },
    }
    tools = [weather_tool]
    messages = [
        {
            "role": "user",
            "content": "What's the weather like in Boston today in Fahrenheit?",
        }
    ]
    try:
        # first turn: no max_tokens set
        response = completion(
            model="command-r-plus",
            messages=messages,
            tools=tools,
            tool_choice="auto",
        )
        print(response)

        assistant_message = response.choices[0].message
        first_tool_call = assistant_message.tool_calls[0]
        assert isinstance(first_tool_call.function.name, str)
        assert isinstance(first_tool_call.function.arguments, str)

        # record the assistant's tool invocation in the conversation
        messages.append(assistant_message.model_dump())

        tool_result = (
            '{"location": "Boston", "temperature": "72", "unit": "fahrenheit"}'
        )
        # user-submitted tool result, in the OpenAI format
        messages.append(
            {
                "tool_call_id": first_tool_call.id,
                "role": "tool",
                "name": first_tool_call.function.name,
                "content": tool_result,
            }
        )

        # second turn: Cohere should deduce the answer from the tool result
        second_response = completion(
            model="command-r-plus",
            messages=messages,
            tools=tools,
            tool_choice="auto",
        )
        print(second_response)
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
def test_parse_xml_params():
from litellm.llms.prompt_templates.factory import parse_xml_params
@ -1291,6 +1361,7 @@ def test_completion_logprobs_stream():
for chunk in response:
# check if atleast one chunk has log probs
print(chunk)
print(f"chunk.choices[0]: {chunk.choices[0]}")
if "logprobs" in chunk.choices[0]:
# assert we got a valid logprob in the choices
assert len(chunk.choices[0].logprobs.content[0].top_logprobs) == 3
@ -1781,7 +1852,6 @@ def test_completion_replicate_llama3():
print("RESPONSE STRING\n", response_str)
if type(response_str) != str:
pytest.fail(f"Error occurred: {e}")
raise Exception("it worked!")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@ -2655,6 +2725,88 @@ def test_completion_palm_stream():
pytest.fail(f"Error occurred: {e}")
def test_completion_watsonx():
    """Call a watsonx chat model synchronously with a stop sequence and token cap.

    `litellm.APIError` is tolerated; any other exception fails the test.
    """
    litellm.set_verbose = True
    request = {
        "model": "watsonx/ibm/granite-13b-chat-v2",
        "messages": messages,
        "stop": ["stop"],
        "max_tokens": 20,
    }
    try:
        response = completion(**request)
        print(response)
    except litellm.APIError:
        # provider-side API errors are not treated as test failures
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
@pytest.mark.parametrize(
    "provider, model, project, region_name, token",
    [
        ("azure", "chatgpt-v-2", None, None, "test-token"),
        ("vertex_ai", "anthropic-claude-3", "adroit-crow-1", "us-east1", None),
        ("watsonx", "ibm/granite", "96946574", "dallas", "1234"),
        ("bedrock", "anthropic.claude-3", None, "us-east-1", None),
    ],
)
def test_unified_auth_params(provider, model, project, region_name, token):
    """
    Check if params = ["project", "region_name", "token"]
    are correctly translated for = ["azure", "vertex_ai", "watsonx", "bedrock"]

    tests get_optional_params
    """
    data = {
        "project": project,
        "region_name": region_name,
        "token": token,
        "custom_llm_provider": provider,
        "model": model,
    }

    translated_optional_params = litellm.utils.get_optional_params(**data)

    # provider -> name of the litellm config class exposing the auth param mapping
    config_class_names = {
        "azure": "AzureOpenAIConfig",
        "bedrock": "AmazonBedrockGlobalConfig",
        "vertex_ai": "VertexAIConfig",
        "watsonx": "IBMWatsonXAIConfig",
    }
    if provider not in config_class_names:
        # fail with a clear message instead of an UnboundLocalError further down
        pytest.fail(f"No special auth param mapping known for provider={provider}")

    config = getattr(litellm, config_class_names[provider])()
    special_auth_params = config.get_mapped_special_auth_params()

    for param, value in special_auth_params.items():
        # every unified param must be one we passed in, and its
        # provider-specific name must show up in the translated params
        assert param in data
        assert value in translated_optional_params
@pytest.mark.asyncio
async def test_acompletion_watsonx():
    """Call a watsonx chat model through `litellm.acompletion`; any exception fails the test."""
    litellm.set_verbose = True
    print("testing watsonx")
    request = {
        "model": "watsonx/ibm/granite-13b-chat-v2",
        "messages": messages,
        "temperature": 0.2,
        "max_tokens": 80,
    }
    try:
        response = await litellm.acompletion(**request)
        print(response)
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
# test_completion_palm_stream()
# test_completion_deep_infra()

View file

@ -328,3 +328,56 @@ def test_dalle_3_azure_cost_tracking():
completion_response=response, call_type="image_generation"
)
assert cost > 0
def test_replicate_llama3_cost_tracking():
    """Check `completion_cost` against per-token prices registered for a replicate model.

    Both the computed and the expected cost are rounded to 5 decimal places
    before being compared.
    """
    litellm.set_verbose = True
    model = "replicate/meta/meta-llama-3-8b-instruct"
    prompt_tokens, completion_tokens = 48, 31

    litellm.register_model(
        {
            model: {
                "input_cost_per_token": 0.00000005,
                "output_cost_per_token": 0.00000025,
                "litellm_provider": "replicate",
            }
        }
    )

    assistant_choice = litellm.utils.Choices(
        finish_reason="stop",
        index=0,
        message=litellm.utils.Message(
            content="I'm doing well, thanks for asking! I'm here to help you with any questions or tasks you may have. How can I assist you today?",
            role="assistant",
        ),
    )
    response = litellm.ModelResponse(
        id="chatcmpl-cad7282f-7f68-41e7-a5ab-9eb33ae301dc",
        choices=[assistant_choice],
        created=1714401369,
        model=model,
        object="chat.completion",
        system_fingerprint=None,
        usage=litellm.utils.Usage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=79,
        ),
    )

    cost = litellm.completion_cost(
        completion_response=response,
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )
    print(f"cost: {cost}")

    # expected cost comes from the prices now stored in litellm.model_cost
    pricing = litellm.model_cost[model]
    expected_cost = round(
        pricing["input_cost_per_token"] * prompt_tokens
        + pricing["output_cost_per_token"] * completion_tokens,
        5,
    )
    assert round(cost, 5) == expected_cost

View file

@ -26,6 +26,9 @@ class DBModel(BaseModel):
model_info: dict
litellm_params: dict
class Config:
protected_namespaces = ()
@pytest.mark.asyncio
async def test_delete_deployment():

View file

@ -529,6 +529,7 @@ def test_chat_bedrock_stream():
@pytest.mark.asyncio
async def test_async_chat_bedrock_stream():
try:
litellm.set_verbose = True
customHandler = CompletionCustomHandler()
litellm.callbacks = [customHandler]
response = await litellm.acompletion(

View file

@ -484,6 +484,20 @@ def test_mistral_embeddings():
pytest.fail(f"Error occurred: {e}")
@pytest.mark.skip(reason="local test")
def test_watsonx_embeddings():
    """Embed one sentence with a watsonx model and check that usage is a `litellm.Usage`."""
    try:
        litellm.set_verbose = True
        embedding_request = {
            "model": "watsonx/ibm/slate-30m-english-rtrvr",
            "input": ["good morning from litellm"],
        }
        response = litellm.embedding(**embedding_request)
        print(f"response: {response}")
        assert isinstance(response.usage, litellm.Usage)
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
# test_mistral_embeddings()

View file

@ -25,7 +25,7 @@ def test_empty_content():
pass
function_setup(
original_function=completion,
original_function="completion",
rules_obj=rules_obj,
start_time=datetime.now(),
messages=[],

View file

@ -136,8 +136,8 @@ def test_image_generation_bedrock():
litellm.set_verbose = True
response = litellm.image_generation(
prompt="A cute baby sea otter",
model="bedrock/stability.stable-diffusion-xl-v0",
aws_region_name="us-east-1",
model="bedrock/stability.stable-diffusion-xl-v1",
aws_region_name="us-west-2",
)
print(f"response: {response}")
except litellm.RateLimitError as e:
@ -156,8 +156,8 @@ async def test_aimage_generation_bedrock_with_optional_params():
try:
response = await litellm.aimage_generation(
prompt="A cute baby sea otter",
model="bedrock/stability.stable-diffusion-xl-v0",
size="128x128",
model="bedrock/stability.stable-diffusion-xl-v1",
size="256x256",
)
print(f"response: {response}")
except litellm.RateLimitError as e:

View file

@ -201,6 +201,7 @@ async def test_router_atext_completion_streaming():
@pytest.mark.asyncio
async def test_router_completion_streaming():
litellm.set_verbose = True
messages = [
{"role": "user", "content": "Hello, can you generate a 500 words poem?"}
]
@ -219,9 +220,9 @@ async def test_router_completion_streaming():
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-35-turbo",
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 2},
@ -229,9 +230,9 @@ async def test_router_completion_streaming():
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-35-turbo",
"api_key": "os.environ/AZURE_CANADA_API_KEY",
"api_base": "https://my-endpoint-canada-berri992.openai.azure.com",
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 3},
@ -262,4 +263,4 @@ async def test_router_completion_streaming():
## check if calls equally distributed
cache_dict = router.cache.get_cache(key=cache_key)
for k, v in cache_dict.items():
assert v == 1
assert v == 1, f"Failed. K={k} called v={v} times, cache_dict={cache_dict}"

View file

@ -555,3 +555,171 @@ async def test_lowest_latency_routing_with_timeouts():
# ALL the Requests should have been routed to the fast-endpoint
assert deployments["fast-endpoint"] == 10
@pytest.mark.asyncio
async def test_lowest_latency_routing_first_pick():
    """
    PROD Test:
    - When all deployments are latency=0, it should randomly pick a deployment
    - IT SHOULD NEVER PICK THE Very First deployment everytime all deployment latencies are 0
    - This ensures that after the ttl window resets it randomly picks a deployment
    """
    import litellm

    litellm.set_verbose = True

    # (upstream model, deployment id) for each deployment in the "azure-model" group
    endpoints = [
        ("openai/fast-endpoint", "fast-endpoint"),
        ("openai/fast-endpoint-2", "fast-endpoint-2"),
        ("openai/fast-endpoint-2", "fast-endpoint-3"),
        ("openai/fast-endpoint-2", "fast-endpoint-4"),
    ]
    model_list = [
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": upstream_model,
                "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
                "api_key": "fake-key",
            },
            "model_info": {"id": deployment_id},
        }
        for upstream_model, deployment_id in endpoints
    ]

    router = Router(
        model_list=model_list,
        routing_strategy="latency-based-routing",
        routing_strategy_args={"ttl": 0.0000000001},
        set_verbose=True,
        debug_level="DEBUG",
    )  # type: ignore

    # deployment id -> number of times it served a request
    pick_counts = {}
    for _ in range(5):
        response = await router.acompletion(
            model="azure-model", messages=[{"role": "user", "content": "hello"}]
        )
        print(response)
        picked_id = response._hidden_params["model_id"]
        pick_counts[picked_id] = pick_counts.get(picked_id, 0) + 1
        await asyncio.sleep(0.000000000005)

    print("deployments", pick_counts)

    # more than one distinct deployment must have been picked
    assert len(pick_counts) > 1
@pytest.mark.parametrize("buffer", [0, 1])
@pytest.mark.asyncio
async def test_lowest_latency_routing_buffer(buffer):
    """
    Allow shuffling calls within a certain latency buffer

    Two deployments get one success event logged each (3s and 2s elapsed).
    With buffer=0 only one deployment is expected to be selected; with
    buffer=1 both are expected to appear across 50 selections.
    """
    model_list = [
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
                "rpm": 1440,
            },
            "model_info": {"id": 1},
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-35-turbo",
                "api_key": "os.environ/AZURE_EUROPE_API_KEY",
                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
                "rpm": 6,
            },
            "model_info": {"id": 2},
        },
    ]
    router = Router(
        model_list=model_list,
        routing_strategy="latency-based-routing",
        set_verbose=False,
        num_retries=3,
        routing_strategy_args={"lowest_latency_buffer": buffer},
    )  # type: ignore

    def _log_success(deployment_id, total_tokens, elapsed_seconds):
        """Log one success event for a deployment, taking `elapsed_seconds` of wall time."""
        kwargs = {
            "litellm_params": {
                "metadata": {
                    "model_group": "azure-model",
                },
                "model_info": {"id": deployment_id},
            }
        }
        start_time = time.time()
        response_obj = {"usage": {"total_tokens": total_tokens}}
        time.sleep(elapsed_seconds)
        end_time = time.time()
        router.lowestlatency_logger.log_success_event(
            response_obj=response_obj,
            kwargs=kwargs,
            start_time=start_time,
            end_time=end_time,
        )

    ## DEPLOYMENT 1 ##
    _log_success(deployment_id=1, total_tokens=50, elapsed_seconds=3)

    ## DEPLOYMENT 2 ##
    _log_success(deployment_id=2, total_tokens=20, elapsed_seconds=2)

    ## CHECK WHAT'S SELECTED ##
    selected_deployment_ids = set()
    for _ in range(50):
        # select once per iteration so the printed pick is the recorded pick
        deployment = router.get_available_deployment(model="azure-model")
        print(deployment)
        selected_deployment_ids.add(deployment["model_info"]["id"])

    expected_distinct = 1 if buffer == 0 else 2
    assert len(selected_deployment_ids) == expected_distinct

View file

@ -0,0 +1,10 @@
import warnings
import pytest
def test_namespace_conflict_warning():
    """Importing litellm must not emit a "conflict with protected namespace" warning."""
    with warnings.catch_warnings(record=True) as recorded_warnings:
        warnings.simplefilter("always")  # Capture all warnings
        import litellm

    # collect every recorded warning that carries the specific message
    namespace_conflicts = [
        w
        for w in recorded_warnings
        if "conflict with protected namespace" in str(w.message)
    ]
    assert (
        not namespace_conflicts
    ), "Test failed: 'conflict with protected namespace' warning was encountered!"

View file

@ -1,7 +1,7 @@
#### What this tests ####
# This tests litellm router
import sys, os, time
import sys, os, time, openai
import traceback, asyncio
import pytest
@ -14,10 +14,169 @@ from litellm.router import Deployment, LiteLLM_Params, ModelInfo
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from dotenv import load_dotenv
import os, httpx
load_dotenv()
@pytest.mark.parametrize("num_retries", [None, 2])
@pytest.mark.parametrize("max_retries", [None, 4])
def test_router_num_retries_init(num_retries, max_retries):
    """
    - router.num_retries equals the passed value, or openai.DEFAULT_MAX_RETRIES when unset
    - the deployment client's max_retries equals the deployment value, or 0 when unset
    """
    deployment = {
        "model_name": "gpt-3.5-turbo",  # openai model name
        "litellm_params": {  # params for litellm completion/embedding call
            "model": "azure/chatgpt-v-2",
            "api_key": "bad-key",
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
            "max_retries": max_retries,
        },
        "model_info": {"id": 12345},
    }
    router = Router(model_list=[deployment], num_retries=num_retries)

    expected_num_retries = (
        openai.DEFAULT_MAX_RETRIES if num_retries is None else num_retries
    )
    assert router.num_retries == expected_num_retries

    model_client = router._get_client(
        {"model_info": {"id": 12345}}, client_type="async", kwargs={}
    )

    expected_client_retries = 0 if max_retries is None else max_retries
    assert model_client.max_retries == expected_client_retries
@pytest.mark.parametrize(
    "timeout", [10, 1.0, httpx.Timeout(timeout=300.0, connect=20.0)]
)
@pytest.mark.parametrize("ssl_verify", [True, False])
def test_router_timeout_init(timeout, ssl_verify):
    """
    Allow user to pass httpx.Timeout

    Also checks that `litellm.ssl_verify` is reflected in the verify mode of
    the client's SSL context.

    related issue - https://github.com/BerriAI/litellm/issues/3162
    """
    # remember the global so this test does not leak its setting into later tests
    previous_ssl_verify = getattr(litellm, "ssl_verify", True)
    litellm.ssl_verify = ssl_verify
    try:
        router = Router(
            model_list=[
                {
                    "model_name": "test-model",
                    "litellm_params": {
                        "model": "azure/chatgpt-v-2",
                        "api_key": os.getenv("AZURE_API_KEY"),
                        "api_base": os.getenv("AZURE_API_BASE"),
                        "api_version": os.getenv("AZURE_API_VERSION"),
                        "timeout": timeout,
                    },
                    "model_info": {"id": 1234},
                }
            ]
        )

        model_client = router._get_client(
            deployment={"model_info": {"id": 1234}},
            client_type="sync_client",
            kwargs={},
        )

        assert getattr(model_client, "timeout") == timeout

        print(f"vars model_client: {vars(model_client)}")
        http_client = getattr(model_client, "_client")
        print(f"http client: {vars(http_client)}, ssl_Verify={ssl_verify}")

        expected_verify_mode = "CERT_REQUIRED" if ssl_verify else "CERT_NONE"
        assert (
            http_client._transport._pool._ssl_context.verify_mode.name
            == expected_verify_mode
        )
    finally:
        litellm.ssl_verify = previous_ssl_verify
@pytest.mark.parametrize("sync_mode", [False, True])
@pytest.mark.asyncio
async def test_router_retries(sync_mode):
    """
    - make sure retries work as expected

    One deployment of the model group has a bad key and the other uses the
    Azure credentials from the environment; the request is sent through the
    sync or async router entry point depending on `sync_mode`.
    """
    bad_key_deployment = {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "bad-key"},
    }
    azure_deployment = {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_base": os.getenv("AZURE_API_BASE"),
            "api_version": os.getenv("AZURE_API_VERSION"),
        },
    }

    router = Router(model_list=[bad_key_deployment, azure_deployment], num_retries=2)

    request = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hey, how's it going?"}],
    }
    if sync_mode:
        router.completion(**request)
    else:
        await router.acompletion(**request)
@pytest.mark.parametrize(
    "mistral_api_base",
    [
        "os.environ/AZURE_MISTRAL_API_BASE",
        "https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1/",
        "https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1",
        "https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/",
        "https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com",
    ],
)
def test_router_azure_ai_studio_init(mistral_api_base):
    """The client's base url must contain "/v1/" and mention "v1" exactly once,
    whichever form of api_base was configured."""
    deployment = {
        "model_name": "test-model",
        "litellm_params": {
            "model": "azure/mistral-large-latest",
            "api_key": "os.environ/AZURE_MISTRAL_API_KEY",
            "api_base": mistral_api_base,
        },
        "model_info": {"id": 1234},
    }
    router = Router(model_list=[deployment])

    model_client = router._get_client(
        deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
    )
    uri_reference = str(model_client._base_url._uri_reference)
    print(f"uri_reference: {uri_reference}")

    assert "/v1/" in uri_reference
    assert uri_reference.count("v1") == 1
def test_exception_raising():
# this tests if the router raises an exception when invalid params are set
# in this test both deployments have bad keys - Keep this test. It validates if the router raises the most recent exception
@ -995,6 +1154,7 @@ def test_consistent_model_id():
assert id1 == id2
@pytest.mark.skip(reason="local test")
def test_reading_keys_os_environ():
import openai
@ -1094,6 +1254,7 @@ def test_reading_keys_os_environ():
# test_reading_keys_os_environ()
@pytest.mark.skip(reason="local test")
def test_reading_openai_keys_os_environ():
import openai

View file

@ -46,6 +46,7 @@ def test_async_fallbacks(caplog):
router = Router(
model_list=model_list,
fallbacks=[{"gpt-3.5-turbo": ["azure/gpt-3.5-turbo"]}],
num_retries=1,
)
user_message = "Hello, how are you?"
@ -81,8 +82,8 @@ def test_async_fallbacks(caplog):
# Define the expected log messages
# - error request, falling back notice, success notice
expected_logs = [
"Intialized router with Routing strategy: simple-shuffle\n\nRouting fallbacks: [{'gpt-3.5-turbo': ['azure/gpt-3.5-turbo']}]\n\nRouting context window fallbacks: None\n\nRouter Redis Caching=None",
"litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m",
"litellm.acompletion(model=None)\x1b[31m Exception No deployments available for selected model, passed model=gpt-3.5-turbo\x1b[0m",
"Falling back to model_group = azure/gpt-3.5-turbo",
"litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m",
]

View file

@ -22,10 +22,10 @@ class MyCustomHandler(CustomLogger):
def log_pre_api_call(self, model, messages, kwargs):
print(f"Pre-API Call")
print(
f"previous_models: {kwargs['litellm_params']['metadata']['previous_models']}"
f"previous_models: {kwargs['litellm_params']['metadata'].get('previous_models', None)}"
)
self.previous_models += len(
kwargs["litellm_params"]["metadata"]["previous_models"]
self.previous_models = len(
kwargs["litellm_params"]["metadata"].get("previous_models", [])
) # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
print(f"self.previous_models: {self.previous_models}")
@ -127,7 +127,7 @@ def test_sync_fallbacks():
response = router.completion(**kwargs)
print(f"response: {response}")
time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback
assert customHandler.previous_models == 4
print("Passed ! Test router_fallbacks: test_sync_fallbacks()")
router.reset()
@ -140,7 +140,7 @@ def test_sync_fallbacks():
@pytest.mark.asyncio
async def test_async_fallbacks():
litellm.set_verbose = False
litellm.set_verbose = True
model_list = [
{ # list of model deployments
"model_name": "azure/gpt-3.5-turbo", # openai model name
@ -209,12 +209,13 @@ async def test_async_fallbacks():
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]
try:
kwargs["model"] = "azure/gpt-3.5-turbo"
response = await router.acompletion(**kwargs)
print(f"customHandler.previous_models: {customHandler.previous_models}")
await asyncio.sleep(
0.05
) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback
assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset()
except litellm.Timeout as e:
pass
@ -268,7 +269,7 @@ def test_sync_fallbacks_embeddings():
response = router.embedding(**kwargs)
print(f"customHandler.previous_models: {customHandler.previous_models}")
time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback
assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset()
except litellm.Timeout as e:
pass
@ -322,7 +323,7 @@ async def test_async_fallbacks_embeddings():
await asyncio.sleep(
0.05
) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback
assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset()
except litellm.Timeout as e:
pass
@ -401,7 +402,7 @@ def test_dynamic_fallbacks_sync():
response = router.completion(**kwargs)
print(f"response: {response}")
time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback
assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset()
except Exception as e:
pytest.fail(f"An exception occurred - {e}")
@ -487,7 +488,7 @@ async def test_dynamic_fallbacks_async():
await asyncio.sleep(
0.05
) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback
assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset()
except Exception as e:
pytest.fail(f"An exception occurred - {e}")
@ -572,7 +573,7 @@ async def test_async_fallbacks_streaming():
await asyncio.sleep(
0.05
) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback
assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset()
except litellm.Timeout as e:
pass
@ -751,7 +752,7 @@ async def test_async_fallbacks_max_retries_per_request():
router.reset()
def test_usage_based_routing_fallbacks():
def test_ausage_based_routing_fallbacks():
try:
# [Prod Test]
# IT tests Usage Based Routing with fallbacks
@ -765,10 +766,10 @@ def test_usage_based_routing_fallbacks():
load_dotenv()
# Constants for TPM and RPM allocation
AZURE_FAST_TPM = 3
AZURE_BASIC_TPM = 4
OPENAI_TPM = 400
ANTHROPIC_TPM = 100000
AZURE_FAST_RPM = 1
AZURE_BASIC_RPM = 1
OPENAI_RPM = 2
ANTHROPIC_RPM = 100000
def get_azure_params(deployment_name: str):
params = {
@ -797,22 +798,26 @@ def test_usage_based_routing_fallbacks():
{
"model_name": "azure/gpt-4-fast",
"litellm_params": get_azure_params("chatgpt-v-2"),
"tpm": AZURE_FAST_TPM,
"model_info": {"id": 1},
"rpm": AZURE_FAST_RPM,
},
{
"model_name": "azure/gpt-4-basic",
"litellm_params": get_azure_params("chatgpt-v-2"),
"tpm": AZURE_BASIC_TPM,
"model_info": {"id": 2},
"rpm": AZURE_BASIC_RPM,
},
{
"model_name": "openai-gpt-4",
"litellm_params": get_openai_params("gpt-3.5-turbo"),
"tpm": OPENAI_TPM,
"model_info": {"id": 3},
"rpm": OPENAI_RPM,
},
{
"model_name": "anthropic-claude-instant-1.2",
"litellm_params": get_anthropic_params("claude-instant-1.2"),
"tpm": ANTHROPIC_TPM,
"model_info": {"id": 4},
"rpm": ANTHROPIC_RPM,
},
]
# litellm.set_verbose=True
@ -830,6 +835,7 @@ def test_usage_based_routing_fallbacks():
routing_strategy="usage-based-routing",
redis_host=os.environ["REDIS_HOST"],
redis_port=os.environ["REDIS_PORT"],
num_retries=0,
)
messages = [
@ -842,10 +848,10 @@ def test_usage_based_routing_fallbacks():
mock_response="very nice to meet you",
)
print("response: ", response)
print("response._hidden_params: ", response._hidden_params)
print(f"response._hidden_params: {response._hidden_params}")
# in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass
# the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM
assert response._hidden_params["custom_llm_provider"] == "openai"
assert response._hidden_params["model_id"] == "1"
# now make 20 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2
for i in range(20):
@ -859,7 +865,7 @@ def test_usage_based_routing_fallbacks():
print("response._hidden_params: ", response._hidden_params)
if i == 19:
# by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2
assert response._hidden_params["custom_llm_provider"] == "anthropic"
assert response._hidden_params["model_id"] == "4"
except Exception as e:
pytest.fail(f"An exception occurred {e}")

View file

@ -203,7 +203,7 @@ def test_timeouts_router():
},
},
]
router = Router(model_list=model_list)
router = Router(model_list=model_list, num_retries=0)
print("PASSED !")
@ -396,7 +396,9 @@ def test_router_init_gpt_4_vision_enhancements():
pytest.fail(f"Error occurred: {e}")
def test_openai_with_organization():
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_openai_with_organization(sync_mode):
try:
print("Testing OpenAI with organization")
model_list = [
@ -418,6 +420,7 @@ def test_openai_with_organization():
print(router.model_list)
print(router.model_list[0])
if sync_mode:
openai_client = router._get_client(
deployment=router.model_list[0],
kwargs={"input": ["hello"], "model": "openai-bad-org"},
@ -433,7 +436,9 @@ def test_openai_with_organization():
model="openai-bad-org",
messages=[{"role": "user", "content": "this is a test"}],
)
pytest.fail("Request should have failed - This organization does not exist")
pytest.fail(
"Request should have failed - This organization does not exist"
)
except Exception as e:
print("Got exception: " + str(e))
assert "No such organization: org-ikDc4ex8NB" in str(e)
@ -444,6 +449,36 @@ def test_openai_with_organization():
messages=[{"role": "user", "content": "this is a test"}],
max_tokens=5,
)
else:
openai_client = router._get_client(
deployment=router.model_list[0],
kwargs={"input": ["hello"], "model": "openai-bad-org"},
client_type="async",
)
print(vars(openai_client))
assert openai_client.organization == "org-ikDc4ex8NB"
# bad org raises error
try:
response = await router.acompletion(
model="openai-bad-org",
messages=[{"role": "user", "content": "this is a test"}],
)
pytest.fail(
"Request should have failed - This organization does not exist"
)
except Exception as e:
print("Got exception: " + str(e))
assert "No such organization: org-ikDc4ex8NB" in str(e)
# good org works
response = await router.acompletion(
model="openai-good-org",
messages=[{"role": "user", "content": "this is a test"}],
max_tokens=5,
)
except Exception as e:
pytest.fail(f"Error occurred: {e}")

View file

@ -0,0 +1,121 @@
#### What this tests ####
# This tests calling router with fallback models
import sys, os, time
import traceback, asyncio
import pytest
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger
class MyCustomHandler(CustomLogger):
    """
    Callback handler used by the retry tests.

    Before every API call it records how many entries are in
    `litellm_params.metadata.previous_models`, so a test can assert on the
    count seen by the most recent call. The remaining hooks only print a
    marker line.
    """

    success: bool = False
    failure: bool = False
    previous_models: int = 0

    def log_pre_api_call(self, model, messages, kwargs):
        """Store the length of `previous_models` from the call metadata (0 if absent)."""
        print("Pre-API Call")
        metadata = kwargs["litellm_params"]["metadata"]
        print(f"previous_models: {metadata.get('previous_models', None)}")
        # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
        self.previous_models = len(metadata.get("previous_models", []))
        print(f"self.previous_models: {self.previous_models}")

    def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
        print(
            f"Post-API Call - response object: {response_obj}; model: {kwargs['model']}"
        )

    def log_stream_event(self, kwargs, response_obj, start_time, end_time):
        print("On Stream")

    # Declared as a coroutine, like `async_log_success_event` below, so that a
    # caller awaiting this hook gets an awaitable rather than `None`.
    async def async_log_stream_event(self, kwargs, response_obj, start_time, end_time):
        print("On Stream")

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        print("On Success")

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        print("On Success")

    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        print("On Failure")
"""
Test sync + async
- Authorization Errors
- Random API Error
"""
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.parametrize("error_type", ["Authorization Error", "API Error"])
@pytest.mark.asyncio
async def test_router_retries_errors(sync_mode, error_type):
    """
    Expected number of earlier attempts seen by the last pre-call hook:
    - Auth Error -> 0 retries
    - API Error -> 2 retries
    """
    is_auth_error = error_type == "Authorization Error"

    # A bad key triggers the auth failure; the API error is simulated through
    # `mock_response` further down, so a real key is used for that case.
    if is_auth_error:
        _api_key = "bad-key"
    else:
        _api_key = os.getenv("AZURE_API_KEY")
    print(f"_api_key: {_api_key}")

    azure_deployment = {
        "model_name": "azure/gpt-3.5-turbo",  # openai model name
        "litellm_params": {  # params for litellm completion/embedding call
            "model": "azure/chatgpt-functioncalling",
            "api_key": _api_key,
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
        "tpm": 240000,
        "rpm": 1800,
    }
    router = Router(model_list=[azure_deployment], allowed_fails=3)

    customHandler = MyCustomHandler()
    litellm.callbacks = [customHandler]

    simulated_failure = None if is_auth_error else Exception("Invalid Request")
    request_kwargs = {
        "model": "azure/gpt-3.5-turbo",
        "messages": [{"content": "Hello, how are you?", "role": "user"}],
        "mock_response": simulated_failure,
    }

    # The call is expected to fail; only the callback state matters afterwards.
    try:
        if sync_mode:
            router.completion(**request_kwargs)
        else:
            await router.acompletion(**request_kwargs)
    except Exception:
        pass

    # allow a delay as success_callbacks are on a separate thread
    await asyncio.sleep(0.05)
    print(f"customHandler.previous_models: {customHandler.previous_models}")

    expected_retries = 0 if is_auth_error else 2
    assert customHandler.previous_models == expected_retries

View file

@ -57,6 +57,7 @@ def test_router_timeouts():
redis_password=os.getenv("REDIS_PASSWORD"),
redis_port=int(os.getenv("REDIS_PORT")),
timeout=10,
num_retries=0,
)
print("***** TPM SETTINGS *****")
@ -89,15 +90,15 @@ def test_router_timeouts():
@pytest.mark.asyncio
async def test_router_timeouts_bedrock():
import openai
import openai, uuid
# Model list for OpenAI and Anthropic models
model_list = [
_model_list = [
{
"model_name": "bedrock",
"litellm_params": {
"model": "bedrock/anthropic.claude-instant-v1",
"timeout": 0.001,
"timeout": 0.00001,
},
"tpm": 80000,
},
@ -105,17 +106,18 @@ async def test_router_timeouts_bedrock():
# Configure router
router = Router(
model_list=model_list,
model_list=_model_list,
routing_strategy="usage-based-routing",
debug_level="DEBUG",
set_verbose=True,
num_retries=0,
)
litellm.set_verbose = True
try:
response = await router.acompletion(
model="bedrock",
messages=[{"role": "user", "content": "hello, who are u"}],
messages=[{"role": "user", "content": f"hello, who are u {uuid.uuid4()}"}],
)
print(response)
pytest.fail("Did not raise error `openai.APITimeoutError`")

View file

@ -518,7 +518,7 @@ async def test_acompletion_gemini_stream():
litellm.set_verbose = True
print("Streaming gemini response")
messages = [
{"role": "system", "content": "You are a helpful assistant."},
# {"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "What do you know?",
@ -1271,6 +1271,33 @@ def test_completion_sagemaker_stream():
pytest.fail(f"Error occurred: {e}")
def test_completion_watsonx_stream():
    """
    Stream a short watsonx completion and check that every chunk passes
    `streaming_format_tests`, that a finish reason is reported, and that
    some text was received before it.
    """
    litellm.set_verbose = True
    try:
        stream = completion(
            model="watsonx/ibm/granite-13b-chat-v2",
            messages=messages,
            temperature=0.5,
            max_tokens=20,
            stream=True,
        )

        complete_response = ""
        has_finish_reason = False
        for position, raw_chunk in enumerate(stream):
            text, finished = streaming_format_tests(position, raw_chunk)
            has_finish_reason = finished
            if finished:
                # The finishing chunk's text is not appended.
                break
            complete_response += text

        if has_finish_reason is False:
            raise Exception("finish reason not set for last chunk")
        if not complete_response.strip():
            raise Exception("Empty response received")
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
# test_completion_sagemaker_stream()
@ -2446,6 +2473,34 @@ class ModelResponseIterator:
return self.model_response
class ModelResponseListIterator:
    """
    Replays a pre-built list of responses, one per step.

    The object is its own sync and async iterator. Both protocols advance
    the same `index`, so an item consumed through one is not seen again
    through the other.
    """

    def __init__(self, model_responses):
        self.model_responses = model_responses
        self.index = 0

    # ---- sync protocol ----
    def __iter__(self):
        return self

    def __next__(self):
        cursor = self.index
        if cursor < len(self.model_responses):
            item = self.model_responses[cursor]
            self.index = cursor + 1
            return item
        raise StopIteration

    # ---- async protocol ----
    def __aiter__(self):
        return self

    async def __anext__(self):
        cursor = self.index
        if cursor < len(self.model_responses):
            item = self.model_responses[cursor]
            self.index = cursor + 1
            return item
        raise StopAsyncIteration
def test_unit_test_custom_stream_wrapper():
"""
Test if last streaming chunk ends with '?', if the message repeats itself.
@ -2486,3 +2541,268 @@ def test_unit_test_custom_stream_wrapper():
if "How are you?" in chunk.choices[0].delta.content:
freq += 1
assert freq == 1
def test_aamazing_unit_test_custom_stream_wrapper_n():
    """
    Test if the translated output maps exactly to the received openai input

    The raw OpenAI-style chunks below (two interleaved choices, index 0 and 1,
    with logprobs) are wrapped into `litellm.ModelResponse` objects, replayed
    through `litellm.CustomStreamWrapper`, and each emitted chunk is compared
    with the raw chunk at the same position.

    Relevant issue: https://github.com/BerriAI/litellm/issues/3276
    """
    chunks = [
        {
            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
            "object": "chat.completion.chunk",
            "created": 1714075272,
            "model": "gpt-4-0613",
            "system_fingerprint": None,
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": "It"},
                    "logprobs": {
                        "content": [
                            {
                                "token": "It",
                                "logprob": -1.5952516,
                                "bytes": [73, 116],
                                "top_logprobs": [
                                    {
                                        "token": "Brown",
                                        "logprob": -0.7358765,
                                        "bytes": [66, 114, 111, 119, 110],
                                    }
                                ],
                            }
                        ]
                    },
                    "finish_reason": None,
                }
            ],
        },
        {
            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
            "object": "chat.completion.chunk",
            "created": 1714075272,
            "model": "gpt-4-0613",
            "system_fingerprint": None,
            "choices": [
                {
                    "index": 1,
                    "delta": {"content": "Brown"},
                    "logprobs": {
                        "content": [
                            {
                                "token": "Brown",
                                "logprob": -0.7358765,
                                "bytes": [66, 114, 111, 119, 110],
                                "top_logprobs": [
                                    {
                                        "token": "Brown",
                                        "logprob": -0.7358765,
                                        "bytes": [66, 114, 111, 119, 110],
                                    }
                                ],
                            }
                        ]
                    },
                    "finish_reason": None,
                }
            ],
        },
        {
            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
            "object": "chat.completion.chunk",
            "created": 1714075272,
            "model": "gpt-4-0613",
            "system_fingerprint": None,
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": "'s"},
                    "logprobs": {
                        "content": [
                            {
                                "token": "'s",
                                "logprob": -0.006786893,
                                "bytes": [39, 115],
                                "top_logprobs": [
                                    {
                                        "token": "'s",
                                        "logprob": -0.006786893,
                                        "bytes": [39, 115],
                                    }
                                ],
                            }
                        ]
                    },
                    "finish_reason": None,
                }
            ],
        },
        {
            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
            "object": "chat.completion.chunk",
            "created": 1714075272,
            "model": "gpt-4-0613",
            "system_fingerprint": None,
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": " impossible"},
                    "logprobs": {
                        "content": [
                            {
                                "token": " impossible",
                                "logprob": -0.06528423,
                                "bytes": [32, 105, 109, 112, 111, 115, 115, 105, 98, 108, 101],
                                "top_logprobs": [
                                    {
                                        "token": " impossible",
                                        "logprob": -0.06528423,
                                        "bytes": [32, 105, 109, 112, 111, 115, 115, 105, 98, 108, 101],
                                    }
                                ],
                            }
                        ]
                    },
                    "finish_reason": None,
                }
            ],
        },
        {
            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
            "object": "chat.completion.chunk",
            "created": 1714075272,
            "model": "gpt-4-0613",
            "system_fingerprint": None,
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": "—even"},
                    "logprobs": {
                        "content": [
                            {
                                "token": "—even",
                                "logprob": -9999.0,
                                "bytes": [226, 128, 148, 101, 118, 101, 110],
                                "top_logprobs": [
                                    {
                                        "token": " to",
                                        "logprob": -0.12302828,
                                        "bytes": [32, 116, 111],
                                    }
                                ],
                            }
                        ]
                    },
                    "finish_reason": None,
                }
            ],
        },
        {
            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
            "object": "chat.completion.chunk",
            "created": 1714075272,
            "model": "gpt-4-0613",
            "system_fingerprint": None,
            "choices": [
                {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "length"}
            ],
        },
        {
            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
            "object": "chat.completion.chunk",
            "created": 1714075272,
            "model": "gpt-4-0613",
            "system_fingerprint": None,
            "choices": [
                {"index": 1, "delta": {}, "logprobs": None, "finish_reason": "stop"}
            ],
        },
    ]

    litellm.set_verbose = True

    # Wrap every raw chunk into a streaming ModelResponse.
    chunk_list = []
    for chunk in chunks:
        new_chunk = litellm.ModelResponse(stream=True, id=chunk["id"])
        if "choices" in chunk and isinstance(chunk["choices"], list):
            print("INSIDE CHUNK CHOICES!")
            new_choices = []
            for choice in chunk["choices"]:
                if isinstance(choice, litellm.utils.StreamingChoices):
                    _new_choice = choice
                elif isinstance(choice, dict):
                    _new_choice = litellm.utils.StreamingChoices(**choice)
                else:
                    # Fail loudly rather than reuse a stale/unbound `_new_choice`.
                    raise TypeError(
                        f"Unsupported choice type in test fixture: {type(choice)!r}"
                    )
                new_choices.append(_new_choice)
            new_chunk.choices = new_choices
        chunk_list.append(new_chunk)

    completion_stream = ModelResponseListIterator(model_responses=chunk_list)

    response = litellm.CustomStreamWrapper(
        completion_stream=completion_stream,
        model="gpt-4-0613",
        custom_llm_provider="cached_response",
        logging_obj=litellm.Logging(
            model="gpt-4-0613",
            messages=[{"role": "user", "content": "Hey"}],
            stream=True,
            call_type="completion",
            start_time=time.time(),
            litellm_call_id="12345",
            function_id="1245",
        ),
    )

    for idx, chunk in enumerate(response):
        # Prefer `model_dump`, fall back to `.dict()` if that call fails.
        # `except Exception` keeps the fallback but no longer swallows
        # KeyboardInterrupt / SystemExit the way a bare `except:` does.
        try:
            chunk_dict = chunk.model_dump(exclude_none=True)
        except Exception:
            chunk_dict = chunk.dict(exclude_none=True)

        # "created" is dropped from both sides before comparing.
        chunk_dict.pop("created")
        chunks[idx].pop("created")
        if chunks[idx]["system_fingerprint"] is None:
            chunks[idx].pop("system_fingerprint", None)

        # For the first emitted chunk only, drop any "role" key from the
        # translated deltas; the raw chunk's delta has no "role".
        if idx == 0:
            for choice in chunk_dict["choices"]:
                if "role" in choice["delta"]:
                    choice["delta"].pop("role")

        for choice in chunks[idx]["choices"]:
            # ignore finish reason None - since our pydantic object is set to exclude_none = true
            if "finish_reason" in choice and choice["finish_reason"] is None:
                choice.pop("finish_reason")
            if "logprobs" in choice and choice["logprobs"] is None:
                choice.pop("logprobs")

        assert (
            chunk_dict == chunks[idx]
        ), f"idx={idx} translated chunk = {chunk_dict} != openai chunk = {chunks[idx]}"

View file

@ -78,7 +78,8 @@ def test_hanging_request_azure():
"model_name": "openai-gpt",
"litellm_params": {"model": "gpt-3.5-turbo"},
},
]
],
num_retries=0,
)
encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@ -131,7 +132,8 @@ def test_hanging_request_openai():
"model_name": "openai-gpt",
"litellm_params": {"model": "gpt-3.5-turbo"},
},
]
],
num_retries=0,
)
encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@ -189,6 +191,7 @@ def test_timeout_streaming():
# test_timeout_streaming()
@pytest.mark.skip(reason="local test")
def test_timeout_ollama():
# this Will Raise a timeout
import litellm

View file

@ -282,6 +282,64 @@ def test_router_skip_rate_limited_deployments():
print(f"An exception occurred! {str(e)}")
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_multiple_potential_deployments(sync_mode):
    """
    If multiple deployments have the same tpm value
    call 1000 times, test if deployments are shuffled.

    -> prevents single deployment from being overloaded in high-concurrency scenario
    """
    model_list = [
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
                "tpm": 1440,
            },
        },
        {
            "model_name": "azure-model",
            "litellm_params": {
                "model": "azure/gpt-turbo-2",
                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
                "api_base": "https://openai-france-1234.openai.azure.com",
                "tpm": 1440,
            },
        },
    ]
    router = Router(
        model_list=model_list,
        routing_strategy="usage-based-routing-v2",
        set_verbose=False,
        num_retries=3,
    )  # type: ignore

    # Collect the distinct deployment ids the router hands out.
    model_ids = set()
    for _ in range(1000):
        if sync_mode:
            deployment = router.get_available_deployment(
                model="azure-model",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
            )
        else:
            deployment = await router.async_get_available_deployment(
                model="azure-model",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
            )

        ## get id ##
        # Named `deployment_id` so the builtin `id` is not shadowed.
        deployment_id = deployment.get("model_info", {}).get("id")
        model_ids.add(deployment_id)

    # Both equally-rated deployments must have been picked at least once.
    assert len(model_ids) == 2
def test_single_deployment_tpm_zero():
import litellm
import os

View file

@ -1,5 +1,5 @@
from typing import List, Optional, Union, Dict, Tuple, Literal
import httpx
from pydantic import BaseModel, validator
from .completion import CompletionRequest
from .embedding import EmbeddingRequest
@ -104,11 +104,13 @@ class LiteLLM_Params(BaseModel):
api_key: Optional[str] = None
api_base: Optional[str] = None
api_version: Optional[str] = None
timeout: Optional[Union[float, str]] = None # if str, pass in as os.environ/
timeout: Optional[Union[float, str, httpx.Timeout]] = (
None # if str, pass in as os.environ/
)
stream_timeout: Optional[Union[float, str]] = (
None # timeout when making stream=True calls, if str, pass in as os.environ/
)
max_retries: int = 2 # follows openai default of 2
max_retries: Optional[int] = None
organization: Optional[str] = None # for openai orgs
## VERTEX AI ##
vertex_project: Optional[str] = None
@ -146,14 +148,13 @@ class LiteLLM_Params(BaseModel):
args.pop("self", None)
args.pop("params", None)
args.pop("__class__", None)
if max_retries is None:
max_retries = 2
elif isinstance(max_retries, str):
if max_retries is not None and isinstance(max_retries, str):
max_retries = int(max_retries) # cast to int
super().__init__(max_retries=max_retries, **args, **params)
class Config:
extra = "allow"
arbitrary_types_allowed = True
def __contains__(self, key):
# Define custom behavior for the 'in' operator
@ -201,6 +202,9 @@ class updateDeployment(BaseModel):
litellm_params: Optional[updateLiteLLMParams] = None
model_info: Optional[ModelInfo] = None
class Config:
protected_namespaces = ()
class Deployment(BaseModel):
model_name: str
@ -259,3 +263,4 @@ class RouterErrors(enum.Enum):
"""
user_defined_ratelimit_error = "Deployment over user-defined ratelimit."
no_deployments_available = "No deployments available for selected model"

View file

@ -19,6 +19,7 @@ from functools import wraps
import datetime, time
import tiktoken
import uuid
from pydantic import BaseModel
import aiohttp
import textwrap
import logging
@ -69,6 +70,7 @@ from .integrations.langsmith import LangsmithLogger
from .integrations.weights_biases import WeightsBiasesLogger
from .integrations.custom_logger import CustomLogger
from .integrations.langfuse import LangFuseLogger
from .integrations.openmeter import OpenMeterLogger
from .integrations.datadog import DataDogLogger
from .integrations.prometheus import PrometheusLogger
from .integrations.prometheus_services import PrometheusServicesLogger
@ -105,7 +107,7 @@ try:
except Exception as e:
verbose_logger.debug(f"Exception import enterprise features {str(e)}")
from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO
from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO, Iterable
from .caching import Cache
from concurrent.futures import ThreadPoolExecutor
@ -129,6 +131,7 @@ langsmithLogger = None
weightsBiasesLogger = None
customLogger = None
langFuseLogger = None
openMeterLogger = None
dataDogLogger = None
prometheusLogger = None
dynamoLogger = None
@ -219,6 +222,61 @@ def map_finish_reason(
return finish_reason
class TopLogprob(OpenAIObject):
    # A single candidate token with its log probability.

    # The token.
    token: str

    # UTF-8 bytes of the token as a list of integers. Useful when a character
    # is spread over several tokens and the byte sequences have to be combined
    # to rebuild the correct text. Can be null when the token has no bytes
    # representation.
    bytes: Optional[List[int]] = None

    # Log probability of this token if it is within the top 20 most likely
    # tokens; otherwise `-9999.0` is used to signify that the token is very
    # unlikely.
    logprob: float
class ChatCompletionTokenLogprob(OpenAIObject):
    # Log probability details for one token, plus the most likely
    # alternatives at the same position.

    # The token.
    token: str

    # UTF-8 bytes of the token as a list of integers. Useful when a character
    # is spread over several tokens and the byte sequences have to be combined
    # to rebuild the correct text. Can be null when the token has no bytes
    # representation.
    bytes: Optional[List[int]] = None

    # Log probability of this token if it is within the top 20 most likely
    # tokens; otherwise `-9999.0` is used to signify that the token is very
    # unlikely.
    logprob: float

    # The most likely tokens and their log probabilities at this token
    # position. In rare cases fewer entries than the requested number of
    # `top_logprobs` are returned.
    top_logprobs: List[TopLogprob]
class ChoiceLogprobs(OpenAIObject):
    # Log probability information for a choice.

    # Message content tokens with their log probability information
    # (None when not provided).
    content: Optional[List[ChatCompletionTokenLogprob]] = None
class FunctionCall(OpenAIObject):
arguments: str
name: Optional[str] = None
@ -320,19 +378,19 @@ class Message(OpenAIObject):
super(Message, self).__init__(**params)
self.content = content
self.role = role
self.tool_calls = None
self.function_call = None
if function_call is not None:
self.function_call = FunctionCall(**function_call)
if tool_calls is not None:
self.tool_calls = []
for tool_call in tool_calls:
if isinstance(tool_call, dict):
self.tool_calls.append(ChatCompletionMessageToolCall(**tool_call))
else:
self.tool_calls.append(tool_call)
self.tool_calls = [
ChatCompletionMessageToolCall(**tool_call) for tool_call in tool_calls
]
if logprobs is not None:
self._logprobs = logprobs
self._logprobs = ChoiceLogprobs(**logprobs)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
@ -355,12 +413,20 @@ class Message(OpenAIObject):
class Delta(OpenAIObject):
tool_calls: Optional[List[ChatCompletionDeltaToolCall]] = None
def __init__(
self, content=None, role=None, function_call=None, tool_calls=None, **params
self,
content=None,
role=None,
function_call=None,
tool_calls=None,
**params,
):
super(Delta, self).__init__(**params)
self.content = content
self.role = role
if function_call is not None and isinstance(function_call, dict):
self.function_call = FunctionCall(**function_call)
else:
@ -410,7 +476,7 @@ class Choices(OpenAIObject):
) # set finish_reason for all responses
self.index = index
if message is None:
self.message = Message(content=None)
self.message = Message()
else:
if isinstance(message, Message):
self.message = message
@ -492,7 +558,11 @@ class StreamingChoices(OpenAIObject):
self.delta = Delta()
if enhancements is not None:
self.enhancements = enhancements
self.logprobs = logprobs
if logprobs is not None and isinstance(logprobs, dict):
self.logprobs = ChoiceLogprobs(**logprobs)
else:
self.logprobs = logprobs # type: ignore
def __contains__(self, key):
# Define custom behavior for the 'in' operator
@ -1139,6 +1209,13 @@ class Logging:
if verbose_logger.level == 0:
# this means verbose logger was not switched on - user is in litellm.set_verbose=True
print_verbose(f"\033[92m{curl_command}\033[0m\n")
if litellm.json_logs:
verbose_logger.info(
"POST Request Sent from LiteLLM",
extra={"api_base": {api_base}, **masked_headers},
)
else:
verbose_logger.info(f"\033[92m{curl_command}\033[0m\n")
if self.logger_fn and callable(self.logger_fn):
try:
@ -1149,7 +1226,6 @@ class Logging:
print_verbose(
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
)
# Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
callbacks = litellm.input_callback + self.dynamic_input_callbacks
for callback in callbacks:
@ -1166,29 +1242,20 @@ class Logging:
litellm_call_id=self.litellm_params["litellm_call_id"],
print_verbose=print_verbose,
)
elif callback == "lite_debugger":
print_verbose(
f"reaches litedebugger for logging! - model_call_details {self.model_call_details}"
)
model = self.model_call_details["model"]
messages = self.model_call_details["input"]
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
liteDebuggerClient.input_log_event(
model=model,
messages=messages,
end_user=self.model_call_details.get("user", "default"),
litellm_call_id=self.litellm_params["litellm_call_id"],
litellm_params=self.model_call_details["litellm_params"],
optional_params=self.model_call_details["optional_params"],
print_verbose=print_verbose,
call_type=self.call_type,
)
elif callback == "sentry" and add_breadcrumb:
print_verbose("reaches sentry breadcrumbing")
try:
details_to_log = copy.deepcopy(self.model_call_details)
except:
details_to_log = self.model_call_details
if litellm.turn_off_message_logging:
# make a copy of the _model_Call_details and log it
details_to_log.pop("messages", None)
details_to_log.pop("input", None)
details_to_log.pop("prompt", None)
add_breadcrumb(
category="litellm.llm_call",
message=f"Model Call Details pre-call: {self.model_call_details}",
message=f"Model Call Details pre-call: {details_to_log}",
level="info",
)
elif isinstance(callback, CustomLogger): # custom logger class
@ -1252,7 +1319,7 @@ class Logging:
print_verbose(
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
)
self.redact_message_input_output_from_logging(result=original_response)
# Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
callbacks = litellm.input_callback + self.dynamic_input_callbacks
@ -1270,9 +1337,19 @@ class Logging:
)
elif callback == "sentry" and add_breadcrumb:
print_verbose("reaches sentry breadcrumbing")
try:
details_to_log = copy.deepcopy(self.model_call_details)
except:
details_to_log = self.model_call_details
if litellm.turn_off_message_logging:
# make a copy of the _model_Call_details and log it
details_to_log.pop("messages", None)
details_to_log.pop("input", None)
details_to_log.pop("prompt", None)
add_breadcrumb(
category="litellm.llm_call",
message=f"Model Call Details post-call: {self.model_call_details}",
message=f"Model Call Details post-call: {details_to_log}",
level="info",
)
elif isinstance(callback, CustomLogger): # custom logger class
@ -1464,6 +1541,8 @@ class Logging:
else:
callbacks = litellm.success_callback
self.redact_message_input_output_from_logging(result=result)
for callback in callbacks:
try:
litellm_params = self.model_call_details.get("litellm_params", {})
@ -1850,6 +1929,51 @@ class Logging:
end_time=end_time,
print_verbose=print_verbose,
)
if (
callback == "openmeter"
and self.model_call_details.get("litellm_params", {}).get(
"acompletion", False
)
== False
and self.model_call_details.get("litellm_params", {}).get(
"aembedding", False
)
== False
and self.model_call_details.get("litellm_params", {}).get(
"aimage_generation", False
)
== False
and self.model_call_details.get("litellm_params", {}).get(
"atranscription", False
)
== False
):
global openMeterLogger
if openMeterLogger is None:
print_verbose("Instantiates openmeter client")
openMeterLogger = OpenMeterLogger()
if self.stream and complete_streaming_response is None:
openMeterLogger.log_stream_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
else:
if self.stream and complete_streaming_response:
self.model_call_details["complete_response"] = (
self.model_call_details.get(
"complete_streaming_response", {}
)
)
result = self.model_call_details["complete_response"]
openMeterLogger.log_success_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
if (
isinstance(callback, CustomLogger)
and self.model_call_details.get("litellm_params", {}).get(
@ -2008,7 +2132,9 @@ class Logging:
callbacks.append(callback)
else:
callbacks = litellm._async_success_callback
print_verbose(f"Async success callbacks: {callbacks}")
self.redact_message_input_output_from_logging(result=result)
for callback in callbacks:
# check if callback can run for this request
litellm_params = self.model_call_details.get("litellm_params", {})
@ -2046,6 +2172,35 @@ class Logging:
await litellm.cache.async_add_cache(result, **kwargs)
else:
litellm.cache.add_cache(result, **kwargs)
if callback == "openmeter":
global openMeterLogger
if self.stream == True:
if (
"async_complete_streaming_response"
in self.model_call_details
):
await openMeterLogger.async_log_success_event(
kwargs=self.model_call_details,
response_obj=self.model_call_details[
"async_complete_streaming_response"
],
start_time=start_time,
end_time=end_time,
)
else:
await openMeterLogger.async_log_stream_event( # [TODO]: move this to being an async log stream event function
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
else:
await openMeterLogger.async_log_success_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
if isinstance(callback, CustomLogger): # custom logger class
if self.stream == True:
if (
@ -2169,7 +2324,10 @@ class Logging:
start_time=start_time,
end_time=end_time,
)
result = None # result sent to all loggers, init this to None incase it's not created
self.redact_message_input_output_from_logging(result=result)
for callback in litellm.failure_callback:
try:
if callback == "lite_debugger":
@ -2354,6 +2512,39 @@ class Logging:
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}"
)
def redact_message_input_output_from_logging(self, result):
    """
    Scrub prompt and response text from the data that gets logged.

    Does nothing unless litellm.turn_off_message_logging == True.
    When enabled, the "messages", "prompt" and "input" entries of
    self.model_call_details are overwritten, and the content of every
    choice in the response is replaced with a placeholder. All changes
    are made in-place on the objects passed in / stored on self.

    Args:
        result: the response object for this call (may be None). Only
            inspected when no complete streaming response is stored in
            self.model_call_details.
    """
    # user did not opt out of logging message/response to callbacks
    if litellm.turn_off_message_logging != True:
        return

    placeholder = "redacted-by-litellm"
    details = self.model_call_details

    # request side: messages, prompt, input
    details["messages"] = placeholder
    details["prompt"] = ""
    details["input"] = ""

    # response side: pick whichever set of choices has to be cleaned
    if self.stream and "complete_streaming_response" in details:
        choices_to_clean = details["complete_streaming_response"].choices
    elif (
        result is not None
        and isinstance(result, litellm.ModelResponse)
        and hasattr(result, "choices")
        and result.choices is not None
    ):
        choices_to_clean = result.choices
    else:
        return

    for entry in choices_to_clean:
        if isinstance(entry, litellm.Choices):
            entry.message.content = placeholder
        elif isinstance(entry, litellm.utils.StreamingChoices):
            entry.delta.content = placeholder
def exception_logging(
additional_args={},
@ -2436,7 +2627,7 @@ class Rules:
####### CLIENT ###################
# make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking
def function_setup(
original_function, rules_obj, start_time, *args, **kwargs
original_function: str, rules_obj, start_time, *args, **kwargs
): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc.
try:
global callback_list, add_breadcrumb, user_logger_fn, Logging
@ -2460,10 +2651,12 @@ def function_setup(
len(litellm.input_callback) > 0
or len(litellm.success_callback) > 0
or len(litellm.failure_callback) > 0
) and len(callback_list) == 0:
) and len(
callback_list # type: ignore
) == 0: # type: ignore
callback_list = list(
set(
litellm.input_callback
litellm.input_callback # type: ignore
+ litellm.success_callback
+ litellm.failure_callback
)
@ -2472,7 +2665,7 @@ def function_setup(
## ASYNC CALLBACKS
if len(litellm.input_callback) > 0:
removed_async_items = []
for index, callback in enumerate(litellm.input_callback):
for index, callback in enumerate(litellm.input_callback): # type: ignore
if inspect.iscoroutinefunction(callback):
litellm._async_input_callback.append(callback)
removed_async_items.append(index)
@ -2483,11 +2676,11 @@ def function_setup(
if len(litellm.success_callback) > 0:
removed_async_items = []
for index, callback in enumerate(litellm.success_callback):
for index, callback in enumerate(litellm.success_callback): # type: ignore
if inspect.iscoroutinefunction(callback):
litellm._async_success_callback.append(callback)
removed_async_items.append(index)
elif callback == "dynamodb":
elif callback == "dynamodb" or callback == "openmeter":
# dynamo is an async callback, it's used for the proxy and needs to be async
# we only support async dynamo db logging for acompletion/aembedding since that's used on proxy
litellm._async_success_callback.append(callback)
@ -2499,7 +2692,7 @@ def function_setup(
if len(litellm.failure_callback) > 0:
removed_async_items = []
for index, callback in enumerate(litellm.failure_callback):
for index, callback in enumerate(litellm.failure_callback): # type: ignore
if inspect.iscoroutinefunction(callback):
litellm._async_failure_callback.append(callback)
removed_async_items.append(index)
@ -2533,16 +2726,26 @@ def function_setup(
dynamic_success_callbacks = kwargs.pop("success_callback")
if add_breadcrumb:
try:
details_to_log = copy.deepcopy(kwargs)
except:
details_to_log = kwargs
if litellm.turn_off_message_logging:
# make a copy of the _model_Call_details and log it
details_to_log.pop("messages", None)
details_to_log.pop("input", None)
details_to_log.pop("prompt", None)
add_breadcrumb(
category="litellm.llm_call",
message=f"Positional Args: {args}, Keyword Args: {kwargs}",
message=f"Positional Args: {args}, Keyword Args: {details_to_log}",
level="info",
)
if "logger_fn" in kwargs:
user_logger_fn = kwargs["logger_fn"]
# INIT LOGGER - for user-specified integrations
model = args[0] if len(args) > 0 else kwargs.get("model", None)
call_type = original_function.__name__
call_type = original_function
if (
call_type == CallTypes.completion.value
or call_type == CallTypes.acompletion.value
@ -2724,7 +2927,7 @@ def client(original_function):
try:
if logging_obj is None:
logging_obj, kwargs = function_setup(
original_function, rules_obj, start_time, *args, **kwargs
original_function.__name__, rules_obj, start_time, *args, **kwargs
)
kwargs["litellm_logging_obj"] = logging_obj
@ -3033,7 +3236,7 @@ def client(original_function):
try:
if logging_obj is None:
logging_obj, kwargs = function_setup(
original_function, rules_obj, start_time, *args, **kwargs
original_function.__name__, rules_obj, start_time, *args, **kwargs
)
kwargs["litellm_logging_obj"] = logging_obj
@ -3540,12 +3743,12 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now
)
if total_time == 0.0:
if total_time == 0.0: # total time is in ms
start_time = completion_response["created"]
end_time = completion_response["ended"]
total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time
return a100_80gb_price_per_second_public * total_time / 1000
def _select_tokenizer(model: str):
@ -3567,7 +3770,7 @@ def _select_tokenizer(model: str):
tokenizer = Tokenizer.from_str(json_str)
return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
# llama2
elif "llama-2" in model.lower():
elif "llama-2" in model.lower() or "replicate" in model.lower():
tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
# default - tiktoken
@ -4168,7 +4371,10 @@ def completion_cost(
model = get_model_params_and_category(model)
# replicate llm costs are calculated based on how long the request runs
# see https://replicate.com/pricing
elif model in litellm.replicate_models or "replicate" in model:
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time)
(
@ -4554,7 +4760,36 @@ def get_optional_params(
k.startswith("vertex_") and custom_llm_provider != "vertex_ai"
): # allow dynamically setting vertex ai init logic
continue
passed_params[k] = v
optional_params = {}
common_auth_dict = litellm.common_cloud_provider_auth_params
if custom_llm_provider in common_auth_dict["providers"]:
"""
Check if params = ["project", "region_name", "token"]
and correctly translate for = ["azure", "vertex_ai", "watsonx", "aws"]
"""
if custom_llm_provider == "azure":
optional_params = litellm.AzureOpenAIConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
elif custom_llm_provider == "bedrock":
optional_params = (
litellm.AmazonBedrockGlobalConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
)
elif custom_llm_provider == "vertex_ai":
optional_params = litellm.VertexAIConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
elif custom_llm_provider == "watsonx":
optional_params = litellm.IBMWatsonXAIConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
default_params = {
"functions": None,
"function_call": None,
@ -4590,7 +4825,7 @@ def get_optional_params(
and v != default_params[k]
)
}
optional_params = {}
## raise exception if function calling passed in for a provider that doesn't support it
if (
"functions" in non_default_params
@ -5268,7 +5503,8 @@ def get_optional_params(
optional_params["tools"] = tools
if tool_choice is not None:
optional_params["tool_choice"] = tool_choice
if response_format is not None:
optional_params["response_format"] = response_format
# check safe_mode, random_seed: https://docs.mistral.ai/api/#operation/createChatCompletion
safe_mode = passed_params.pop("safe_mode", None)
random_seed = passed_params.pop("random_seed", None)
@ -5280,6 +5516,7 @@ def get_optional_params(
optional_params["extra_body"] = (
extra_body # openai client supports `extra_body` param
)
elif custom_llm_provider == "groq":
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
@ -5360,6 +5597,49 @@ def get_optional_params(
optional_params["extra_body"] = (
extra_body # openai client supports `extra_body` param
)
elif custom_llm_provider == "watsonx":
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if max_tokens is not None:
optional_params["max_new_tokens"] = max_tokens
if stream:
optional_params["stream"] = stream
if temperature is not None:
optional_params["temperature"] = temperature
if top_p is not None:
optional_params["top_p"] = top_p
if frequency_penalty is not None:
optional_params["repetition_penalty"] = frequency_penalty
if seed is not None:
optional_params["random_seed"] = seed
if stop is not None:
optional_params["stop_sequences"] = stop
# WatsonX-only parameters
extra_body = {}
if "decoding_method" in passed_params:
extra_body["decoding_method"] = passed_params.pop("decoding_method")
if "min_tokens" in passed_params or "min_new_tokens" in passed_params:
extra_body["min_new_tokens"] = passed_params.pop(
"min_tokens", passed_params.pop("min_new_tokens")
)
if "top_k" in passed_params:
extra_body["top_k"] = passed_params.pop("top_k")
if "truncate_input_tokens" in passed_params:
extra_body["truncate_input_tokens"] = passed_params.pop(
"truncate_input_tokens"
)
if "length_penalty" in passed_params:
extra_body["length_penalty"] = passed_params.pop("length_penalty")
if "time_limit" in passed_params:
extra_body["time_limit"] = passed_params.pop("time_limit")
if "return_options" in passed_params:
extra_body["return_options"] = passed_params.pop("return_options")
optional_params["extra_body"] = (
extra_body # openai client supports `extra_body` param
)
else: # assume passing in params for openai/azure openai
print_verbose(
f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE - model={model}, custom_llm_provider={custom_llm_provider}"
@ -5762,6 +6042,8 @@ def get_supported_openai_params(model: str, custom_llm_provider: str):
"frequency_penalty",
"presence_penalty",
]
elif custom_llm_provider == "watsonx":
return litellm.IBMWatsonXAIConfig().get_supported_openai_params()
def get_formatted_prompt(
@ -5989,6 +6271,8 @@ def get_llm_provider(
model in litellm.bedrock_models or model in litellm.bedrock_embedding_models
):
custom_llm_provider = "bedrock"
elif model in litellm.watsonx_models:
custom_llm_provider = "watsonx"
# openai embeddings
elif model in litellm.open_ai_embedding_models:
custom_llm_provider = "openai"
@ -6453,7 +6737,7 @@ def validate_environment(model: Optional[str] = None) -> dict:
if "VERTEXAI_PROJECT" in os.environ and "VERTEXAI_LOCATION" in os.environ:
keys_in_environment = True
else:
missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_PROJECT"])
missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_LOCATION"])
elif custom_llm_provider == "huggingface":
if "HUGGINGFACE_API_KEY" in os.environ:
keys_in_environment = True
@ -6579,11 +6863,11 @@ def validate_environment(model: Optional[str] = None) -> dict:
def set_callbacks(callback_list, function_id=None):
global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger
global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger
try:
for callback in callback_list:
print_verbose(f"callback: {callback}")
print_verbose(f"init callback list: {callback}")
if callback == "sentry":
try:
import sentry_sdk
@ -6646,6 +6930,8 @@ def set_callbacks(callback_list, function_id=None):
promptLayerLogger = PromptLayerLogger()
elif callback == "langfuse":
langFuseLogger = LangFuseLogger()
elif callback == "openmeter":
openMeterLogger = OpenMeterLogger()
elif callback == "datadog":
dataDogLogger = DataDogLogger()
elif callback == "prometheus":
@ -6982,6 +7268,7 @@ def convert_to_model_response_object(
end_time=None,
hidden_params: Optional[dict] = None,
):
received_args = locals()
try:
if response_type == "completion" and (
model_response_object is None
@ -6993,6 +7280,11 @@ def convert_to_model_response_object(
# for returning cached responses, we need to yield a generator
return convert_to_streaming_response(response_object=response_object)
choice_list = []
assert response_object["choices"] is not None and isinstance(
response_object["choices"], Iterable
)
for idx, choice in enumerate(response_object["choices"]):
message = Message(
content=choice["message"].get("content", None),
@ -7036,6 +7328,7 @@ def convert_to_model_response_object(
model_response_object.model = response_object["model"]
if start_time is not None and end_time is not None:
if isinstance(start_time, type(end_time)):
model_response_object._response_ms = ( # type: ignore
end_time - start_time
).total_seconds() * 1000
@ -7113,7 +7406,9 @@ def convert_to_model_response_object(
model_response_object._hidden_params = hidden_params
return model_response_object
except Exception as e:
raise Exception(f"Invalid response object {traceback.format_exc()}")
raise Exception(
f"Invalid response object {traceback.format_exc()}\n\nreceived_args={received_args}"
)
def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call
@ -7940,7 +8235,10 @@ def exception_type(
llm_provider="vertex_ai",
response=original_exception.response,
)
elif "None Unknown Error." in error_str:
elif (
"None Unknown Error." in error_str
or "Content has no parts." in error_str
):
exception_mapping_worked = True
raise APIError(
message=f"VertexAIException - {error_str}",
@ -9393,9 +9691,14 @@ class CustomStreamWrapper:
is_finished = True
finish_reason = str_line.choices[0].finish_reason
if finish_reason == "content_filter":
if hasattr(str_line.choices[0], "content_filter_result"):
error_message = json.dumps(
str_line.choices[0].content_filter_result
)
else:
error_message = "Azure Response={}".format(
str(dict(str_line))
)
raise litellm.AzureOpenAIError(
status_code=400, message=error_message
)
@ -9683,6 +9986,39 @@ class CustomStreamWrapper:
"finish_reason": finish_reason,
}
def handle_watsonx_stream(self, chunk):
    """
    Convert one watsonx streaming chunk into the generic stream-chunk dict.

    Args:
        chunk: either an already-parsed dict, or a str/bytes line holding a
            JSON payload, optionally preceded by a leading "data:" marker.

    Returns:
        A dict with "text" and "is_finished". When the payload has at least
        one entry under "results", the dict also contains "finish_reason",
        "prompt_tokens" and "completion_tokens", all read from the first
        result. Text chunks that do not contain "generated_text" and
        payloads with no results yield an empty, unfinished chunk.

    Raises:
        ValueError: if chunk is not a dict, str or bytes.
        json.JSONDecodeError: if a text chunk containing "generated_text"
            is not valid JSON once the leading marker is removed.
    """
    if isinstance(chunk, dict):
        parsed_response = chunk
    elif isinstance(chunk, (str, bytes)):
        if isinstance(chunk, bytes):
            chunk = chunk.decode("utf-8")
        if "generated_text" not in chunk:
            # line carries no generated text -> nothing to emit yet
            return {"text": "", "is_finished": False}
        payload = chunk.strip()
        # Remove only the leading marker. A blanket str.replace("data: ", "")
        # would also delete "data: " wherever it occurs inside the model's
        # generated text and silently corrupt the output.
        if payload.startswith("data:"):
            payload = payload[len("data:"):].strip()
        parsed_response = json.loads(payload)
    else:
        print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
        raise ValueError(
            f"Unable to parse response. Original response: {chunk}"
        )

    results = parsed_response.get("results", [])
    if len(results) > 0:
        first_result = results[0]
        finish_reason = first_result.get("stop_reason")
        return {
            "text": first_result.get("generated_text", ""),
            # any stop_reason other than "not_finished" ends the stream
            "is_finished": finish_reason != "not_finished",
            "finish_reason": finish_reason,
            "prompt_tokens": first_result.get("input_token_count", None),
            "completion_tokens": first_result.get("generated_token_count", None),
        }
    return {"text": "", "is_finished": False}
def model_response_creator(self):
model_response = ModelResponse(stream=True, model=self.model)
if self.response_id is not None:
@ -9938,6 +10274,11 @@ class CustomStreamWrapper:
print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "watsonx":
response_obj = self.handle_watsonx_stream(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "text-completion-openai":
response_obj = self.handle_openai_text_completion_chunk(chunk)
completion_obj["content"] = response_obj["text"]
@ -10123,12 +10464,23 @@ class CustomStreamWrapper:
model_response.id = original_chunk.id
self.response_id = original_chunk.id
if len(original_chunk.choices) > 0:
choices = []
for idx, choice in enumerate(original_chunk.choices):
try:
delta = dict(original_chunk.choices[0].delta)
print_verbose(f"original delta: {delta}")
model_response.choices[0].delta = Delta(**delta)
if isinstance(choice, BaseModel):
try:
choice_json = choice.model_dump()
except Exception as e:
model_response.choices[0].delta = Delta()
choice_json = choice.dict()
choice_json.pop(
"finish_reason", None
) # for mistral etc. which return a value in their last chunk (not-openai compatible).
print_verbose(f"choice_json: {choice_json}")
choices.append(StreamingChoices(**choice_json))
except Exception as e:
choices.append(StreamingChoices())
print_verbose(f"choices in streaming: {choices}")
model_response.choices = choices
else:
return
model_response.system_fingerprint = (
@ -10173,11 +10525,11 @@ class CustomStreamWrapper:
)
self.holding_chunk = ""
# if delta is None
is_delta_empty = self.is_delta_empty(
_is_delta_empty = self.is_delta_empty(
delta=model_response.choices[0].delta
)
if is_delta_empty:
if _is_delta_empty:
# get any function call arguments
model_response.choices[0].finish_reason = map_finish_reason(
finish_reason=self.received_finish_reason

View file

@ -1418,6 +1418,123 @@
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-13b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-13b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-instruct-v0.2": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.000001,
"litellm_provider": "replicate",
"mode": "chat"
},
"openrouter/openai/gpt-3.5-turbo": {
"max_tokens": 4095,
"input_cost_per_token": 0.0000015,
@ -1455,6 +1572,17 @@
"litellm_provider": "openrouter",
"mode": "chat"
},
"openrouter/anthropic/claude-3-opus": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"litellm_provider": "openrouter",
"mode": "chat",
"supports_function_calling": true,
"tool_use_system_prompt_tokens": 395
},
"openrouter/google/palm-2-chat-bison": {
"max_tokens": 8000,
"input_cost_per_token": 0.0000005,
@ -2379,6 +2507,24 @@
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-8b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000006,
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-70b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000265,
"output_cost_per_token": 0.0000035,
"litellm_provider": "bedrock",
"mode": "chat"
},
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
"max_tokens": 77,
"max_input_tokens": 77,

View file

@ -61,14 +61,14 @@ model_list:
api_key: my-fake-key
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
stream_timeout: 0.001
rpm: 10
rpm: 100
- model_name: fake-openai-endpoint-3
litellm_params:
model: openai/my-fake-model-2
api_key: my-fake-key
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
stream_timeout: 0.001
rpm: 10
rpm: 100
- model_name: "*"
litellm_params:
model: openai/*

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.35.27"
version = "1.35.36"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -80,7 +80,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.35.27"
version = "1.35.36"
version_files = [
"pyproject.toml:^version"
]

Some files were not shown because too many files have changed in this diff Show more