Merge branch 'main' into main

Author: Lucca Zenóbio, 2024-05-02 09:46:34 -03:00, committed by GitHub
Commit 78303b79ee
124 changed files with 6716 additions and 1078 deletions


@ -40,7 +40,7 @@ jobs:
pip install "aioboto3==12.3.0" pip install "aioboto3==12.3.0"
pip install langchain pip install langchain
pip install lunary==0.2.5 pip install lunary==0.2.5
pip install "langfuse==2.7.3" pip install "langfuse==2.27.1"
pip install numpydoc pip install numpydoc
pip install traceloop-sdk==0.0.69 pip install traceloop-sdk==0.0.69
pip install openai pip install openai

.gitignore (1 changed line)

@ -51,3 +51,4 @@ loadtest_kub.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_super_secret_config.yaml
+litellm/proxy/_super_secret_config.yaml


@ -7,7 +7,7 @@ repos:
  rev: 7.0.0 # The version of flake8 to use
  hooks:
  - id: flake8
-    exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/integrations/|^litellm/proxy/tests/
+    exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/proxy/tests/
    additional_dependencies: [flake8-print]
    files: litellm/.*\.py
- repo: local


@ -227,6 +227,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ |
| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ |
+| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ |
| [xinference [Xorbits Inference]](https://docs.litellm.ai/docs/providers/xinference) | | | | | ✅ |

cookbook/liteLLM_IBM_Watsonx.ipynb (new file, 300 changed lines)

File diff suppressed because one or more lines are too long


@ -23,6 +23,14 @@ response = completion(model="gpt-3.5-turbo", messages=messages)
response = completion("command-nightly", messages) response = completion("command-nightly", messages)
``` ```
## JSON Logs
If you need to store the logs as JSON, just set `litellm.json_logs = True`.
We currently just log the raw POST request from litellm as JSON - [**See Code**].
[Share feedback here](https://github.com/BerriAI/litellm/issues)
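Below is a minimal sketch of how the flag could be combined with the verbose debug logging covered on this page (the model name and message are placeholders):

```python
import litellm
from litellm import completion

litellm.set_verbose = True  # turn on litellm's debug logging
litellm.json_logs = True    # log the raw POST request as JSON instead of plain text

response = completion(
    model="gpt-3.5-turbo",  # placeholder model
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```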
## Logger Function
But sometimes all you care about is seeing exactly what's getting sent to your api call and what's being returned - e.g. if the api call is failing, why is that happening? what are the exact params being set?


@ -213,3 +213,349 @@ asyncio.run(loadtest_fn())
```
## Multi-Instance TPM/RPM Load Test (Router)
Test if your defined tpm/rpm limits are respected across multiple instances of the Router object.
In our test:
- Max RPM per deployment = 100 requests per minute
- Max Throughput / min on router = 200 requests per minute (2 deployments)
- Load we'll send through router = 600 requests per minute
So we expect only ~200 of the 600 requests per minute to succeed.
:::info
If you don't want to call a real LLM API endpoint, you can set up a fake openai server. [See code](#extra---setup-fake-openai-server)
:::
### Code
Let's hit the router with 600 requests per minute.
Copy this script 👇. Save it as `test_loadtest_router.py` AND run it with `python3 test_loadtest_router.py`
```python
from litellm import Router
import litellm
litellm.suppress_debug_info = True
litellm.set_verbose = False
import logging
logging.basicConfig(level=logging.CRITICAL)
import os, random, uuid, time, asyncio
# Model list for OpenAI and Anthropic models
model_list = [
{
"model_name": "fake-openai-endpoint",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "my-fake-key",
"api_base": "http://0.0.0.0:8080",
"rpm": 100
},
},
{
"model_name": "fake-openai-endpoint",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "my-fake-key",
"api_base": "http://0.0.0.0:8081",
"rpm": 100
},
},
]
router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
async def router_completion_non_streaming():
try:
client: Router = random.sample([router_1, router_2], 1)[0] # randomly pick b/w clients
# print(f"client={client}")
response = await client.acompletion(
model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
)
return response
except Exception as e:
# print(e)
return None
async def loadtest_fn():
start = time.time()
n = 600 # Number of concurrent tasks
tasks = [router_completion_non_streaming() for _ in range(n)]
chat_completions = await asyncio.gather(*tasks)
successful_completions = [c for c in chat_completions if c is not None]
print(n, time.time() - start, len(successful_completions))
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
# Run the event loop to execute the async function
async def parent_fn():
for _ in range(10):
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
print(f"triggered new batch - {current_minute}")
await loadtest_fn()
await asyncio.sleep(10)
asyncio.run(parent_fn())
```
## Multi-Instance TPM/RPM Load Test (Proxy)
Test if your defined tpm/rpm limits are respected across multiple instances.
The quickest way to do this is by testing the [proxy](./proxy/quick_start.md). The proxy uses the [router](./routing.md) under the hood, so if you're using either of them, this test should work for you.
In our test:
- Max RPM per deployment = 100 requests per minute
- Max Throughput / min on proxy = 200 requests per minute (2 deployments)
- Load we'll send to proxy = 600 requests per minute
So we'll send 600 requests per minute, but expect only 200 requests per minute to succeed.
:::info
If you don't want to call a real LLM API endpoint, you can set up a fake openai server. [See code](#extra---setup-fake-openai-server)
:::
### 1. Setup config
```yaml
model_list:
- litellm_params:
api_base: http://0.0.0.0:8080
api_key: my-fake-key
model: openai/my-fake-model
rpm: 100
model_name: fake-openai-endpoint
- litellm_params:
api_base: http://0.0.0.0:8081
api_key: my-fake-key
model: openai/my-fake-model-2
rpm: 100
model_name: fake-openai-endpoint
router_settings:
num_retries: 0
enable_pre_call_checks: true
redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
redis_password: os.environ/REDIS_PASSWORD
redis_port: os.environ/REDIS_PORT
routing_strategy: usage-based-routing-v2
```
### 2. Start proxy 2 instances
**Instance 1**
```bash
litellm --config /path/to/config.yaml --port 4000
## RUNNING on http://0.0.0.0:4000
```
**Instance 2**
```bash
litellm --config /path/to/config.yaml --port 4001
## RUNNING on http://0.0.0.0:4001
```
### 3. Run Test
Let's hit the proxy with 600 requests per minute.
Copy this script 👇. Save it as `test_loadtest_proxy.py` AND run it with `python3 test_loadtest_proxy.py`
```python
from openai import AsyncOpenAI, AsyncAzureOpenAI
import random, uuid
import time, asyncio, litellm
# import logging
# logging.basicConfig(level=logging.DEBUG)
#### LITELLM PROXY ####
litellm_client = AsyncOpenAI(
api_key="sk-1234", # [CHANGE THIS]
base_url="http://0.0.0.0:4000"
)
litellm_client_2 = AsyncOpenAI(
api_key="sk-1234", # [CHANGE THIS]
base_url="http://0.0.0.0:4001"
)
async def proxy_completion_non_streaming():
try:
client = random.sample([litellm_client, litellm_client_2], 1)[0] # randomly pick b/w clients
# print(f"client={client}")
response = await client.chat.completions.create(
model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
)
return response
except Exception as e:
# print(e)
return None
async def loadtest_fn():
start = time.time()
n = 600 # Number of concurrent tasks
tasks = [proxy_completion_non_streaming() for _ in range(n)]
chat_completions = await asyncio.gather(*tasks)
successful_completions = [c for c in chat_completions if c is not None]
print(n, time.time() - start, len(successful_completions))
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
# Run the event loop to execute the async function
async def parent_fn():
for _ in range(10):
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
print(f"triggered new batch - {current_minute}")
await loadtest_fn()
await asyncio.sleep(10)
asyncio.run(parent_fn())
```
### Extra - Setup Fake OpenAI Server
Let's set up a fake openai server with an RPM limit of 100.
Let's call our file `fake_openai_server.py`.
```
# import sys, os
# sys.path.insert(
# 0, os.path.abspath("../")
# ) # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi import FastAPI, Request, HTTPException, UploadFile, File
import httpx, os, json
from openai import AsyncOpenAI
from typing import Optional
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
class ProxyException(Exception):
# NOTE: DO NOT MODIFY THIS
# This is used to map exactly to OPENAI Exceptions
def __init__(
self,
message: str,
type: str,
param: Optional[str],
code: Optional[int],
):
self.message = message
self.type = type
self.param = param
self.code = code
def to_dict(self) -> dict:
"""Converts the ProxyException instance to a dictionary."""
return {
"message": self.message,
"type": self.type,
"param": self.param,
"code": self.code,
}
limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
app.state.limiter = limiter
@app.exception_handler(RateLimitExceeded)
async def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
return JSONResponse(status_code=429,
content={"detail": "Rate Limited!"})
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
@limiter.limit("100/minute")
async def completion(request: Request):
# raise HTTPException(status_code=429, detail="Rate Limited!")
return {
"id": "chatcmpl-123",
"object": "chat.completion",
"created": 1677652288,
"model": None,
"system_fingerprint": "fp_44709d6fcb",
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": "\n\nHello there, how may I assist you today?",
},
"logprobs": None,
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": 9,
"completion_tokens": 12,
"total_tokens": 21
}
}
if __name__ == "__main__":
import socket
import uvicorn
port = 8080
while True:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
result = sock.connect_ex(('0.0.0.0', port))
if result != 0:
print(f"Port {port} is available, starting server...")
break
else:
port += 1
uvicorn.run(app, host="0.0.0.0", port=port)
```
```bash
python3 fake_openai_server.py
```


@ -331,49 +331,25 @@ response = litellm.completion(model="gpt-3.5-turbo", messages=messages, metadata
## Examples
### Custom Callback to track costs for Streaming + Non-Streaming
+By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async)
```python
+# Step 1. Write your custom callback function
def track_cost_callback(
    kwargs,                 # kwargs to completion
    completion_response,    # response from completion
    start_time, end_time    # start/end time
):
    try:
-        # init logging config
-        logging.basicConfig(
-                filename='cost.log',
-                level=logging.INFO,
-                format='%(asctime)s - %(message)s',
-                datefmt='%Y-%m-%d %H:%M:%S'
-        )
-        # check if it has collected an entire stream response
-        if "complete_streaming_response" in kwargs:
-            # for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
-            completion_response=kwargs["complete_streaming_response"]
-            input_text = kwargs["messages"]
-            output_text = completion_response["choices"][0]["message"]["content"]
-            response_cost = litellm.completion_cost(
-                model = kwargs["model"],
-                messages = input_text,
-                completion=output_text
-            )
-            print("streaming response_cost", response_cost)
-            logging.info(f"Model {kwargs['model']} Cost: ${response_cost:.8f}")
-        # for non streaming responses
-        else:
-            # we pass the completion_response obj
-            if kwargs["stream"] != True:
-                response_cost = litellm.completion_cost(completion_response=completion_response)
-                print("regular response_cost", response_cost)
-                logging.info(f"Model {completion_response.model} Cost: ${response_cost:.8f}")
+        response_cost = kwargs["response_cost"] # litellm calculates response cost for you
+        print("regular response_cost", response_cost)
    except:
        pass
-# Assign the custom callback function
+# Step 2. Assign the custom callback function
litellm.success_callback = [track_cost_callback]
+# Step 3. Make litellm.completion call
response = completion(
    model="gpt-3.5-turbo",
    messages=[


@ -121,10 +121,12 @@ response = completion(
    metadata={
        "generation_name": "ishaan-test-generation", # set langfuse Generation Name
        "generation_id": "gen-id22", # set langfuse Generation ID
-        "trace_id": "trace-id22", # set langfuse Trace ID
        "trace_user_id": "user-id2", # set langfuse Trace User ID
        "session_id": "session-1", # set langfuse Session ID
        "tags": ["tag1", "tag2"] # set langfuse Tags
+        "trace_id": "trace-id22", # set langfuse Trace ID
+        ### OR ###
+        "existing_trace_id": "trace-id22", # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
    },
)
@ -167,6 +169,9 @@ messages = [
chat(messages)
```
## Redacting Messages, Response Content from Langfuse Logging
Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged.
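As a minimal sketch, assuming the Langfuse callback is configured as shown earlier on this page (keys and model are placeholders):

```python
import os
import litellm
from litellm import completion

os.environ["LANGFUSE_PUBLIC_KEY"] = ""  # placeholder
os.environ["LANGFUSE_SECRET_KEY"] = ""  # placeholder

litellm.success_callback = ["langfuse"]
litellm.turn_off_message_logging = True  # redact message/response content; request metadata is still logged

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
)
```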
## Troubleshooting & Errors
### Data not getting logged to Langfuse ?


@ -0,0 +1,97 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# OpenMeter - Usage-Based Billing
[OpenMeter](https://openmeter.io/) is an Open Source Usage-Based Billing solution for AI/Cloud applications. It integrates with Stripe for easy billing.
<Image img={require('../../img/openmeter.png')} />
:::info
We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
join our [discord](https://discord.gg/wuPM9dRgDw)
:::
## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with OpenMeter
Get your OpenMeter API Key from https://openmeter.cloud/meters
```python
litellm.success_callback = ["openmeter"] # logs cost + usage of successful calls to openmeter
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
# pip install litellm
import litellm
import os
# from https://openmeter.cloud
os.environ["OPENMETER_API_ENDPOINT"] = ""
os.environ["OPENMETER_API_KEY"] = ""
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set openmeter as a callback, litellm will send the data to openmeter
litellm.success_callback = ["openmeter"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
success_callback: ["openmeter"] # 👈 KEY CHANGE
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
</Tabs>
<Image img={require('../../img/openmeter_img_2.png')} />


@ -40,5 +40,9 @@ response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content
print(response)
```
## Redacting Messages, Response Content from Sentry Logging
Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to sentry, but request metadata will still be logged.
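A minimal sketch, assuming Sentry is configured via `SENTRY_DSN` and registered as a callback as in the snippet above (the DSN and model are placeholders):

```python
import os
import litellm
from litellm import completion

os.environ["SENTRY_DSN"] = "your-sentry-dsn"  # placeholder

litellm.failure_callback = ["sentry"]    # assumption: Sentry is wired up as a failure callback here
litellm.turn_off_message_logging = True  # redact message/response content from what is sent to Sentry

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```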
[Let us know](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+) if you need any additional options from Sentry.


@ -53,6 +53,50 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported.
| open-mixtral-8x22b | `completion(model="mistral/open-mixtral-8x22b", messages)` |
## Function Calling
```python
from litellm import completion
import os
# set env
os.environ["MISTRAL_API_KEY"] = "your-api-key"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="mistral/mistral-large-latest",
messages=messages,
tools=tools,
tool_choice="auto",
)
# Add any assertions, here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
```
## Sample Usage - Embedding
```python
from litellm import embedding


@ -4,6 +4,13 @@ LiteLLM supports all models on VLLM.
🚀[Code Tutorial](https://github.com/BerriAI/litellm/blob/main/cookbook/VLLM_Model_Testing.ipynb)
:::info
To call a HOSTED VLLM Endpoint use [these docs](./openai_compatible.md)
:::
### Quick Start
```
pip install litellm vllm


@ -0,0 +1,284 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# IBM watsonx.ai
LiteLLM supports all IBM [watsonx.ai](https://watsonx.ai/) foundational models and embeddings.
## Environment Variables
```python
os.environ["WATSONX_URL"] = "" # (required) Base URL of your WatsonX instance
# (required) either one of the following:
os.environ["WATSONX_APIKEY"] = "" # IBM cloud API key
os.environ["WATSONX_TOKEN"] = "" # IAM auth token
# optional - can also be passed as params to completion() or embedding()
os.environ["WATSONX_PROJECT_ID"] = "" # Project ID of your WatsonX instance
os.environ["WATSONX_DEPLOYMENT_SPACE_ID"] = "" # ID of your deployment space to use deployed models
```
See [here](https://cloud.ibm.com/apidocs/watsonx-ai#api-authentication) for more information on how to get an access token to authenticate to watsonx.ai.
## Usage
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_IBM_Watsonx.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
```python
import os
from litellm import completion
os.environ["WATSONX_URL"] = ""
os.environ["WATSONX_APIKEY"] = ""
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
project_id="<my-project-id>" # or pass with os.environ["WATSONX_PROJECT_ID"]
)
response = completion(
model="watsonx/meta-llama/llama-3-8b-instruct",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
project_id="<my-project-id>"
)
```
## Usage - Streaming
```python
import os
from litellm import completion
os.environ["WATSONX_URL"] = ""
os.environ["WATSONX_APIKEY"] = ""
os.environ["WATSONX_PROJECT_ID"] = ""
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
stream=True
)
for chunk in response:
print(chunk)
```
#### Example Streaming Output Chunk
```json
{
"choices": [
{
"finish_reason": null,
"index": 0,
"delta": {
"content": "I don't have a favorite color, but I do like the color blue. What's your favorite color?"
}
}
],
"created": null,
"model": "watsonx/ibm/granite-13b-chat-v2",
"usage": {
"prompt_tokens": null,
"completion_tokens": null,
"total_tokens": null
}
}
```
## Usage - Models in deployment spaces
Models that have been deployed to a deployment space (e.g.: tuned models) can be called using the `deployment/<deployment_id>` format (where `<deployment_id>` is the ID of the deployed model in your deployment space).
The ID of your deployment space must also be set in the environment variable `WATSONX_DEPLOYMENT_SPACE_ID` or passed to the function as `space_id=<deployment_space_id>`.
```python
import litellm
response = litellm.completion(
model="watsonx/deployment/<deployment_id>",
messages=[{"content": "Hello, how are you?", "role": "user"}],
space_id="<deployment_space_id>"
)
```
## Usage - Embeddings
LiteLLM also supports making requests to IBM watsonx.ai embedding models. The credential needed for this is the same as for completion.
```python
from litellm import embedding
response = embedding(
model="watsonx/ibm/slate-30m-english-rtrvr",
input=["What is the capital of France?"],
project_id="<my-project-id>"
)
print(response)
# EmbeddingResponse(model='ibm/slate-30m-english-rtrvr', data=[{'object': 'embedding', 'index': 0, 'embedding': [-0.037463713, -0.02141933, -0.02851813, 0.015519324, ..., -0.0021367231, -0.01704561, -0.001425816, 0.0035238306]}], object='list', usage=Usage(prompt_tokens=8, total_tokens=8))
```
## OpenAI Proxy Usage
Here's how to call IBM watsonx.ai with the LiteLLM Proxy Server
### 1. Save keys in your environment
```bash
export WATSONX_URL=""
export WATSONX_APIKEY=""
export WATSONX_PROJECT_ID=""
```
### 2. Start the proxy
<Tabs>
<TabItem value="cli" label="CLI">
```bash
$ litellm --model watsonx/meta-llama/llama-3-8b-instruct
# Server running on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="config" label="config.yaml">
```yaml
model_list:
- model_name: llama-3-8b
litellm_params:
# all params accepted by litellm.completion()
model: watsonx/meta-llama/llama-3-8b-instruct
api_key: "os.environ/WATSONX_API_KEY" # does os.getenv("WATSONX_API_KEY")
```
</TabItem>
</Tabs>
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "llama-3-8b",
"messages": [
{
"role": "user",
"content": "what is your favorite colour?"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="llama-3-8b", messages=[
{
"role": "user",
"content": "what is your favorite colour?"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "llama-3-8b",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Authentication
### Passing credentials as parameters
You can also pass the credentials as parameters to the completion and embedding functions.
```python
import os
from litellm import completion
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "What is your favorite color?","role": "user"}],
url="",
api_key="",
project_id=""
)
```
## Supported IBM watsonx.ai Models
Here are some examples of models available in IBM watsonx.ai that you can use with LiteLLM:
| Model Name | Command |
| ---------- | --------- |
| Flan T5 XXL | `completion(model=watsonx/google/flan-t5-xxl, messages=messages)` |
| Flan Ul2 | `completion(model=watsonx/google/flan-ul2, messages=messages)` |
| Mt0 XXL | `completion(model=watsonx/bigscience/mt0-xxl, messages=messages)` |
| Gpt Neox | `completion(model=watsonx/eleutherai/gpt-neox-20b, messages=messages)` |
| Mpt 7B Instruct2 | `completion(model=watsonx/ibm/mpt-7b-instruct2, messages=messages)` |
| Starcoder | `completion(model=watsonx/bigcode/starcoder, messages=messages)` |
| Llama 2 70B Chat | `completion(model=watsonx/meta-llama/llama-2-70b-chat, messages=messages)` |
| Llama 2 13B Chat | `completion(model=watsonx/meta-llama/llama-2-13b-chat, messages=messages)` |
| Granite 13B Instruct | `completion(model=watsonx/ibm/granite-13b-instruct-v1, messages=messages)` |
| Granite 13B Chat | `completion(model=watsonx/ibm/granite-13b-chat-v1, messages=messages)` |
| Flan T5 XL | `completion(model=watsonx/google/flan-t5-xl, messages=messages)` |
| Granite 13B Chat V2 | `completion(model=watsonx/ibm/granite-13b-chat-v2, messages=messages)` |
| Granite 13B Instruct V2 | `completion(model=watsonx/ibm/granite-13b-instruct-v2, messages=messages)` |
| Elyza Japanese Llama 2 7B Instruct | `completion(model=watsonx/elyza/elyza-japanese-llama-2-7b-instruct, messages=messages)` |
| Mixtral 8X7B Instruct V01 Q | `completion(model=watsonx/ibm-mistralai/mixtral-8x7b-instruct-v01-q, messages=messages)` |
For a list of all available models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx&locale=en&audience=wdp).
## Supported IBM watsonx.ai Embedding Models
| Model Name | Function Call |
|----------------------|---------------------------------------------|
| Slate 30m | `embedding(model="watsonx/ibm/slate-30m-english-rtrvr", input=input)` |
| Slate 125m | `embedding(model="watsonx/ibm/slate-125m-english-rtrvr", input=input)` |
For a list of all available embedding models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx).


@ -1,13 +1,13 @@
-# Slack Alerting
+# 🚨 Alerting
Get alerts for:
-- hanging LLM api calls
-- failed LLM api calls
-- slow LLM api calls
-- budget Tracking per key/user:
+- Hanging LLM api calls
+- Failed LLM api calls
+- Slow LLM api calls
+- Budget Tracking per key/user:
  - When a User/Key crosses their Budget
  - When a User/Key is 15% away from crossing their Budget
-- failed db read/writes
+- Failed db read/writes
## Quick Start


@ -62,9 +62,11 @@ model_list:
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
  drop_params: True
+  success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM Logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env
general_settings:
  master_key: sk-1234 # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
+  alerting: ["slack"] # [OPTIONAL] If you want Slack Alerts for Hanging LLM requests, Slow llm responses, Budget Alerts. Make sure to set `SLACK_WEBHOOK_URL` in your env
```
:::info


@ -11,40 +11,37 @@ You can find the Dockerfile to build litellm proxy [here](https://github.com/Ber
<TabItem value="basic" label="Basic"> <TabItem value="basic" label="Basic">
**Step 1. Create a file called `litellm_config.yaml`** ### Step 1. CREATE config.yaml
Example `litellm_config.yaml` (the `os.environ/` prefix means litellm will read `AZURE_API_BASE` from the env) Example `litellm_config.yaml`
```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/<your-azure-model-deployment>
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
```
**Step 2. Run litellm docker image** ```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/<your-azure-model-deployment>
api_base: os.environ/AZURE_API_BASE # runs os.getenv("AZURE_API_BASE")
api_key: os.environ/AZURE_API_KEY # runs os.getenv("AZURE_API_KEY")
api_version: "2023-07-01-preview"
```
See the latest available ghcr docker image here:
https://github.com/berriai/litellm/pkgs/container/litellm
Your litellm config.yaml should be called `litellm_config.yaml` in the directory you run this command.
The `-v` command will mount that file
Pass `AZURE_API_KEY` and `AZURE_API_BASE` since we set them in step 1 ### Step 2. RUN Docker Image
```shell ```shell
docker run \ docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \ -v $(pwd)/litellm_config.yaml:/app/config.yaml \
-e AZURE_API_KEY=d6*********** \ -e AZURE_API_KEY=d6*********** \
-e AZURE_API_BASE=https://openai-***********/ \ -e AZURE_API_BASE=https://openai-***********/ \
-p 4000:4000 \ -p 4000:4000 \
ghcr.io/berriai/litellm:main-latest \ ghcr.io/berriai/litellm:main-latest \
--config /app/config.yaml --detailed_debug --config /app/config.yaml --detailed_debug
``` ```
**Step 3. Send a Test Request** Get Latest Image 👉 [here](https://github.com/berriai/litellm/pkgs/container/litellm)
### Step 3. TEST Request
Pass `model=azure-gpt-3.5` this was set on step 1 Pass `model=azure-gpt-3.5` this was set on step 1
@ -231,13 +228,16 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
| Docs | When to Use |
| --- | --- |
| [Quick Start](#quick-start) | call 100+ LLMs + Load Balancing |
-| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend |
+| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend (Note: When deploying with a database, `DATABASE_URL` and `LITELLM_MASTER_KEY` are required in your env) |
| [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
| [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |
## Deploy with Database
### Docker, Kubernetes, Helm Chart
+Requirements:
+- Need a postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc). Set `DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>` in your env
+- Set a `LITELLM_MASTER_KEY`, this is your Proxy Admin key - you can use this to create other keys (🚨 must start with `sk-`)
<Tabs>
@ -252,6 +252,8 @@ docker pull ghcr.io/berriai/litellm-database:main-latest
```shell
docker run \
    -v $(pwd)/litellm_config.yaml:/app/config.yaml \
+    -e LITELLM_MASTER_KEY=sk-1234 \
+    -e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
    -e AZURE_API_KEY=d6*********** \
    -e AZURE_API_BASE=https://openai-***********/ \
    -p 4000:4000 \
@ -267,26 +269,63 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
#### Step 1. Create deployment.yaml
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: litellm-deployment
spec:
-  replicas: 1
+  replicas: 3
  selector:
    matchLabels:
      app: litellm
  template:
    metadata:
      labels:
        app: litellm
    spec:
      containers:
        - name: litellm-container
-          image: ghcr.io/berriai/litellm-database:main-latest
-          env:
-            - name: DATABASE_URL
-              value: postgresql://<user>:<password>@<host>:<port>/<dbname>
+          image: ghcr.io/berriai/litellm:main-latest
+          imagePullPolicy: Always
+          env:
+            - name: AZURE_API_KEY
+              value: "d6******"
+            - name: AZURE_API_BASE
+              value: "https://ope******"
+            - name: LITELLM_MASTER_KEY
+              value: "sk-1234"
+            - name: DATABASE_URL
+              value: "po**********"
+          args:
+            - "--config"
+            - "/app/proxy_config.yaml" # Update the path to mount the config file
+          volumeMounts: # Define volume mount for proxy_config.yaml
+            - name: config-volume
+              mountPath: /app
+              readOnly: true
+          livenessProbe:
+            httpGet:
+              path: /health/liveliness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health/readiness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+      volumes: # Define volume to mount proxy_config.yaml
+        - name: config-volume
+          configMap:
+            name: litellm-config
```
```bash ```bash


@ -10,6 +10,7 @@ Log Proxy Input, Output, Exceptions using Custom Callbacks, Langfuse, OpenTeleme
- [Async Custom Callbacks](#custom-callback-class-async)
- [Async Custom Callback APIs](#custom-callback-apis-async)
- [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
+- [Logging to OpenMeter](#logging-proxy-cost--usage---openmeter)
- [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
- [Logging to DynamoDB](#logging-proxy-inputoutput---dynamodb)
@ -401,7 +402,7 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
## Logging Proxy Input/Output - Langfuse
-We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successful LLM calls to langfuse
+We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successful LLM calls to langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment
**Step 1** Install langfuse
@ -419,7 +420,13 @@ litellm_settings:
success_callback: ["langfuse"] success_callback: ["langfuse"]
``` ```
**Step 3**: Start the proxy, make a test request **Step 3**: Set required env variables for logging to langfuse
```shell
export LANGFUSE_PUBLIC_KEY="pk_kk"
export LANGFUSE_SECRET_KEY="sk_ss
```
**Step 4**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
@ -569,6 +576,75 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \
All requests made with these keys will log data to their team-specific logging.
### Redacting Messages, Response Content from Langfuse Logging
Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged.
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["langfuse"]
turn_off_message_logging: True
```
## Logging Proxy Cost + Usage - OpenMeter
Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md)
**Required Env Variables**
```bash
# from https://openmeter.cloud
export OPENMETER_API_ENDPOINT="" # defaults to https://openmeter.cloud
export OPENMETER_API_KEY=""
```
### Quick Start
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
success_callback: ["openmeter"] # 👈 KEY CHANGE
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
<Image img={require('../../img/openmeter_img_2.png')} />
## Logging Proxy Input/Output - DataDog
We will use the `--config` to set `litellm.success_callback = ["datadog"]` this will log all successful LLM calls to DataDog


@ -95,7 +95,7 @@ print(response)
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
- `router.aimage_generation()` - async image generation calls
-### Advanced - Routing Strategies
+## Advanced - Routing Strategies
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
Router provides 4 strategies for routing your calls across multiple deployments:
@ -278,6 +278,36 @@ router_settings:
routing_strategy_args: {"ttl": 10}
```
### Set Lowest Latency Buffer
Set a buffer within which deployments are candidates for making calls to.
E.g.
if you have 5 deployments
```
https://litellm-prod-1.openai.azure.com/: 0.07s
https://litellm-prod-2.openai.azure.com/: 0.1s
https://litellm-prod-3.openai.azure.com/: 0.1s
https://litellm-prod-4.openai.azure.com/: 0.1s
https://litellm-prod-5.openai.azure.com/: 4.66s
```
To prevent initially overloading `prod-1` with all requests, we can set a buffer of 50% to consider deployments `prod-2, prod-3, prod-4`.
**In Router**
```python
router = Router(..., routing_strategy_args={"lowest_latency_buffer": 0.5})
```
**In Proxy**
```yaml
router_settings:
routing_strategy_args: {"lowest_latency_buffer": 0.5}
```
</TabItem>
<TabItem value="simple-shuffle" label="(Default) Weighted Pick (Async)">
@ -443,6 +473,35 @@ asyncio.run(router_acompletion())
## Basic Reliability
### Max Parallel Requests (ASYNC)
Used in semaphore for async requests on router. Limit the max concurrent calls made to a deployment. Useful in high-traffic scenarios.
If tpm/rpm is set, and no max parallel request limit given, we use the RPM or calculated RPM (tpm/1000/6) as the max parallel request limit.
```python
from litellm import Router
model_list = [{
"model_name": "gpt-4",
"litellm_params": {
"model": "azure/gpt-4",
...
"max_parallel_requests": 10 # 👈 SET PER DEPLOYMENT
}
}]
### OR ###
router = Router(model_list=model_list, default_max_parallel_requests=20) # 👈 SET DEFAULT MAX PARALLEL REQUESTS
# deployment max parallel requests > default max parallel requests
```
[**See Code**](https://github.com/BerriAI/litellm/blob/a978f2d8813c04dad34802cb95e0a0e35a3324bc/litellm/utils.py#L5605)
### Timeouts
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.


@ -5,6 +5,9 @@ LiteLLM allows you to specify the following:
* API Base
* API Version
* API Type
* Project
* Location
* Token
Useful Helper functions:
* [`check_valid_key()`](#check_valid_key)
@ -43,6 +46,24 @@ os.environ['AZURE_API_TYPE'] = "azure" # [OPTIONAL]
os.environ['OPENAI_API_BASE'] = "https://openai-gpt-4-test2-v-12.openai.azure.com/"
```
### Setting Project, Location, Token
For cloud providers:
- Azure
- Bedrock
- GCP
- Watson AI
you might need to set additional parameters. LiteLLM provides a common set of params that we map across all providers.
| | LiteLLM param | Watson | Vertex AI | Azure | Bedrock |
|------|--------------|--------------|--------------|--------------|--------------|
| Project | project | watsonx_project | vertex_project | n/a | n/a |
| Region | region_name | watsonx_region_name | vertex_location | n/a | aws_region_name |
| Token | token | watsonx_token or token | n/a | azure_ad_token | n/a |
If you want, you can call them by their provider-specific params as well.
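As a rough illustration, assuming the common params are accepted directly as `completion()` kwargs and mapped per the table above (the project, region, and token values are placeholders):

```python
from litellm import completion

# project -> watsonx_project, region_name -> watsonx_region_name, token -> watsonx_token (per the mapping table above)
response = completion(
    model="watsonx/ibm/granite-13b-chat-v2",
    messages=[{"role": "user", "content": "Hello!"}],
    project="my-project-id",   # placeholder
    region_name="us-south",    # placeholder
    token="my-iam-token",      # placeholder
)
```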
## litellm variables
### litellm.api_key

Two binary image files added (not shown): 1.5 MiB and 533 KiB.


@ -43,6 +43,12 @@ const sidebars = {
"proxy/user_keys", "proxy/user_keys",
"proxy/enterprise", "proxy/enterprise",
"proxy/virtual_keys", "proxy/virtual_keys",
"proxy/alerting",
{
type: "category",
label: "Logging",
items: ["proxy/logging", "proxy/streaming_logging"],
},
"proxy/team_based_routing", "proxy/team_based_routing",
"proxy/ui", "proxy/ui",
"proxy/cost_tracking", "proxy/cost_tracking",
@ -58,11 +64,6 @@ const sidebars = {
"proxy/pii_masking", "proxy/pii_masking",
"proxy/prompt_injection", "proxy/prompt_injection",
"proxy/caching", "proxy/caching",
{
type: "category",
label: "Logging, Alerting",
items: ["proxy/logging", "proxy/alerting", "proxy/streaming_logging"],
},
"proxy/prometheus", "proxy/prometheus",
"proxy/call_hooks", "proxy/call_hooks",
"proxy/rules", "proxy/rules",
@ -148,6 +149,7 @@ const sidebars = {
"providers/openrouter", "providers/openrouter",
"providers/custom_openai_proxy", "providers/custom_openai_proxy",
"providers/petals", "providers/petals",
"providers/watsonx",
], ],
}, },
"proxy/custom_pricing", "proxy/custom_pricing",
@ -168,6 +170,7 @@ const sidebars = {
"observability/custom_callback", "observability/custom_callback",
"observability/langfuse_integration", "observability/langfuse_integration",
"observability/sentry", "observability/sentry",
"observability/openmeter",
"observability/promptlayer_integration", "observability/promptlayer_integration",
"observability/wandb_integration", "observability/wandb_integration",
"observability/langsmith_integration", "observability/langsmith_integration",
@ -175,7 +178,6 @@ const sidebars = {
"observability/traceloop_integration", "observability/traceloop_integration",
"observability/athina_integration", "observability/athina_integration",
"observability/lunary_integration", "observability/lunary_integration",
"observability/athina_integration",
"observability/helicone_integration", "observability/helicone_integration",
"observability/supabase_integration", "observability/supabase_integration",
`observability/telemetry`, `observability/telemetry`,


@ -6,7 +6,7 @@
"": { "": {
"dependencies": { "dependencies": {
"@hono/node-server": "^1.9.0", "@hono/node-server": "^1.9.0",
"hono": "^4.1.5" "hono": "^4.2.7"
}, },
"devDependencies": { "devDependencies": {
"@types/node": "^20.11.17", "@types/node": "^20.11.17",
@ -463,9 +463,9 @@
} }
}, },
"node_modules/hono": { "node_modules/hono": {
"version": "4.1.5", "version": "4.2.7",
"resolved": "https://registry.npmjs.org/hono/-/hono-4.1.5.tgz", "resolved": "https://registry.npmjs.org/hono/-/hono-4.2.7.tgz",
"integrity": "sha512-3ChJiIoeCxvkt6vnkxJagplrt1YZg3NyNob7ssVeK2PUqEINp4q1F94HzFnvY9QE8asVmbW5kkTDlyWylfg2vg==", "integrity": "sha512-k1xHi86tJnRIVvqhFMBDGFKJ8r5O+bEsT4P59ZK59r0F300Xd910/r237inVfuT/VmE86RQQffX4OYNda6dLXw==",
"engines": { "engines": {
"node": ">=16.0.0" "node": ">=16.0.0"
} }


@ -4,7 +4,7 @@
},
"dependencies": {
  "@hono/node-server": "^1.9.0",
-  "hono": "^4.1.5"
+  "hono": "^4.2.7"
},
"devDependencies": {
  "@types/node": "^20.11.17",


@ -2,7 +2,7 @@
import threading, requests, os
from typing import Callable, List, Optional, Dict, Union, Any, Literal
from litellm.caching import Cache
-from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
+from litellm._logging import set_verbose, _turn_on_debug, verbose_logger, json_logs
from litellm.proxy._types import (
    KeyManagementSystem,
    KeyManagementSettings,
@ -22,6 +22,7 @@ success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
service_callback: List[Union[str, Callable]] = []
callbacks: List[Callable] = []
+_custom_logger_compatible_callbacks: list = ["openmeter"]
_langfuse_default_tags: Optional[
    List[
        Literal[
@ -45,6 +46,7 @@ _async_failure_callback: List[Callable] = (
) # internal variable - async custom callbacks are routed here.
pre_call_rules: List[Callable] = []
post_call_rules: List[Callable] = []
+turn_off_message_logging: Optional[bool] = False
## end of callbacks #############
email: Optional[str] = (
@ -58,6 +60,7 @@ max_tokens = 256 # OpenAI Defaults
drop_params = False
modify_params = False
retry = True
+### AUTH ###
api_key: Optional[str] = None
openai_key: Optional[str] = None
azure_key: Optional[str] = None
@ -76,7 +79,12 @@ cloudflare_api_key: Optional[str] = None
baseten_key: Optional[str] = None
aleph_alpha_key: Optional[str] = None
nlp_cloud_key: Optional[str] = None
+common_cloud_provider_auth_params: dict = {
+    "params": ["project", "region_name", "token"],
+    "providers": ["vertex_ai", "bedrock", "watsonx", "azure"],
+}
use_client: bool = False
+ssl_verify: bool = True
disable_streaming_logging: bool = False
### GUARDRAILS ###
llamaguard_model_name: Optional[str] = None
@ -298,6 +306,7 @@ aleph_alpha_models: List = []
bedrock_models: List = []
deepinfra_models: List = []
perplexity_models: List = []
+watsonx_models: List = []
for key, value in model_cost.items():
    if value.get("litellm_provider") == "openai":
        open_ai_chat_completion_models.append(key)
@ -342,6 +351,8 @@ for key, value in model_cost.items():
        deepinfra_models.append(key)
    elif value.get("litellm_provider") == "perplexity":
        perplexity_models.append(key)
+    elif value.get("litellm_provider") == "watsonx":
+        watsonx_models.append(key)
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
openai_compatible_endpoints: List = [
@ -478,6 +489,7 @@ model_list = (
    + perplexity_models
    + maritalk_models
    + vertex_language_models
+    + watsonx_models
)
provider_list: List = [
@ -516,6 +528,7 @@ provider_list: List = [
    "cloudflare",
    "xinference",
    "fireworks_ai",
+    "watsonx",
    "custom", # custom apis
]
@ -537,6 +550,7 @@ models_by_provider: dict = {
    "deepinfra": deepinfra_models,
    "perplexity": perplexity_models,
    "maritalk": maritalk_models,
+    "watsonx": watsonx_models,
}
# mapping for those models which have larger equivalents
@ -647,9 +661,11 @@ from .llms.bedrock import (
    AmazonLlamaConfig,
    AmazonStabilityConfig,
    AmazonMistralConfig,
+    AmazonBedrockGlobalConfig,
)
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
from .llms.azure import AzureOpenAIConfig, AzureOpenAIError
+from .llms.watsonx import IBMWatsonXAIConfig
from .main import * # type: ignore
from .integrations import *
from .exceptions import (


@ -1,7 +1,7 @@
import logging
set_verbose = False
+json_logs = False
# Create a handler for the logger (you may need to adapt this based on your needs)
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)


@ -12,9 +12,12 @@ import litellm
class LangFuseLogger:
    # Class variables or attributes
-    def __init__(self, langfuse_public_key=None, langfuse_secret=None):
+    def __init__(
+        self, langfuse_public_key=None, langfuse_secret=None, flush_interval=1
+    ):
        try:
            from langfuse import Langfuse
+            import langfuse
        except Exception as e:
            raise Exception(
                f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\n{traceback.format_exc()}\033[0m"
@ -25,14 +28,20 @@ class LangFuseLogger:
        self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
        self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
        self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
-        self.Langfuse = Langfuse(
-            public_key=self.public_key,
-            secret_key=self.secret_key,
-            host=self.langfuse_host,
-            release=self.langfuse_release,
-            debug=self.langfuse_debug,
-            flush_interval=1,  # flush interval in seconds
-        )
+        parameters = {
+            "public_key": self.public_key,
+            "secret_key": self.secret_key,
+            "host": self.langfuse_host,
+            "release": self.langfuse_release,
+            "debug": self.langfuse_debug,
+            "flush_interval": flush_interval,  # flush interval in seconds
+        }
+        if Version(langfuse.version.__version__) >= Version("2.6.0"):
+            parameters["sdk_integration"] = "litellm"
+        self.Langfuse = Langfuse(**parameters)
        # set the current langfuse project id in the environ
        # this is used by Alerting to link to the correct project
@ -77,13 +86,14 @@ class LangFuseLogger:
        print_verbose,
        level="DEFAULT",
        status_message=None,
-    ):
+    ) -> dict:
        # Method definition
        try:
            print_verbose(
                f"Langfuse Logging - Enters logging function for model {kwargs}"
            )
            litellm_params = kwargs.get("litellm_params", {})
            metadata = (
                litellm_params.get("metadata", {}) or {}
@ -137,8 +147,10 @@ class LangFuseLogger:
                input = prompt
                output = response_obj["data"]
            print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
+            trace_id = None
+            generation_id = None
            if self._is_langfuse_v2():
-                self._log_langfuse_v2(
+                trace_id, generation_id = self._log_langfuse_v2(
                    user_id,
                    metadata,
                    litellm_params,
@ -168,10 +180,12 @@ class LangFuseLogger:
                f"Langfuse Layer Logging - final response object: {response_obj}"
            )
            verbose_logger.info(f"Langfuse Layer Logging - logging success")
+            return {"trace_id": trace_id, "generation_id": generation_id}
        except:
            traceback.print_exc()
            verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
-            pass
+            return {"trace_id": None, "generation_id": None}
    async def _async_log_event(
        self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
@ -243,7 +257,7 @@ class LangFuseLogger:
        response_obj,
        level,
        print_verbose,
-    ):
+    ) -> tuple:
        import langfuse
        try:
@ -262,22 +276,28 @@ class LangFuseLogger:
                tags = metadata_tags
            trace_name = metadata.get("trace_name", None)
-            if trace_name is None:
+            trace_id = metadata.get("trace_id", None)
+            existing_trace_id = metadata.get("existing_trace_id", None)
+            if trace_name is None and existing_trace_id is None:
                # just log `litellm-{call_type}` as the trace name
+                ## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
                trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
-            trace_params = {
-                "name": trace_name,
-                "input": input,
-                "user_id": metadata.get("trace_user_id", user_id),
-                "id": metadata.get("trace_id", None),
-                "session_id": metadata.get("session_id", None),
-            }
+            if existing_trace_id is not None:
+                trace_params = {"id": existing_trace_id}
+            else:  # don't overwrite an existing trace
+                trace_params = {
+                    "name": trace_name,
+                    "input": input,
+                    "user_id": metadata.get("trace_user_id", user_id),
+                    "id": trace_id,
+                    "session_id": metadata.get("session_id", None),
+                }
            if level == "ERROR":
                trace_params["status_message"] = output
            else:
                trace_params["output"] = output
            cost = kwargs.get("response_cost", None)
            print_verbose(f"trace: {cost}")
@ -335,7 +355,8 @@ class LangFuseLogger:
                kwargs["cache_hit"] = False
            tags.append(f"cache_hit:{kwargs['cache_hit']}")
            clean_metadata["cache_hit"] = kwargs["cache_hit"]
-            trace_params.update({"tags": tags})
+            if existing_trace_id is None:
+                trace_params.update({"tags": tags})
            proxy_server_request = litellm_params.get("proxy_server_request", None)
            if proxy_server_request:
@ -355,8 +376,6 @@ class LangFuseLogger:
                "headers": clean_headers,
            }
-            print_verbose(f"trace_params: {trace_params}")
            trace = self.Langfuse.trace(**trace_params)
            generation_id = None
@ -373,7 +392,11 @@ class LangFuseLogger:
            # just log `litellm-{call_type}` as the generation name
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}" generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
system_fingerprint = response_obj.get("system_fingerprint", None) if response_obj is not None and "system_fingerprint" in response_obj:
system_fingerprint = response_obj.get("system_fingerprint", None)
else:
system_fingerprint = None
if system_fingerprint is not None: if system_fingerprint is not None:
optional_params["system_fingerprint"] = system_fingerprint optional_params["system_fingerprint"] = system_fingerprint
@ -402,8 +425,9 @@ class LangFuseLogger:
"completion_start_time", None "completion_start_time", None
) )
print_verbose(f"generation_params: {generation_params}") generation_client = trace.generation(**generation_params)
trace.generation(**generation_params) return generation_client.trace_id, generation_id
except Exception as e: except Exception as e:
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}") verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
return None, None
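
Note on the `existing_trace_id` handling added above: callers can attach a generation to a Langfuse trace they created themselves, without that trace's name or tags being overwritten. A minimal sketch of how this is passed in from the client side (the completion call and ids are illustrative; the metadata keys are the ones read by `_log_langfuse_v2`):

import litellm

litellm.success_callback = ["langfuse"]  # assumes LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY are set

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    metadata={
        # attach this generation to a trace created elsewhere;
        # the existing trace's name and tags are left untouched
        "existing_trace_id": "my-precreated-trace-id",
    },
)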

View file

@ -73,10 +73,6 @@ class LangsmithLogger:
elif type(value) != dict and is_serializable(value=value): elif type(value) != dict and is_serializable(value=value):
new_kwargs[key] = value new_kwargs[key] = value
print(f"type of response: {type(response_obj)}")
for k, v in new_kwargs.items():
print(f"key={k}, type of arg: {type(v)}, value={v}")
if isinstance(response_obj, BaseModel): if isinstance(response_obj, BaseModel):
try: try:
response_obj = response_obj.model_dump() response_obj = response_obj.model_dump()

View file

@ -0,0 +1,123 @@
# What is this?
## On Success events log cost to OpenMeter - https://github.com/BerriAI/litellm/issues/1268
import dotenv, os, json
import requests
import litellm
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
import uuid
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
class OpenMeterLogger(CustomLogger):
def __init__(self) -> None:
super().__init__()
self.validate_environment()
self.async_http_handler = AsyncHTTPHandler()
self.sync_http_handler = HTTPHandler()
def validate_environment(self):
"""
Expects
OPENMETER_API_KEY
in the environment (OPENMETER_API_ENDPOINT is optional; it defaults to https://openmeter.cloud)
"""
missing_keys = []
if litellm.get_secret("OPENMETER_API_KEY", None) is None:
missing_keys.append("OPENMETER_API_KEY")
if len(missing_keys) > 0:
raise Exception("Missing keys={} in environment.".format(missing_keys))
def _common_logic(self, kwargs: dict, response_obj):
call_id = response_obj.get("id", kwargs.get("litellm_call_id"))
dt = get_utc_datetime().isoformat()
cost = kwargs.get("response_cost", None)
model = kwargs.get("model")
usage = {}
if (
isinstance(response_obj, litellm.ModelResponse)
or isinstance(response_obj, litellm.EmbeddingResponse)
) and hasattr(response_obj, "usage"):
usage = {
"prompt_tokens": response_obj["usage"].get("prompt_tokens", 0),
"completion_tokens": response_obj["usage"].get("completion_tokens", 0),
"total_tokens": response_obj["usage"].get("total_tokens"),
}
return {
"specversion": "1.0",
"type": os.getenv("OPENMETER_EVENT_TYPE", "litellm_tokens"),
"id": call_id,
"time": dt,
"subject": kwargs.get("user", ""), # end-user passed in via 'user' param
"source": "litellm-proxy",
"data": {"model": model, "cost": cost, **usage},
}
def log_success_event(self, kwargs, response_obj, start_time, end_time):
_url = litellm.get_secret(
"OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud"
)
if _url.endswith("/"):
_url += "api/v1/events"
else:
_url += "/api/v1/events"
api_key = litellm.get_secret("OPENMETER_API_KEY")
_data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
self.sync_http_handler.post(
url=_url,
data=json.dumps(_data),  # serialize the payload to match the cloudevents+json content type
headers={
"Content-Type": "application/cloudevents+json",
"Authorization": "Bearer {}".format(api_key),
},
)
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
_url = litellm.get_secret(
"OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud"
)
if _url.endswith("/"):
_url += "api/v1/events"
else:
_url += "/api/v1/events"
api_key = litellm.get_secret("OPENMETER_API_KEY")
_data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
_headers = {
"Content-Type": "application/cloudevents+json",
"Authorization": "Bearer {}".format(api_key),
}
try:
response = await self.async_http_handler.post(
url=_url,
data=json.dumps(_data),
headers=_headers,
)
response.raise_for_status()
except Exception as e:
print(f"\nAn Exception Occurred - {str(e)}")
if hasattr(response, "text"):
print(f"\nError Message: {response.text}")
raise e
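
For reference, the CloudEvents body that `_common_logic` assembles and posts to `/api/v1/events` looks roughly like this (shown as a Python dict; all values are illustrative):

event = {
    "specversion": "1.0",
    "type": "litellm_tokens",          # overridable via OPENMETER_EVENT_TYPE
    "id": "chatcmpl-123",              # response id, falling back to litellm_call_id
    "time": "2024-05-02T12:00:00+00:00",
    "subject": "end-user-id",          # the 'user' param passed on the request
    "source": "litellm-proxy",
    "data": {
        "model": "gpt-3.5-turbo",
        "cost": 0.000215,
        "prompt_tokens": 11,
        "completion_tokens": 9,
        "total_tokens": 20,
    },
}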

View file

@ -7,11 +7,12 @@ import copy
import traceback import traceback
from litellm._logging import verbose_logger, verbose_proxy_logger from litellm._logging import verbose_logger, verbose_proxy_logger
import litellm import litellm
from typing import List, Literal, Any, Union, Optional from typing import List, Literal, Any, Union, Optional, Dict
from litellm.caching import DualCache from litellm.caching import DualCache
import asyncio import asyncio
import aiohttp import aiohttp
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
import datetime
class SlackAlerting: class SlackAlerting:
@ -37,12 +38,28 @@ class SlackAlerting:
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
], ],
alert_to_webhook_url: Optional[
Dict
] = None, # if user wants to separate alerts to diff channels
): ):
self.alerting_threshold = alerting_threshold self.alerting_threshold = alerting_threshold
self.alerting = alerting self.alerting = alerting
self.alert_types = alert_types self.alert_types = alert_types
self.internal_usage_cache = DualCache() self.internal_usage_cache = DualCache()
self.async_http_handler = AsyncHTTPHandler() self.async_http_handler = AsyncHTTPHandler()
self.alert_to_webhook_url = alert_to_webhook_url
self.langfuse_logger = None
try:
from litellm.integrations.langfuse import LangFuseLogger
self.langfuse_logger = LangFuseLogger(
os.getenv("LANGFUSE_PUBLIC_KEY"),
os.getenv("LANGFUSE_SECRET_KEY"),
flush_interval=1,
)
except:
pass
pass pass
@ -51,6 +68,7 @@ class SlackAlerting:
alerting: Optional[List] = None, alerting: Optional[List] = None,
alerting_threshold: Optional[float] = None, alerting_threshold: Optional[float] = None,
alert_types: Optional[List] = None, alert_types: Optional[List] = None,
alert_to_webhook_url: Optional[Dict] = None,
): ):
if alerting is not None: if alerting is not None:
self.alerting = alerting self.alerting = alerting
@ -59,6 +77,13 @@ class SlackAlerting:
if alert_types is not None: if alert_types is not None:
self.alert_types = alert_types self.alert_types = alert_types
if alert_to_webhook_url is not None:
# update the dict
if self.alert_to_webhook_url is None:
self.alert_to_webhook_url = alert_to_webhook_url
else:
self.alert_to_webhook_url.update(alert_to_webhook_url)
async def deployment_in_cooldown(self): async def deployment_in_cooldown(self):
pass pass
@ -81,39 +106,68 @@ class SlackAlerting:
request_info: str, request_info: str,
request_data: Optional[dict] = None, request_data: Optional[dict] = None,
kwargs: Optional[dict] = None, kwargs: Optional[dict] = None,
type: Literal["hanging_request", "slow_response"] = "hanging_request",
start_time: Optional[datetime.datetime] = None,
end_time: Optional[datetime.datetime] = None,
): ):
import uuid import uuid
# For now: do nothing as we're debugging why this is not working as expected # For now: do nothing as we're debugging why this is not working as expected
if request_data is not None:
trace_id = request_data.get("metadata", {}).get(
"trace_id", None
) # get langfuse trace id
if trace_id is None:
trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
request_data["metadata"]["trace_id"] = trace_id
elif kwargs is not None:
_litellm_params = kwargs.get("litellm_params", {})
trace_id = _litellm_params.get("metadata", {}).get(
"trace_id", None
) # get langfuse trace id
if trace_id is None:
trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
_litellm_params["metadata"]["trace_id"] = trace_id
# Log hanging request as an error on langfuse
if type == "hanging_request":
if self.langfuse_logger is not None:
_logging_kwargs = copy.deepcopy(request_data)
if _logging_kwargs is None:
_logging_kwargs = {}
_logging_kwargs["litellm_params"] = {}
request_data = request_data or {}
_logging_kwargs["litellm_params"]["metadata"] = request_data.get(
"metadata", {}
)
# log to langfuse in a separate thread
import threading
threading.Thread(
target=self.langfuse_logger.log_event,
args=(
_logging_kwargs,
None,
start_time,
end_time,
None,
print,
"ERROR",
"Requests is hanging",
),
).start()
_langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com")
_langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID")
# langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************
_langfuse_url = (
f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}"
)
request_info += f"\n🪢 Langfuse Trace: {_langfuse_url}"
return request_info return request_info
# if request_data is not None:
# trace_id = request_data.get("metadata", {}).get(
# "trace_id", None
# ) # get langfuse trace id
# if trace_id is None:
# trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
# request_data["metadata"]["trace_id"] = trace_id
# elif kwargs is not None:
# _litellm_params = kwargs.get("litellm_params", {})
# trace_id = _litellm_params.get("metadata", {}).get(
# "trace_id", None
# ) # get langfuse trace id
# if trace_id is None:
# trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
# _litellm_params["metadata"]["trace_id"] = trace_id
# _langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com")
# _langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID")
# # langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************
# _langfuse_url = (
# f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}"
# )
# request_info += f"\n🪢 Langfuse Trace: {_langfuse_url}"
# return request_info
def _response_taking_too_long_callback( def _response_taking_too_long_callback(
self, self,
kwargs, # kwargs to completion kwargs, # kwargs to completion
@ -140,7 +194,6 @@ class SlackAlerting:
raise e raise e
def _get_deployment_latencies_to_alert(self, metadata=None): def _get_deployment_latencies_to_alert(self, metadata=None):
if metadata is None: if metadata is None:
return None return None
@ -156,6 +209,14 @@ class SlackAlerting:
_deployment_latencies = metadata["_latency_per_deployment"] _deployment_latencies = metadata["_latency_per_deployment"]
if len(_deployment_latencies) == 0: if len(_deployment_latencies) == 0:
return None return None
try:
# try sorting deployments by latency
_deployment_latencies = sorted(
_deployment_latencies.items(), key=lambda x: x[1]
)
_deployment_latencies = dict(_deployment_latencies)
except:
pass
for api_base, latency in _deployment_latencies.items(): for api_base, latency in _deployment_latencies.items():
_message_to_send += f"\n{api_base}: {round(latency,2)}s" _message_to_send += f"\n{api_base}: {round(latency,2)}s"
_message_to_send = "```" + _message_to_send + "```" _message_to_send = "```" + _message_to_send + "```"
@ -171,8 +232,6 @@ class SlackAlerting:
if self.alerting is None or self.alert_types is None: if self.alerting is None or self.alert_types is None:
return return
if "llm_too_slow" not in self.alert_types:
return
time_difference_float, model, api_base, messages = ( time_difference_float, model, api_base, messages = (
self._response_taking_too_long_callback( self._response_taking_too_long_callback(
kwargs=kwargs, kwargs=kwargs,
@ -185,7 +244,7 @@ class SlackAlerting:
if time_difference_float > self.alerting_threshold: if time_difference_float > self.alerting_threshold:
if "langfuse" in litellm.success_callback: if "langfuse" in litellm.success_callback:
request_info = self._add_langfuse_trace_id_to_alert( request_info = self._add_langfuse_trace_id_to_alert(
request_info=request_info, kwargs=kwargs request_info=request_info, kwargs=kwargs, type="slow_response"
) )
# add deployment latencies to alert # add deployment latencies to alert
if ( if (
@ -205,6 +264,7 @@ class SlackAlerting:
await self.send_alert( await self.send_alert(
message=slow_message + request_info, message=slow_message + request_info,
level="Low", level="Low",
alert_type="llm_too_slow",
) )
async def log_failure_event(self, original_exception: Exception): async def log_failure_event(self, original_exception: Exception):
@ -212,8 +272,8 @@ class SlackAlerting:
async def response_taking_too_long( async def response_taking_too_long(
self, self,
start_time: Optional[float] = None, start_time: Optional[datetime.datetime] = None,
end_time: Optional[float] = None, end_time: Optional[datetime.datetime] = None,
type: Literal["hanging_request", "slow_response"] = "hanging_request", type: Literal["hanging_request", "slow_response"] = "hanging_request",
request_data: Optional[dict] = None, request_data: Optional[dict] = None,
): ):
@ -233,17 +293,10 @@ class SlackAlerting:
except: except:
messages = "" messages = ""
request_info = f"\nRequest Model: `{model}`\nMessages: `{messages}`" request_info = f"\nRequest Model: `{model}`\nMessages: `{messages}`"
if "langfuse" in litellm.success_callback:
request_info = self._add_langfuse_trace_id_to_alert(
request_info=request_info, request_data=request_data
)
else: else:
request_info = "" request_info = ""
if type == "hanging_request": if type == "hanging_request":
# Simulate a long-running operation that could take more than 5 minutes
if "llm_requests_hanging" not in self.alert_types:
return
await asyncio.sleep( await asyncio.sleep(
self.alerting_threshold self.alerting_threshold
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests ) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
@ -281,6 +334,15 @@ class SlackAlerting:
f"`Requests are hanging - {self.alerting_threshold}s+ request time`" f"`Requests are hanging - {self.alerting_threshold}s+ request time`"
) )
if "langfuse" in litellm.success_callback:
request_info = self._add_langfuse_trace_id_to_alert(
request_info=request_info,
request_data=request_data,
type="hanging_request",
start_time=start_time,
end_time=end_time,
)
# add deployment latencies to alert # add deployment latencies to alert
_deployment_latency_map = self._get_deployment_latencies_to_alert( _deployment_latency_map = self._get_deployment_latencies_to_alert(
metadata=request_data.get("metadata", {}) metadata=request_data.get("metadata", {})
@ -291,6 +353,7 @@ class SlackAlerting:
await self.send_alert( await self.send_alert(
message=alerting_message + request_info, message=alerting_message + request_info,
level="Medium", level="Medium",
alert_type="llm_requests_hanging",
) )
async def budget_alerts( async def budget_alerts(
@ -336,8 +399,7 @@ class SlackAlerting:
user_info = f"\nUser ID: {user_id}\n Error {error_message}" user_info = f"\nUser ID: {user_id}\n Error {error_message}"
message = "Failed Tracking Cost for" + user_info message = "Failed Tracking Cost for" + user_info
await self.send_alert( await self.send_alert(
message=message, message=message, level="High", alert_type="budget_alerts"
level="High",
) )
return return
elif type == "projected_limit_exceeded" and user_info is not None: elif type == "projected_limit_exceeded" and user_info is not None:
@ -353,8 +415,7 @@ class SlackAlerting:
""" """
message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` {user_info["key_alias"]} \n`Expected Day of Error`: {user_info["projected_exceeded_date"]} \n`Current Spend`: {user_current_spend} \n`Projected Spend at end of month`: {user_info["projected_spend"]} \n`Soft Limit`: {user_max_budget}""" message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` {user_info["key_alias"]} \n`Expected Day of Error`: {user_info["projected_exceeded_date"]} \n`Current Spend`: {user_current_spend} \n`Projected Spend at end of month`: {user_info["projected_spend"]} \n`Soft Limit`: {user_max_budget}"""
await self.send_alert( await self.send_alert(
message=message, message=message, level="High", alert_type="budget_alerts"
level="High",
) )
return return
else: else:
@ -382,8 +443,7 @@ class SlackAlerting:
result = await _cache.async_get_cache(key=message) result = await _cache.async_get_cache(key=message)
if result is None: if result is None:
await self.send_alert( await self.send_alert(
message=message, message=message, level="High", alert_type="budget_alerts"
level="High",
) )
await _cache.async_set_cache(key=message, value="SENT", ttl=2419200) await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
return return
@ -395,8 +455,7 @@ class SlackAlerting:
result = await _cache.async_get_cache(key=cache_key) result = await _cache.async_get_cache(key=cache_key)
if result is None: if result is None:
await self.send_alert( await self.send_alert(
message=message, message=message, level="Medium", alert_type="budget_alerts"
level="Medium",
) )
await _cache.async_set_cache(key=cache_key, value="SENT", ttl=2419200) await _cache.async_set_cache(key=cache_key, value="SENT", ttl=2419200)
@ -409,15 +468,25 @@ class SlackAlerting:
result = await _cache.async_get_cache(key=message) result = await _cache.async_get_cache(key=message)
if result is None: if result is None:
await self.send_alert( await self.send_alert(
message=message, message=message, level="Low", alert_type="budget_alerts"
level="Low",
) )
await _cache.async_set_cache(key=message, value="SENT", ttl=2419200) await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
return return
return return
async def send_alert(self, message: str, level: Literal["Low", "Medium", "High"]): async def send_alert(
self,
message: str,
level: Literal["Low", "Medium", "High"],
alert_type: Literal[
"llm_exceptions",
"llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
],
):
""" """
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298 Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
@ -432,12 +501,6 @@ class SlackAlerting:
level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'. level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'.
message: str - what is the alert about message: str - what is the alert about
""" """
print(
"inside send alert for slack, message: ",
message,
"self.alerting: ",
self.alerting,
)
if self.alerting is None: if self.alerting is None:
return return
@ -453,7 +516,15 @@ class SlackAlerting:
if _proxy_base_url is not None: if _proxy_base_url is not None:
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`" formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None) # check if we find the slack webhook url in self.alert_to_webhook_url
if (
self.alert_to_webhook_url is not None
and alert_type in self.alert_to_webhook_url
):
slack_webhook_url = self.alert_to_webhook_url[alert_type]
else:
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
if slack_webhook_url is None: if slack_webhook_url is None:
raise Exception("Missing SLACK_WEBHOOK_URL from environment") raise Exception("Missing SLACK_WEBHOOK_URL from environment")
payload = {"text": formatted_message} payload = {"text": formatted_message}
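
With the `alert_to_webhook_url` parameter introduced above, each alert type can be routed to its own Slack channel, falling back to `SLACK_WEBHOOK_URL` when no mapping exists. A minimal sketch using only the constructor arguments shown in this diff (webhook URLs are placeholders; the import path is assumed, adjust it to wherever `SlackAlerting` lives in your version):

from litellm.integrations.slack_alerting import SlackAlerting  # assumed module path

slack_alerting = SlackAlerting(
    alerting=["slack"],
    alert_types=["llm_too_slow", "llm_requests_hanging", "budget_alerts"],
    alert_to_webhook_url={
        # alert types without an entry fall back to the SLACK_WEBHOOK_URL env var
        "llm_too_slow": "https://hooks.slack.com/services/T000/B000/slow-channel",
        "budget_alerts": "https://hooks.slack.com/services/T000/B000/budget-channel",
    },
)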

View file

@ -96,6 +96,15 @@ class AzureOpenAIConfig(OpenAIConfig):
top_p, top_p,
) )
def get_mapped_special_auth_params(self) -> dict:
return {"token": "azure_ad_token"}
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "token":
optional_params["azure_ad_token"] = value
return optional_params
def select_azure_base_url_or_endpoint(azure_client_params: dict): def select_azure_base_url_or_endpoint(azure_client_params: dict):
# azure_client_params = { # azure_client_params = {

View file

@ -29,6 +29,24 @@ class BedrockError(Exception):
) # Call the base class constructor with the parameters it needs ) # Call the base class constructor with the parameters it needs
class AmazonBedrockGlobalConfig:
def __init__(self):
pass
def get_mapped_special_auth_params(self) -> dict:
"""
Mapping of common auth params across bedrock/vertex/azure/watsonx
"""
return {"region_name": "aws_region_name"}
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
mapped_params = self.get_mapped_special_auth_params()
for param, value in non_default_params.items():
if param in mapped_params:
optional_params[mapped_params[param]] = value
return optional_params
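
The `get_mapped_special_auth_params` / `map_special_auth_params` pair added here (and mirrored for Azure, Vertex AI and watsonx elsewhere in this diff) translates shared auth param names into provider-specific ones. A small sketch of what the Bedrock mapping does:

from litellm.llms.bedrock import AmazonBedrockGlobalConfig

config = AmazonBedrockGlobalConfig()
optional_params = config.map_special_auth_params(
    non_default_params={"region_name": "us-west-2"},  # common name shared across providers
    optional_params={},
)
# optional_params == {"aws_region_name": "us-west-2"}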
class AmazonTitanConfig: class AmazonTitanConfig:
""" """
Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-text-express-v1 Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-text-express-v1
@ -666,6 +684,10 @@ def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
prompt = prompt_factory( prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock" model=model, messages=messages, custom_llm_provider="bedrock"
) )
elif provider == "meta":
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
else: else:
prompt = "" prompt = ""
for message in messages: for message in messages:
@ -945,7 +967,7 @@ def completion(
original_response=json.dumps(response_body), original_response=json.dumps(response_body),
additional_args={"complete_input_dict": data}, additional_args={"complete_input_dict": data},
) )
print_verbose(f"raw model_response: {response}") print_verbose(f"raw model_response: {response_body}")
## RESPONSE OBJECT ## RESPONSE OBJECT
outputText = "default" outputText = "default"
if provider == "ai21": if provider == "ai21":
@ -1058,6 +1080,7 @@ def completion(
outputText = response_body.get("results")[0].get("outputText") outputText = response_body.get("results")[0].get("outputText")
response_metadata = response.get("ResponseMetadata", {}) response_metadata = response.get("ResponseMetadata", {})
if response_metadata.get("HTTPStatusCode", 500) >= 400: if response_metadata.get("HTTPStatusCode", 500) >= 400:
raise BedrockError( raise BedrockError(
message=outputText, message=outputText,
@ -1093,11 +1116,13 @@ def completion(
prompt_tokens = response_metadata.get( prompt_tokens = response_metadata.get(
"x-amzn-bedrock-input-token-count", len(encoding.encode(prompt)) "x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
) )
_text_response = model_response["choices"][0]["message"].get("content", "")
completion_tokens = response_metadata.get( completion_tokens = response_metadata.get(
"x-amzn-bedrock-output-token-count", "x-amzn-bedrock-output-token-count",
len( len(
encoding.encode( encoding.encode(
model_response["choices"][0]["message"].get("content", "") _text_response,
disallowed_special=(),
) )
), ),
) )

View file

@ -213,12 +213,13 @@ def get_ollama_response(
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop" model_response["choices"][0]["finish_reason"] = "stop"
if optional_params.get("format", "") == "json": if optional_params.get("format", "") == "json":
function_call = json.loads(response_json["response"])
message = litellm.Message( message = litellm.Message(
content=None, content=None,
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": {"arguments": response_json["response"], "name": ""}, "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"type": "function", "type": "function",
} }
], ],
@ -310,15 +311,13 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop" model_response["choices"][0]["finish_reason"] = "stop"
if data.get("format", "") == "json": if data.get("format", "") == "json":
function_call = json.loads(response_json["response"])
message = litellm.Message( message = litellm.Message(
content=None, content=None,
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": { "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"arguments": response_json["response"],
"name": "",
},
"type": "function", "type": "function",
} }
], ],
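
The change above (repeated for the chat endpoint in the next file) assumes the model's JSON-mode output is itself a function call of the form {"name": ..., "arguments": {...}} and unpacks it into an OpenAI-style tool call instead of stuffing the raw string into `arguments`. A sketch of the transformation, with an illustrative Ollama response:

import json
import uuid

response_json = {"response": '{"name": "get_weather", "arguments": {"city": "Paris"}}'}

function_call = json.loads(response_json["response"])
tool_call = {
    "id": f"call_{uuid.uuid4()}",
    "function": {
        "name": function_call["name"],
        "arguments": json.dumps(function_call["arguments"]),
    },
    "type": "function",
}
# tool_call["function"]["arguments"] == '{"city": "Paris"}'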

View file

@ -285,15 +285,13 @@ def get_ollama_response(
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop" model_response["choices"][0]["finish_reason"] = "stop"
if data.get("format", "") == "json": if data.get("format", "") == "json":
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message( message = litellm.Message(
content=None, content=None,
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": { "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"arguments": response_json["message"]["content"],
"name": "",
},
"type": "function", "type": "function",
} }
], ],
@ -415,15 +413,13 @@ async def ollama_acompletion(
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop" model_response["choices"][0]["finish_reason"] = "stop"
if data.get("format", "") == "json": if data.get("format", "") == "json":
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message( message = litellm.Message(
content=None, content=None,
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": { "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"arguments": response_json["message"]["content"],
"name": function_name or "",
},
"type": "function", "type": "function",
} }
], ],

View file

@ -447,6 +447,7 @@ class OpenAIChatCompletion(BaseLLM):
) )
else: else:
openai_aclient = client openai_aclient = client
## LOGGING ## LOGGING
logging_obj.pre_call( logging_obj.pre_call(
input=data["messages"], input=data["messages"],

View file

@ -3,8 +3,14 @@ import requests, traceback
import json, re, xml.etree.ElementTree as ET import json, re, xml.etree.ElementTree as ET
from jinja2 import Template, exceptions, meta, BaseLoader from jinja2 import Template, exceptions, meta, BaseLoader
from jinja2.sandbox import ImmutableSandboxedEnvironment from jinja2.sandbox import ImmutableSandboxedEnvironment
from typing import Optional, Any from typing import (
from typing import List Any,
List,
Mapping,
MutableMapping,
Optional,
Sequence,
)
import litellm import litellm
@ -431,6 +437,35 @@ def format_prompt_togetherai(messages, prompt_format, chat_template):
return prompt return prompt
### IBM Granite
def ibm_granite_pt(messages: list):
"""
IBM's Granite models use the template:
<|system|> {system_message} <|user|> {user_message} <|assistant|> {assistant_message}
See: https://www.ibm.com/docs/en/watsonx-as-a-service?topic=solutions-supported-foundation-models
"""
return custom_prompt(
messages=messages,
role_dict={
"system": {
"pre_message": "<|system|>\n",
"post_message": "\n",
},
"user": {
"pre_message": "<|user|>\n",
"post_message": "\n",
},
"assistant": {
"pre_message": "<|assistant|>\n",
"post_message": "\n",
},
},
).strip()
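
As an example, a short conversation run through `ibm_granite_pt` renders roughly as follows (assuming `custom_prompt` simply wraps each message's content in the pre/post markers above):

from litellm.llms.prompt_templates.factory import ibm_granite_pt

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is watsonx.ai?"},
]
prompt = ibm_granite_pt(messages)
# prompt (roughly):
# <|system|>
# You are a helpful assistant.
# <|user|>
# What is watsonx.ai?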
### ANTHROPIC ### ### ANTHROPIC ###
@ -1017,6 +1052,30 @@ def get_system_prompt(messages):
return system_prompt, messages return system_prompt, messages
def convert_to_documents(
observations: Any,
) -> List[MutableMapping]:
"""Converts observations into a 'document' dict"""
documents: List[MutableMapping] = []
if isinstance(observations, str):
# strings are turned into a key/value pair and a key of 'output' is added.
observations = [{"output": observations}]
elif isinstance(observations, Mapping):
# single mappings are transformed into a list to simplify the rest of the code.
observations = [observations]
elif not isinstance(observations, Sequence):
# all other types are turned into a key/value pair within a list
observations = [{"output": observations}]
for doc in observations:
if not isinstance(doc, Mapping):
# types that aren't Mapping are turned into a key/value pair.
doc = {"output": doc}
documents.append(doc)
return documents
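
A few quick examples of what `convert_to_documents` returns, one per branch above:

from litellm.llms.prompt_templates.factory import convert_to_documents

convert_to_documents("Sunny, 22C")                      # -> [{"output": "Sunny, 22C"}]
convert_to_documents({"temperature": 22, "unit": "C"})  # -> [{"temperature": 22, "unit": "C"}]
convert_to_documents([{"temperature": 22}, "cloudy"])   # -> [{"temperature": 22}, {"output": "cloudy"}]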
def convert_openai_message_to_cohere_tool_result(message): def convert_openai_message_to_cohere_tool_result(message):
""" """
OpenAI message with a tool result looks like: OpenAI message with a tool result looks like:
@ -1058,7 +1117,7 @@ def convert_openai_message_to_cohere_tool_result(message):
"parameters": {"location": "San Francisco, CA"}, "parameters": {"location": "San Francisco, CA"},
"generation_id": tool_call_id, "generation_id": tool_call_id,
}, },
"outputs": [content], "outputs": convert_to_documents(content),
} }
return cohere_tool_result return cohere_tool_result
@ -1071,7 +1130,7 @@ def cohere_message_pt(messages: list):
if message["role"] == "tool": if message["role"] == "tool":
tool_result = convert_openai_message_to_cohere_tool_result(message) tool_result = convert_openai_message_to_cohere_tool_result(message)
tool_results.append(tool_result) tool_results.append(tool_result)
else: elif message.get("content"):
prompt += message["content"] + "\n\n" prompt += message["content"] + "\n\n"
prompt = prompt.rstrip() prompt = prompt.rstrip()
return prompt, tool_results return prompt, tool_results
@ -1346,12 +1405,47 @@ def prompt_factory(
return anthropic_pt(messages=messages) return anthropic_pt(messages=messages)
elif "mistral." in model: elif "mistral." in model:
return mistral_instruct_pt(messages=messages) return mistral_instruct_pt(messages=messages)
elif "llama2" in model and "chat" in model:
return llama_2_chat_pt(messages=messages)
elif "llama3" in model and "instruct" in model:
return hf_chat_template(
model="meta-llama/Meta-Llama-3-8B-Instruct",
messages=messages,
)
elif custom_llm_provider == "perplexity": elif custom_llm_provider == "perplexity":
for message in messages: for message in messages:
message.pop("name", None) message.pop("name", None)
return messages return messages
elif custom_llm_provider == "azure_text": elif custom_llm_provider == "azure_text":
return azure_text_pt(messages=messages) return azure_text_pt(messages=messages)
elif custom_llm_provider == "watsonx":
if "granite" in model and "chat" in model:
# granite-13b-chat-v1 and granite-13b-chat-v2 use a specific prompt template
return ibm_granite_pt(messages=messages)
elif "ibm-mistral" in model and "instruct" in model:
# models like ibm-mistral/mixtral-8x7b-instruct-v01-q use the mistral instruct prompt template
return mistral_instruct_pt(messages=messages)
elif "meta-llama/llama-3" in model and "instruct" in model:
# https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
return custom_prompt(
role_dict={
"system": {
"pre_message": "<|start_header_id|>system<|end_header_id|>\n",
"post_message": "<|eot_id|>",
},
"user": {
"pre_message": "<|start_header_id|>user<|end_header_id|>\n",
"post_message": "<|eot_id|>",
},
"assistant": {
"pre_message": "<|start_header_id|>assistant<|end_header_id|>\n",
"post_message": "<|eot_id|>",
},
},
messages=messages,
initial_prompt_value="<|begin_of_text|>",
final_prompt_value="<|start_header_id|>assistant<|end_header_id|>\n",
)
try: try:
if "meta-llama/llama-2" in model and "chat" in model: if "meta-llama/llama-2" in model and "chat" in model:
return llama_2_chat_pt(messages=messages) return llama_2_chat_pt(messages=messages)
@ -1359,11 +1453,8 @@ def prompt_factory(
"meta-llama/llama-3" in model or "meta-llama-3" in model "meta-llama/llama-3" in model or "meta-llama-3" in model
) and "instruct" in model: ) and "instruct" in model:
return hf_chat_template( return hf_chat_template(
model=model, model="meta-llama/Meta-Llama-3-8B-Instruct",
messages=messages, messages=messages,
chat_template=known_tokenizer_config[ # type: ignore
"meta-llama/Meta-Llama-3-8B-Instruct"
]["tokenizer"]["chat_template"],
) )
elif ( elif (
"tiiuae/falcon" in model "tiiuae/falcon" in model

View file

@ -112,10 +112,16 @@ def start_prediction(
} }
initial_prediction_data = { initial_prediction_data = {
"version": version_id,
"input": input_data, "input": input_data,
} }
if ":" in version_id and len(version_id) > 64:
model_parts = version_id.split(":")
if (
len(model_parts) > 1 and len(model_parts[1]) == 64
): ## checks if the model name carries a 64-character version id - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
initial_prediction_data["version"] = model_parts[1]
## LOGGING ## LOGGING
logging_obj.pre_call( logging_obj.pre_call(
input=input_data["prompt"], input=input_data["prompt"],

View file

@ -143,7 +143,9 @@ class VertexAIConfig:
optional_params["temperature"] = value optional_params["temperature"] = value
if param == "top_p": if param == "top_p":
optional_params["top_p"] = value optional_params["top_p"] = value
if param == "stream": if (
param == "stream" and value == True
): # only forward stream=True; sending stream=False can be passed through unchecked and raise issues
optional_params["stream"] = value optional_params["stream"] = value
if param == "n": if param == "n":
optional_params["candidate_count"] = value optional_params["candidate_count"] = value
@ -182,6 +184,20 @@ class VertexAIConfig:
pass pass
return optional_params return optional_params
def get_mapped_special_auth_params(self) -> dict:
"""
Common auth params across bedrock/vertex_ai/azure/watsonx
"""
return {"project": "vertex_project", "region_name": "vertex_location"}
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
mapped_params = self.get_mapped_special_auth_params()
for param, value in non_default_params.items():
if param in mapped_params:
optional_params[mapped_params[param]] = value
return optional_params
import asyncio import asyncio
@ -527,6 +543,7 @@ def completion(
"instances": instances, "instances": instances,
"vertex_location": vertex_location, "vertex_location": vertex_location,
"vertex_project": vertex_project, "vertex_project": vertex_project,
"safety_settings": safety_settings,
**optional_params, **optional_params,
} }
if optional_params.get("stream", False) is True: if optional_params.get("stream", False) is True:
@ -541,8 +558,9 @@ def completion(
tools = optional_params.pop("tools", None) tools = optional_params.pop("tools", None)
prompt, images = _gemini_vision_convert_messages(messages=messages) prompt, images = _gemini_vision_convert_messages(messages=messages)
content = [prompt] + images content = [prompt] + images
if "stream" in optional_params and optional_params["stream"] == True: stream = optional_params.pop("stream", False)
stream = optional_params.pop("stream") if stream == True:
request_str += f"response = llm_model.generate_content({content}, generation_config=GenerationConfig(**{optional_params}), safety_settings={safety_settings}, stream={stream})\n" request_str += f"response = llm_model.generate_content({content}, generation_config=GenerationConfig(**{optional_params}), safety_settings={safety_settings}, stream={stream})\n"
logging_obj.pre_call( logging_obj.pre_call(
input=prompt, input=prompt,
@ -810,6 +828,7 @@ async def async_completion(
instances=None, instances=None,
vertex_project=None, vertex_project=None,
vertex_location=None, vertex_location=None,
safety_settings=None,
**optional_params, **optional_params,
): ):
""" """
@ -820,6 +839,7 @@ async def async_completion(
print_verbose("\nMaking VertexAI Gemini Pro/Vision Call") print_verbose("\nMaking VertexAI Gemini Pro/Vision Call")
print_verbose(f"\nProcessing input messages = {messages}") print_verbose(f"\nProcessing input messages = {messages}")
tools = optional_params.pop("tools", None) tools = optional_params.pop("tools", None)
stream = optional_params.pop("stream", False)
prompt, images = _gemini_vision_convert_messages(messages=messages) prompt, images = _gemini_vision_convert_messages(messages=messages)
content = [prompt] + images content = [prompt] + images
@ -840,6 +860,7 @@ async def async_completion(
response = await llm_model._generate_content_async( response = await llm_model._generate_content_async(
contents=content, contents=content,
generation_config=optional_params, generation_config=optional_params,
safety_settings=safety_settings,
tools=tools, tools=tools,
) )
@ -1018,6 +1039,7 @@ async def async_streaming(
instances=None, instances=None,
vertex_project=None, vertex_project=None,
vertex_location=None, vertex_location=None,
safety_settings=None,
**optional_params, **optional_params,
): ):
""" """
@ -1044,6 +1066,7 @@ async def async_streaming(
response = await llm_model._generate_content_streaming_async( response = await llm_model._generate_content_streaming_async(
contents=content, contents=content,
generation_config=optional_params, generation_config=optional_params,
safety_settings=safety_settings,
tools=tools, tools=tools,
) )
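
Since `safety_settings` is now threaded through the sync, async and streaming Gemini paths above, callers can pass safety preferences straight through the completion call. A hedged sketch (the category/threshold strings follow Google's SafetySetting schema; treat the exact values as illustrative):

from litellm import completion

response = completion(
    model="vertex_ai/gemini-pro",
    messages=[{"role": "user", "content": "hello"}],
    safety_settings=[
        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"},
        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_ONLY_HIGH"},
    ],
)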

609
litellm/llms/watsonx.py Normal file
View file

@ -0,0 +1,609 @@
from enum import Enum
import json, types, time # noqa: E401
from contextlib import contextmanager
from typing import Callable, Dict, Optional, Any, Union, List
import httpx
import requests
import litellm
from litellm.utils import ModelResponse, get_secret, Usage
from .base import BaseLLM
from .prompt_templates import factory as ptf
class WatsonXAIError(Exception):
def __init__(self, status_code, message, url: Optional[str] = None):
self.status_code = status_code
self.message = message
url = url or "https://us-south.ml.cloud.ibm.com"
self.request = httpx.Request(method="POST", url=url)
self.response = httpx.Response(status_code=status_code, request=self.request)
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
class IBMWatsonXAIConfig:
"""
Reference: https://cloud.ibm.com/apidocs/watsonx-ai#text-generation
(See ibm_watsonx_ai.metanames.GenTextParamsMetaNames for a list of all available params)
Supported params for all available watsonx.ai foundational models.
- `decoding_method` (str): One of "greedy" or "sample"
- `temperature` (float): Sets the model temperature for sampling - not available when decoding_method='greedy'.
- `max_new_tokens` (integer): Maximum length of the generated tokens.
- `min_new_tokens` (integer): Minimum number of new tokens to be generated.
- `length_penalty` (dict): A dictionary with keys "decay_factor" and "start_index".
- `stop_sequences` (string[]): list of strings to use as stop sequences.
- `top_k` (integer): top k for sampling - not available when decoding_method='greedy'.
- `top_p` (integer): top p for sampling - not available when decoding_method='greedy'.
- `repetition_penalty` (float): token repetition penalty during text generation.
- `truncate_input_tokens` (integer): Truncate input tokens to this length.
- `include_stop_sequences` (bool): If True, the stop sequence will be included at the end of the generated text in the case of a match.
- `return_options` (dict): A dictionary of options to return. Options include "input_text", "generated_tokens", "input_tokens", "token_ranks". Values are boolean.
- `random_seed` (integer): Random seed for text generation.
- `moderations` (dict): Dictionary of properties that control the moderations, for usages such as Hate and profanity (HAP) and PII filtering.
- `stream` (bool): If True, the model will return a stream of responses.
"""
decoding_method: Optional[str] = "sample"
temperature: Optional[float] = None
max_new_tokens: Optional[int] = None # litellm.max_tokens
min_new_tokens: Optional[int] = None
length_penalty: Optional[dict] = None # e.g {"decay_factor": 2.5, "start_index": 5}
stop_sequences: Optional[List[str]] = None # e.g ["}", ")", "."]
top_k: Optional[int] = None
top_p: Optional[float] = None
repetition_penalty: Optional[float] = None
truncate_input_tokens: Optional[int] = None
include_stop_sequences: Optional[bool] = False
return_options: Optional[Dict[str, bool]] = None
random_seed: Optional[int] = None # e.g 42
moderations: Optional[dict] = None
stream: Optional[bool] = False
def __init__(
self,
decoding_method: Optional[str] = None,
temperature: Optional[float] = None,
max_new_tokens: Optional[int] = None,
min_new_tokens: Optional[int] = None,
length_penalty: Optional[dict] = None,
stop_sequences: Optional[List[str]] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
repetition_penalty: Optional[float] = None,
truncate_input_tokens: Optional[int] = None,
include_stop_sequences: Optional[bool] = None,
return_options: Optional[dict] = None,
random_seed: Optional[int] = None,
moderations: Optional[dict] = None,
stream: Optional[bool] = None,
**kwargs,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
def get_supported_openai_params(self):
return [
"temperature", # equivalent to temperature
"max_tokens", # equivalent to max_new_tokens
"top_p", # equivalent to top_p
"frequency_penalty", # equivalent to repetition_penalty
"stop", # equivalent to stop_sequences
"seed", # equivalent to random_seed
"stream", # equivalent to stream
]
def get_mapped_special_auth_params(self) -> dict:
"""
Common auth params across bedrock/vertex_ai/azure/watsonx
"""
return {
"project": "watsonx_project",
"region_name": "watsonx_region_name",
"token": "watsonx_token",
}
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
mapped_params = self.get_mapped_special_auth_params()
for param, value in non_default_params.items():
if param in mapped_params:
optional_params[mapped_params[param]] = value
return optional_params
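
Because `__init__` writes the supplied values onto the class and `completion()` later merges `IBMWatsonXAIConfig.get_config()` into any missing `optional_params`, the config class doubles as a place to set process-wide defaults. A small sketch of that pattern (assuming the class is re-exported at the package root like the other provider configs; otherwise import it from `litellm.llms.watsonx`):

import litellm

# requests to watsonx models that don't set these params will pick up the defaults
litellm.IBMWatsonXAIConfig(
    decoding_method="greedy",
    max_new_tokens=200,
    repetition_penalty=1.1,
)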
def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
# handle anthropic prompts and amazon titan prompts
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_dict = custom_prompt_dict[model]
prompt = ptf.custom_prompt(
messages=messages,
role_dict=model_prompt_dict.get(
"role_dict", model_prompt_dict.get("roles")
),
initial_prompt_value=model_prompt_dict.get("initial_prompt_value", ""),
final_prompt_value=model_prompt_dict.get("final_prompt_value", ""),
bos_token=model_prompt_dict.get("bos_token", ""),
eos_token=model_prompt_dict.get("eos_token", ""),
)
return prompt
elif provider == "ibm":
prompt = ptf.prompt_factory(
model=model, messages=messages, custom_llm_provider="watsonx"
)
elif provider == "ibm-mistralai":
prompt = ptf.mistral_instruct_pt(messages=messages)
else:
prompt = ptf.prompt_factory(
model=model, messages=messages, custom_llm_provider="watsonx"
)
return prompt
class WatsonXAIEndpoint(str, Enum):
TEXT_GENERATION = "/ml/v1/text/generation"
TEXT_GENERATION_STREAM = "/ml/v1/text/generation_stream"
DEPLOYMENT_TEXT_GENERATION = "/ml/v1/deployments/{deployment_id}/text/generation"
DEPLOYMENT_TEXT_GENERATION_STREAM = (
"/ml/v1/deployments/{deployment_id}/text/generation_stream"
)
EMBEDDINGS = "/ml/v1/text/embeddings"
PROMPTS = "/ml/v1/prompts"
class IBMWatsonXAI(BaseLLM):
"""
Class to interface with IBM Watsonx.ai API for text generation and embeddings.
Reference: https://cloud.ibm.com/apidocs/watsonx-ai
"""
api_version = "2024-03-13"
def __init__(self) -> None:
super().__init__()
def _prepare_text_generation_req(
self,
model_id: str,
prompt: str,
stream: bool,
optional_params: dict,
print_verbose: Optional[Callable] = None,
) -> dict:
"""
Get the request parameters for text generation.
"""
api_params = self._get_api_params(optional_params, print_verbose=print_verbose)
# build auth headers
api_token = api_params.get("token")
headers = {
"Authorization": f"Bearer {api_token}",
"Content-Type": "application/json",
"Accept": "application/json",
}
extra_body_params = optional_params.pop("extra_body", {})
optional_params.update(extra_body_params)
# init the payload to the text generation call
payload = {
"input": prompt,
"moderations": optional_params.pop("moderations", {}),
"parameters": optional_params,
}
request_params = dict(version=api_params["api_version"])
# text generation endpoint deployment or model / stream or not
if model_id.startswith("deployment/"):
# deployment models are passed in as 'deployment/<deployment_id>'
if api_params.get("space_id") is None:
raise WatsonXAIError(
status_code=401,
url=api_params["url"],
message="Error: space_id is required for models called using the 'deployment/' endpoint. Pass in the space_id as a parameter or set it in the WX_SPACE_ID environment variable.",
)
deployment_id = "/".join(model_id.split("/")[1:])
endpoint = (
WatsonXAIEndpoint.DEPLOYMENT_TEXT_GENERATION_STREAM.value
if stream
else WatsonXAIEndpoint.DEPLOYMENT_TEXT_GENERATION.value
)
endpoint = endpoint.format(deployment_id=deployment_id)
else:
payload["model_id"] = model_id
payload["project_id"] = api_params["project_id"]
endpoint = (
WatsonXAIEndpoint.TEXT_GENERATION_STREAM
if stream
else WatsonXAIEndpoint.TEXT_GENERATION
)
url = api_params["url"].rstrip("/") + endpoint
return dict(
method="POST", url=url, headers=headers, json=payload, params=request_params
)
def _get_api_params(
self, params: dict, print_verbose: Optional[Callable] = None
) -> dict:
"""
Find watsonx.ai credentials in the params or environment variables and return them as a dict of request parameters (url, token, project_id, etc.).
"""
# Load auth variables from params
url = params.pop("url", params.pop("api_base", params.pop("base_url", None)))
api_key = params.pop("apikey", None)
token = params.pop("token", None)
project_id = params.pop(
"project_id", params.pop("watsonx_project", None)
) # watsonx.ai project_id - allow 'watsonx_project' to be consistent with how vertex project implementation works -> reduce provider-specific params
space_id = params.pop("space_id", None) # watsonx.ai deployment space_id
region_name = params.pop("region_name", params.pop("region", None))
if region_name is None:
region_name = params.pop(
"watsonx_region_name", params.pop("watsonx_region", None)
) # consistent with how vertex ai + aws regions are accepted
wx_credentials = params.pop(
"wx_credentials",
params.pop(
"watsonx_credentials", None
), # follow {provider}_credentials, same as vertex ai
)
api_version = params.pop("api_version", IBMWatsonXAI.api_version)
# Load auth variables from environment variables
if url is None:
url = (
get_secret("WATSONX_API_BASE") # consistent with 'AZURE_API_BASE'
or get_secret("WATSONX_URL")
or get_secret("WX_URL")
or get_secret("WML_URL")
)
if api_key is None:
api_key = (
get_secret("WATSONX_APIKEY")
or get_secret("WATSONX_API_KEY")
or get_secret("WX_API_KEY")
)
if token is None:
token = get_secret("WATSONX_TOKEN") or get_secret("WX_TOKEN")
if project_id is None:
project_id = (
get_secret("WATSONX_PROJECT_ID")
or get_secret("WX_PROJECT_ID")
or get_secret("PROJECT_ID")
)
if region_name is None:
region_name = (
get_secret("WATSONX_REGION")
or get_secret("WX_REGION")
or get_secret("REGION")
)
if space_id is None:
space_id = (
get_secret("WATSONX_DEPLOYMENT_SPACE_ID")
or get_secret("WATSONX_SPACE_ID")
or get_secret("WX_SPACE_ID")
or get_secret("SPACE_ID")
)
# credentials parsing
if wx_credentials is not None:
url = wx_credentials.get("url", url)
api_key = wx_credentials.get(
"apikey", wx_credentials.get("api_key", api_key)
)
token = wx_credentials.get(
"token",
wx_credentials.get(
"watsonx_token", token
), # follow format of {provider}_token, same as azure - e.g. 'azure_ad_token=..'
)
# verify that all required credentials are present
if url is None:
raise WatsonXAIError(
status_code=401,
message="Error: Watsonx URL not set. Set WX_URL in environment variables or pass in as a parameter.",
)
if token is None and api_key is not None:
# generate the auth token
if print_verbose:
print_verbose("Generating IAM token for Watsonx.ai")
token = self.generate_iam_token(api_key)
elif token is None and api_key is None:
raise WatsonXAIError(
status_code=401,
url=url,
message="Error: API key or token not found. Set WX_API_KEY or WX_TOKEN in environment variables or pass in as a parameter.",
)
if project_id is None:
raise WatsonXAIError(
status_code=401,
url=url,
message="Error: Watsonx project_id not set. Set WX_PROJECT_ID in environment variables or pass in as a parameter.",
)
return {
"url": url,
"api_key": api_key,
"token": token,
"project_id": project_id,
"space_id": space_id,
"region_name": region_name,
"api_version": api_version,
}
def completion(
self,
model: str,
messages: list,
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
logging_obj,
optional_params: dict,
litellm_params: Optional[dict] = None,
logger_fn=None,
timeout: Optional[float] = None,
):
"""
Send a text generation request to the IBM Watsonx.ai API.
Reference: https://cloud.ibm.com/apidocs/watsonx-ai#text-generation
"""
stream = optional_params.pop("stream", False)
# Load default configs
config = IBMWatsonXAIConfig.get_config()
for k, v in config.items():
if k not in optional_params:
optional_params[k] = v
# Make prompt to send to model
provider = model.split("/")[0]
# model_name = "/".join(model.split("/")[1:])
prompt = convert_messages_to_prompt(
model, messages, provider, custom_prompt_dict
)
def process_text_request(request_params: dict) -> ModelResponse:
with self._manage_response(
request_params, logging_obj=logging_obj, input=prompt, timeout=timeout
) as resp:
json_resp = resp.json()
generated_text = json_resp["results"][0]["generated_text"]
prompt_tokens = json_resp["results"][0]["input_token_count"]
completion_tokens = json_resp["results"][0]["generated_token_count"]
model_response["choices"][0]["message"]["content"] = generated_text
model_response["finish_reason"] = json_resp["results"][0]["stop_reason"]
model_response["created"] = int(time.time())
model_response["model"] = model
setattr(
model_response,
"usage",
Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
def process_stream_request(
request_params: dict,
) -> litellm.CustomStreamWrapper:
# stream the response - generated chunks will be handled
# by litellm.utils.CustomStreamWrapper.handle_watsonx_stream
with self._manage_response(
request_params,
logging_obj=logging_obj,
stream=True,
input=prompt,
timeout=timeout,
) as resp:
response = litellm.CustomStreamWrapper(
resp.iter_lines(),
model=model,
custom_llm_provider="watsonx",
logging_obj=logging_obj,
)
return response
try:
## Get the response from the model
req_params = self._prepare_text_generation_req(
model_id=model,
prompt=prompt,
stream=stream,
optional_params=optional_params,
print_verbose=print_verbose,
)
if stream:
return process_stream_request(req_params)
else:
return process_text_request(req_params)
except WatsonXAIError as e:
raise e
except Exception as e:
raise WatsonXAIError(status_code=500, message=str(e))
def embedding(
self,
model: str,
input: Union[list, str],
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
):
"""
Send a text embedding request to the IBM Watsonx.ai API.
"""
if optional_params is None:
optional_params = {}
# Load default configs
config = IBMWatsonXAIConfig.get_config()
for k, v in config.items():
if k not in optional_params:
optional_params[k] = v
# Load auth variables from environment variables
if isinstance(input, str):
input = [input]
if api_key is not None:
optional_params["api_key"] = api_key
api_params = self._get_api_params(optional_params)
# build auth headers
api_token = api_params.get("token")
headers = {
"Authorization": f"Bearer {api_token}",
"Content-Type": "application/json",
"Accept": "application/json",
}
# init the payload to the text generation call
payload = {
"inputs": input,
"model_id": model,
"project_id": api_params["project_id"],
"parameters": optional_params,
}
request_params = dict(version=api_params["api_version"])
url = api_params["url"].rstrip("/") + WatsonXAIEndpoint.EMBEDDINGS
# request = httpx.Request(
# "POST", url, headers=headers, json=payload, params=request_params
# )
req_params = {
"method": "POST",
"url": url,
"headers": headers,
"json": payload,
"params": request_params,
}
with self._manage_response(
req_params, logging_obj=logging_obj, input=input
) as resp:
json_resp = resp.json()
results = json_resp.get("results", [])
embedding_response = []
for idx, result in enumerate(results):
embedding_response.append(
{"object": "embedding", "index": idx, "embedding": result["embedding"]}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
input_tokens = json_resp.get("input_token_count", 0)
model_response.usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
return model_response
def generate_iam_token(self, api_key=None, **params):
headers = {}
headers["Content-Type"] = "application/x-www-form-urlencoded"
if api_key is None:
api_key = get_secret("WX_API_KEY") or get_secret("WATSONX_API_KEY")
if api_key is None:
raise ValueError("API key is required")
headers["Accept"] = "application/json"
data = {
"grant_type": "urn:ibm:params:oauth:grant-type:apikey",
"apikey": api_key,
}
response = httpx.post(
"https://iam.cloud.ibm.com/identity/token", data=data, headers=headers
)
response.raise_for_status()
json_data = response.json()
iam_access_token = json_data["access_token"]
self.token = iam_access_token
return iam_access_token
@contextmanager
def _manage_response(
self,
request_params: dict,
logging_obj: Any,
stream: bool = False,
input: Optional[Any] = None,
timeout: Optional[float] = None,
):
request_str = (
f"response = {request_params['method']}(\n"
f"\turl={request_params['url']},\n"
f"\tjson={request_params['json']},\n"
f")"
)
logging_obj.pre_call(
input=input,
api_key=request_params["headers"].get("Authorization"),
additional_args={
"complete_input_dict": request_params["json"],
"request_str": request_str,
},
)
if timeout:
request_params["timeout"] = timeout
try:
if stream:
resp = requests.request(
**request_params,
stream=True,
)
resp.raise_for_status()
yield resp
else:
resp = requests.request(**request_params)
resp.raise_for_status()
yield resp
except Exception as e:
raise WatsonXAIError(status_code=500, message=str(e))
if not stream:
logging_obj.post_call(
input=input,
api_key=request_params["headers"].get("Authorization"),
original_response=json.dumps(resp.json()),
additional_args={
"status_code": resp.status_code,
"complete_input_dict": request_params["json"],
},
)
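
For reference, a minimal usage sketch of the handler above as it is wired up through `litellm.completion`. `WATSONX_API_KEY` appears in `generate_iam_token` above; the other environment-variable names and the model id are assumptions for illustration, resolved by `_get_api_params`, which is not shown in this hunk.

```python
# Hedged usage sketch for the IBMWatsonXAI handler above.
# WATSONX_API_KEY is read by generate_iam_token; the other env var names and
# the model id are illustrative assumptions, not pinned down by this diff.
import os

import litellm

os.environ["WATSONX_API_KEY"] = "your-ibm-cloud-api-key"
os.environ["WATSONX_URL"] = "https://us-south.ml.cloud.ibm.com"  # assumed variable name
os.environ["WATSONX_PROJECT_ID"] = "your-project-id"             # assumed variable name

response = litellm.completion(
    model="watsonx/ibm/granite-13b-chat-v2",  # example watsonx model id
    messages=[{"role": "user", "content": "Say hello from watsonx"}],
)
print(response.choices[0].message.content)
```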

View file

@ -63,6 +63,7 @@ from .llms import (
vertex_ai, vertex_ai,
vertex_ai_anthropic, vertex_ai_anthropic,
maritalk, maritalk,
watsonx,
) )
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
from .llms.azure import AzureChatCompletion from .llms.azure import AzureChatCompletion
@ -360,7 +361,7 @@ def mock_completion(
model: str, model: str,
messages: List, messages: List,
stream: Optional[bool] = False, stream: Optional[bool] = False,
mock_response: str = "This is a mock request", mock_response: Union[str, Exception] = "This is a mock request",
logging=None, logging=None,
**kwargs, **kwargs,
): ):
@ -387,6 +388,20 @@ def mock_completion(
- If 'stream' is True, it returns a response that mimics the behavior of a streaming completion. - If 'stream' is True, it returns a response that mimics the behavior of a streaming completion.
""" """
try: try:
## LOGGING
if logging is not None:
logging.pre_call(
input=messages,
api_key="mock-key",
)
if isinstance(mock_response, Exception):
raise litellm.APIError(
status_code=500, # type: ignore
message=str(mock_response),
llm_provider="openai", # type: ignore
model=model, # type: ignore
request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
)
model_response = ModelResponse(stream=stream) model_response = ModelResponse(stream=stream)
if stream is True: if stream is True:
# don't try to access stream object, # don't try to access stream object,
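
The widened `mock_response: Union[str, Exception]` signature above lets a test force the mocked call to fail; a small sketch of that path:

```python
# Sketch of the new Exception branch in mock_completion: passing an Exception
# as mock_response makes the call raise litellm.APIError instead of returning
# a canned response, which is useful for exercising retry/fallback logic.
import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
        mock_response=Exception("simulated provider outage"),
    )
except litellm.APIError as e:
    print("caught mocked failure:", e)
```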
@ -1864,6 +1879,43 @@ def completion(
## RESPONSE OBJECT ## RESPONSE OBJECT
response = response response = response
elif custom_llm_provider == "watsonx":
custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
response = watsonx.IBMWatsonXAI().completion(
model=model,
messages=messages,
custom_prompt_dict=custom_prompt_dict,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params, # type: ignore
logger_fn=logger_fn,
encoding=encoding,
logging_obj=logging,
timeout=timeout,
)
if (
"stream" in optional_params
and optional_params["stream"] == True
and not isinstance(response, CustomStreamWrapper)
):
# don't try to access stream object,
response = CustomStreamWrapper(
iter(response),
model,
custom_llm_provider="watsonx",
logging_obj=logging,
)
if optional_params.get("stream", False):
## LOGGING
logging.post_call(
input=messages,
api_key=None,
original_response=response,
)
## RESPONSE OBJECT
response = response
elif custom_llm_provider == "vllm": elif custom_llm_provider == "vllm":
custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
model_response = vllm.completion( model_response = vllm.completion(
@ -2943,6 +2995,15 @@ def embedding(
client=client, client=client,
aembedding=aembedding, aembedding=aembedding,
) )
elif custom_llm_provider == "watsonx":
response = watsonx.IBMWatsonXAI().embedding(
model=model,
input=input,
encoding=encoding,
logging_obj=logging,
optional_params=optional_params,
model_response=EmbeddingResponse(),
)
else: else:
args = locals() args = locals()
raise ValueError(f"No valid embedding model args passed in - {args}") raise ValueError(f"No valid embedding model args passed in - {args}")

View file

@ -1418,6 +1418,123 @@
"litellm_provider": "replicate", "litellm_provider": "replicate",
"mode": "chat" "mode": "chat"
}, },
"replicate/meta/llama-2-13b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-13b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-instruct-v0.2": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.000001,
"litellm_provider": "replicate",
"mode": "chat"
},
"openrouter/openai/gpt-3.5-turbo": { "openrouter/openai/gpt-3.5-turbo": {
"max_tokens": 4095, "max_tokens": 4095,
"input_cost_per_token": 0.0000015, "input_cost_per_token": 0.0000015,
@ -1455,6 +1572,17 @@
"litellm_provider": "openrouter", "litellm_provider": "openrouter",
"mode": "chat" "mode": "chat"
}, },
"openrouter/anthropic/claude-3-opus": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"litellm_provider": "openrouter",
"mode": "chat",
"supports_function_calling": true,
"tool_use_system_prompt_tokens": 395
},
"openrouter/google/palm-2-chat-bison": { "openrouter/google/palm-2-chat-bison": {
"max_tokens": 8000, "max_tokens": 8000,
"input_cost_per_token": 0.0000005, "input_cost_per_token": 0.0000005,
@ -2379,6 +2507,24 @@
"litellm_provider": "bedrock", "litellm_provider": "bedrock",
"mode": "chat" "mode": "chat"
}, },
"meta.llama3-8b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000006,
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-70b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000265,
"output_cost_per_token": 0.0000035,
"litellm_provider": "bedrock",
"mode": "chat"
},
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
"max_tokens": 77, "max_tokens": 77,
"max_input_tokens": 77, "max_input_tokens": 77,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{93553:function(n,e,t){Promise.resolve().then(t.t.bind(t,63385,23)),Promise.resolve().then(t.t.bind(t,99646,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=93553)}),_N_E=n.O()}]);

View file

@ -1 +0,0 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]); (self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{70377:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(70377)}),_N_E=e.O()}]);

View file

@ -1 +0,0 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]);

View file

@ -1 +1 @@
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/60d9f441227ccc7e.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var 
n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}(); !function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/9f51f0573c6b0365.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var 
o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1,5 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-50c1dadc6557c101.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-50c1dadc6557c101.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/60d9f441227ccc7e.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[82332,[\"127\",\"static/chunks/127-efd0436630e294eb.js\",\"931\",\"static/chunks/app/page-525d83925fd5350b.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/60d9f441227ccc7e.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"Csz8BqWx6JEoKsgLqCeCt\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html> <<<<<<< HEAD
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-202e312607f242a1.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-202e312607f242a1.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/00c2ddbcd01819c0.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[46414,[\"761\",\"static/chunks/761-05f8a8451296476c.js\",\"931\",\"static/chunks/app/page-5a4a198eefedc775.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/00c2ddbcd01819c0.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"c5rha8cqAah-saaczjn02\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
=======
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-65a932b4e8bd8abb.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-096338c8e1915716.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-65a932b4e8bd8abb.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/9f51f0573c6b0365.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[46414,[\"386\",\"static/chunks/386-d811195b597a2122.js\",\"931\",\"static/chunks/app/page-e0ee34389254cdf2.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/9f51f0573c6b0365.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"dWGL92c5LzTMn7XX6utn2\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_12bbc4\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
>>>>>>> 73a7b4f4 (refactor(main.py): trigger new build)

View file

@ -1,7 +1,14 @@
2:I[77831,[],""] 2:I[77831,[],""]
3:I[82332,["127","static/chunks/127-efd0436630e294eb.js","931","static/chunks/app/page-525d83925fd5350b.js"],""] <<<<<<< HEAD
3:I[46414,["761","static/chunks/761-05f8a8451296476c.js","931","static/chunks/app/page-5a4a198eefedc775.js"],""]
4:I[5613,[],""] 4:I[5613,[],""]
5:I[31778,[],""] 5:I[31778,[],""]
0:["Csz8BqWx6JEoKsgLqCeCt",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/60d9f441227ccc7e.css","precedence":"next","crossOrigin":""}]],"$L6"]]]] 0:["c5rha8cqAah-saaczjn02",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 
0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00c2ddbcd01819c0.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
=======
3:I[46414,["386","static/chunks/386-d811195b597a2122.js","931","static/chunks/app/page-e0ee34389254cdf2.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["dWGL92c5LzTMn7XX6utn2",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/9f51f0573c6b0365.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
>>>>>>> 73a7b4f4 (refactor(main.py): trigger new build)
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]] 6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null 1:null

View file

@ -1,51 +1,15 @@
environment_variables:
SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
general_settings:
alerting:
- slack
alerting_threshold: 300
database_connection_pool_limit: 100
database_connection_timeout: 60
health_check_interval: 300
proxy_batch_write_at: 10
ui_access_mode: all
litellm_settings:
allowed_fails: 3
failure_callback:
- prometheus
fallbacks:
- gpt-3.5-turbo:
- fake-openai-endpoint
- gpt-4
num_retries: 3
service_callback:
- prometheus_system
success_callback:
- prometheus
model_list: model_list:
- litellm_params: - litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/ api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key api_key: my-fake-key
model: openai/my-fake-model model: openai/my-fake-model
model_name: fake-openai-endpoint model_name: fake-openai-endpoint
- litellm_params:
model: gpt-3.5-turbo
model_name: gpt-3.5-turbo
- model_name: llama-3
litellm_params:
model: replicate/meta/meta-llama-3-8b-instruct
router_settings: router_settings:
allowed_fails: 3 num_retries: 0
context_window_fallbacks: null enable_pre_call_checks: true
cooldown_time: 1 redis_host: os.environ/REDIS_HOST
fallbacks: redis_password: os.environ/REDIS_PASSWORD
- gpt-3.5-turbo: redis_port: os.environ/REDIS_PORT
- fake-openai-endpoint
- gpt-4 litellm_settings:
- gpt-3.5-turbo-3: success_callback: ["openmeter"]
- fake-openai-endpoint
num_retries: 3
retry_after: 0
routing_strategy: simple-shuffle
routing_strategy_args: {}
timeout: 6000

View file

@ -422,6 +422,9 @@ class LiteLLM_ModelTable(LiteLLMBase):
created_by: str created_by: str
updated_by: str updated_by: str
class Config:
protected_namespaces = ()
class NewUserRequest(GenerateKeyRequest): class NewUserRequest(GenerateKeyRequest):
max_budget: Optional[float] = None max_budget: Optional[float] = None
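
The repeated `class Config: protected_namespaces = ()` additions silence Pydantic's warning for field names that start with `model_` (e.g. `model_id`, `model_aliases`, `model_max_budget`). A standalone sketch of the behaviour, assuming Pydantic v2 semantics; it is not one of the proxy's actual models:

```python
# Standalone illustration (not a proxy model): without protected_namespaces = (),
# Pydantic v2 warns that "model_id" clashes with its reserved "model_" namespace.
from pydantic import BaseModel


class ExampleTable(BaseModel):
    model_id: str

    class Config:
        protected_namespaces = ()


print(ExampleTable(model_id="abc-123"))
```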
@ -485,6 +488,9 @@ class TeamBase(LiteLLMBase):
class NewTeamRequest(TeamBase): class NewTeamRequest(TeamBase):
model_aliases: Optional[dict] = None model_aliases: Optional[dict] = None
class Config:
protected_namespaces = ()
class GlobalEndUsersSpend(LiteLLMBase): class GlobalEndUsersSpend(LiteLLMBase):
api_key: Optional[str] = None api_key: Optional[str] = None
@ -534,6 +540,9 @@ class LiteLLM_TeamTable(TeamBase):
budget_reset_at: Optional[datetime] = None budget_reset_at: Optional[datetime] = None
model_id: Optional[int] = None model_id: Optional[int] = None
class Config:
protected_namespaces = ()
@root_validator(pre=True) @root_validator(pre=True)
def set_model_info(cls, values): def set_model_info(cls, values):
dict_fields = [ dict_fields = [
@ -570,6 +579,9 @@ class LiteLLM_BudgetTable(LiteLLMBase):
model_max_budget: Optional[dict] = None model_max_budget: Optional[dict] = None
budget_duration: Optional[str] = None budget_duration: Optional[str] = None
class Config:
protected_namespaces = ()
class NewOrganizationRequest(LiteLLM_BudgetTable): class NewOrganizationRequest(LiteLLM_BudgetTable):
organization_id: Optional[str] = None organization_id: Optional[str] = None
@ -720,6 +732,10 @@ class ConfigGeneralSettings(LiteLLMBase):
None, None,
description="List of alerting types. By default it is all alerts", description="List of alerting types. By default it is all alerts",
) )
alert_to_webhook_url: Optional[Dict] = Field(
None,
description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`",
)
alerting_threshold: Optional[int] = Field( alerting_threshold: Optional[int] = Field(
None, None,
@ -896,5 +912,19 @@ class LiteLLM_SpendLogs(LiteLLMBase):
request_tags: Optional[Json] = None request_tags: Optional[Json] = None
class LiteLLM_ErrorLogs(LiteLLMBase):
request_id: Optional[str] = str(uuid.uuid4())
api_base: Optional[str] = ""
model_group: Optional[str] = ""
litellm_model_name: Optional[str] = ""
model_id: Optional[str] = ""
request_kwargs: Optional[dict] = {}
exception_type: Optional[str] = ""
status_code: Optional[str] = ""
exception_string: Optional[str] = ""
startTime: Union[str, datetime, None]
endTime: Union[str, datetime, None]
class LiteLLM_SpendLogs_ResponseObject(LiteLLMBase): class LiteLLM_SpendLogs_ResponseObject(LiteLLMBase):
response: Optional[List[Union[LiteLLM_SpendLogs, Any]]] = None response: Optional[List[Union[LiteLLM_SpendLogs, Any]]] = None

View file

@ -95,7 +95,15 @@ def common_checks(
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}" f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
) )
# 7. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget # 7. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
if litellm.max_budget > 0 and global_proxy_spend is not None: if (
litellm.max_budget > 0
and global_proxy_spend is not None
# only run global budget checks for OpenAI routes
# Reason - the Admin UI should continue working if the proxy crosses its global budget
and route in LiteLLMRoutes.openai_routes.value
and route != "/v1/models"
and route != "/models"
):
if global_proxy_spend > litellm.max_budget: if global_proxy_spend > litellm.max_budget:
raise Exception( raise Exception(
f"ExceededBudget: LiteLLM Proxy has exceeded its budget. Current spend: {global_proxy_spend}; Max Budget: {litellm.max_budget}" f"ExceededBudget: LiteLLM Proxy has exceeded its budget. Current spend: {global_proxy_spend}; Max Budget: {litellm.max_budget}"

View file

@ -1059,8 +1059,18 @@ async def user_api_key_auth(
): ):
pass pass
else: else:
user_role = "unknown"
user_id = "unknown"
if user_id_information is not None and isinstance(
user_id_information, list
):
_user = user_id_information[0]
user_role = _user.get("user_role", {}).get(
"user_role", "unknown"
)
user_id = _user.get("user_id", "unknown")
raise Exception( raise Exception(
f"Only master key can be used to generate, delete, update info for new keys/users/teams. Route={route}" f"Only proxy admin can be used to generate, delete, update info for new keys/users/teams. Route={route}. Your role={user_role}. Your user_id={user_id}"
) )
# check if token is from litellm-ui, litellm ui makes keys to allow users to login with sso. These keys can only be used for LiteLLM UI functions # check if token is from litellm-ui, litellm ui makes keys to allow users to login with sso. These keys can only be used for LiteLLM UI functions
@ -1207,6 +1217,68 @@ def cost_tracking():
litellm.success_callback.append(_PROXY_track_cost_callback) # type: ignore litellm.success_callback.append(_PROXY_track_cost_callback) # type: ignore
async def _PROXY_failure_handler(
kwargs, # kwargs to completion
completion_response: litellm.ModelResponse, # response from completion
start_time=None,
end_time=None, # start/end time for completion
):
global prisma_client
if prisma_client is not None:
verbose_proxy_logger.debug(
"inside _PROXY_failure_handler kwargs=", extra=kwargs
)
_exception = kwargs.get("exception")
_exception_type = _exception.__class__.__name__
_model = kwargs.get("model", None)
_optional_params = kwargs.get("optional_params", {})
_optional_params = copy.deepcopy(_optional_params)
for k, v in _optional_params.items():
    # truncate long values so the stored error-log row stays small
    _optional_params[k] = str(v)[:100]
_status_code = "500"
try:
_status_code = str(_exception.status_code)
except:
# Don't let this fail logging the exception to the DB
pass
_litellm_params = kwargs.get("litellm_params", {}) or {}
_metadata = _litellm_params.get("metadata", {}) or {}
_model_id = _metadata.get("model_info", {}).get("id", "")
_model_group = _metadata.get("model_group", "")
api_base = litellm.get_api_base(model=_model, optional_params=_litellm_params)
_exception_string = str(_exception)[:500]
error_log = LiteLLM_ErrorLogs(
request_id=str(uuid.uuid4()),
model_group=_model_group,
model_id=_model_id,
litellm_model_name=kwargs.get("model"),
request_kwargs=_optional_params,
api_base=api_base,
exception_type=_exception_type,
status_code=_status_code,
exception_string=_exception_string,
startTime=kwargs.get("start_time"),
endTime=kwargs.get("end_time"),
)
# helper function to convert to dict on pydantic v2 & v1
error_log_dict = _get_pydantic_json_dict(error_log)
error_log_dict["request_kwargs"] = json.dumps(error_log_dict["request_kwargs"])
await prisma_client.db.litellm_errorlogs.create(
data=error_log_dict # type: ignore
)
pass
async def _PROXY_track_cost_callback( async def _PROXY_track_cost_callback(
kwargs, # kwargs to completion kwargs, # kwargs to completion
completion_response: litellm.ModelResponse, # response from completion completion_response: litellm.ModelResponse, # response from completion
@ -1292,6 +1364,15 @@ async def _PROXY_track_cost_callback(
verbose_proxy_logger.debug("error in tracking cost callback - %s", e) verbose_proxy_logger.debug("error in tracking cost callback - %s", e)
def error_tracking():
global prisma_client, custom_db_client
if prisma_client is not None or custom_db_client is not None:
if isinstance(litellm.failure_callback, list):
verbose_proxy_logger.debug("setting litellm failure callback to track cost")
if (_PROXY_failure_handler) not in litellm.failure_callback: # type: ignore
litellm.failure_callback.append(_PROXY_failure_handler) # type: ignore
def _set_spend_logs_payload( def _set_spend_logs_payload(
payload: dict, prisma_client: PrismaClient, spend_logs_url: Optional[str] = None payload: dict, prisma_client: PrismaClient, spend_logs_url: Optional[str] = None
): ):
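
For context on the handler above: anything appended to `litellm.failure_callback` is invoked with the call's kwargs and timing after an exception, which is how `_PROXY_failure_handler` receives `exception`, `litellm_params`, and the rest. A minimal sketch of that convention; the dispatch details inside litellm are not part of this diff.

```python
# Hedged sketch of the failure-callback signature _PROXY_failure_handler follows.
# Registering the handler is all that error_tracking() above does for the proxy.
import litellm


async def log_failed_call(kwargs, completion_response, start_time=None, end_time=None):
    exc = kwargs.get("exception")
    print(f"call to {kwargs.get('model')} failed: {exc}")


litellm.failure_callback.append(log_failed_call)
```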
@ -2612,9 +2693,10 @@ class ProxyConfig:
environment_variables = config_data.get("environment_variables", {}) environment_variables = config_data.get("environment_variables", {})
for k, v in environment_variables.items(): for k, v in environment_variables.items():
try: try:
decoded_b64 = base64.b64decode(v) if v is not None:
value = decrypt_value(value=decoded_b64, master_key=master_key) # type: ignore decoded_b64 = base64.b64decode(v)
os.environ[k] = value value = decrypt_value(value=decoded_b64, master_key=master_key) # type: ignore
os.environ[k] = value
except Exception as e: except Exception as e:
verbose_proxy_logger.error( verbose_proxy_logger.error(
"Error setting env variable: %s - %s", k, str(e) "Error setting env variable: %s - %s", k, str(e)
@ -2632,9 +2714,17 @@ class ProxyConfig:
if "alert_types" in _general_settings: if "alert_types" in _general_settings:
general_settings["alert_types"] = _general_settings["alert_types"] general_settings["alert_types"] = _general_settings["alert_types"]
proxy_logging_obj.alert_types = general_settings["alert_types"] proxy_logging_obj.alert_types = general_settings["alert_types"]
proxy_logging_obj.slack_alerting_instance.alert_types = general_settings[ proxy_logging_obj.slack_alerting_instance.update_values(
"alert_types" alert_types=general_settings["alert_types"]
)
if "alert_to_webhook_url" in _general_settings:
general_settings["alert_to_webhook_url"] = _general_settings[
"alert_to_webhook_url"
] ]
proxy_logging_obj.slack_alerting_instance.update_values(
alert_to_webhook_url=general_settings["alert_to_webhook_url"]
)
# router settings # router settings
if llm_router is not None and prisma_client is not None: if llm_router is not None and prisma_client is not None:
@ -3176,6 +3266,9 @@ async def startup_event():
## COST TRACKING ## ## COST TRACKING ##
cost_tracking() cost_tracking()
## Error Tracking ##
error_tracking()
db_writer_client = HTTPHandler() db_writer_client = HTTPHandler()
proxy_logging_obj._init_litellm_callbacks() # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made proxy_logging_obj._init_litellm_callbacks() # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made
@ -3655,6 +3748,17 @@ async def chat_completion(
if data["model"] in litellm.model_alias_map: if data["model"] in litellm.model_alias_map:
data["model"] = litellm.model_alias_map[data["model"]] data["model"] = litellm.model_alias_map[data["model"]]
## LOGGING OBJECT ## - initialize logging object for logging success/failure events for call
data["litellm_call_id"] = str(uuid.uuid4())
logging_obj, data = litellm.utils.function_setup(
original_function="acompletion",
rules_obj=litellm.utils.Rules(),
start_time=datetime.now(),
**data,
)
data["litellm_logging_obj"] = logging_obj
### CALL HOOKS ### - modify incoming data before calling the model ### CALL HOOKS ### - modify incoming data before calling the model
data = await proxy_logging_obj.pre_call_hook( data = await proxy_logging_obj.pre_call_hook(
user_api_key_dict=user_api_key_dict, data=data, call_type="completion" user_api_key_dict=user_api_key_dict, data=data, call_type="completion"
@ -7421,9 +7525,9 @@ async def model_info_v2(
) )
async def model_metrics( async def model_metrics(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
_selected_model_group: Optional[str] = None, _selected_model_group: Optional[str] = "gpt-4-32k",
startTime: Optional[datetime] = datetime.now() - timedelta(days=30), startTime: Optional[datetime] = None,
endTime: Optional[datetime] = datetime.now(), endTime: Optional[datetime] = None,
): ):
global prisma_client, llm_router global prisma_client, llm_router
if prisma_client is None: if prisma_client is None:
@ -7433,65 +7537,214 @@ async def model_metrics(
param="None", param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR, code=status.HTTP_500_INTERNAL_SERVER_ERROR,
) )
if _selected_model_group and llm_router is not None: startTime = startTime or datetime.now() - timedelta(days=30)
_model_list = llm_router.get_model_list() endTime = endTime or datetime.now()
_relevant_api_bases = []
for model in _model_list: sql_query = """
if model["model_name"] == _selected_model_group: SELECT
_litellm_params = model["litellm_params"] api_base,
_api_base = _litellm_params.get("api_base", "") model,
_relevant_api_bases.append(_api_base) DATE_TRUNC('day', "startTime")::DATE AS day,
_relevant_api_bases.append(_api_base + "/openai/") AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) / SUM(total_tokens) AS avg_latency_per_token
FROM
"LiteLLM_SpendLogs"
WHERE
"startTime" >= NOW() - INTERVAL '30 days'
AND "model" = $1 AND "cache_hit" != 'True'
GROUP BY
api_base,
model,
day
HAVING
SUM(total_tokens) > 0
ORDER BY
avg_latency_per_token DESC;
"""
_all_api_bases = set()
db_response = await prisma_client.db.query_raw(
sql_query, _selected_model_group, startTime, endTime
)
_daily_entries: dict = {} # {"Jun 23": {"model1": 0.002, "model2": 0.003}}
if db_response is not None:
for model_data in db_response:
_api_base = model_data["api_base"]
_model = model_data["model"]
_day = model_data["day"]
_avg_latency_per_token = model_data["avg_latency_per_token"]
if _day not in _daily_entries:
_daily_entries[_day] = {}
_combined_model_name = str(_model)
if "https://" in _api_base:
_combined_model_name = str(_api_base)
if "/openai/" in _combined_model_name:
_combined_model_name = _combined_model_name.split("/openai/")[0]
_all_api_bases.add(_combined_model_name)
_daily_entries[_day][_combined_model_name] = _avg_latency_per_token
sql_query = """
SELECT
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END AS combined_model_api_base,
COUNT(*) AS num_requests,
AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) AS avg_latency_seconds
FROM "LiteLLM_SpendLogs"
WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
AND api_base = ANY($3)
GROUP BY CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END
ORDER BY num_requests DESC
LIMIT 50;
""" """
each entry needs to be like this:
{
date: 'Jun 23',
'gpt-4-https://api.openai.com/v1/': 0.002,
'gpt-43-https://api.openai.com-12/v1/': 0.002,
}
"""
# convert daily entries to list of dicts
db_response = await prisma_client.db.query_raw( response: List[dict] = []
sql_query, startTime, endTime, _relevant_api_bases
# sort daily entries by date
_daily_entries = dict(sorted(_daily_entries.items(), key=lambda item: item[0]))
for day in _daily_entries:
entry = {"date": str(day)}
for model_key, latency in _daily_entries[day].items():
entry[model_key] = latency
response.append(entry)
return {
"data": response,
"all_api_bases": list(_all_api_bases),
}
@router.get(
"/model/metrics/slow_responses",
description="View number of hanging requests per model_group",
tags=["model management"],
include_in_schema=False,
dependencies=[Depends(user_api_key_auth)],
)
async def model_metrics_slow_responses(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
_selected_model_group: Optional[str] = "gpt-4-32k",
startTime: Optional[datetime] = None,
endTime: Optional[datetime] = None,
):
global prisma_client, llm_router, proxy_logging_obj
if prisma_client is None:
raise ProxyException(
message="Prisma Client is not initialized",
type="internal_error",
param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
) )
else: startTime = startTime or datetime.now() - timedelta(days=30)
endTime = endTime or datetime.now()
sql_query = """ alerting_threshold = (
SELECT proxy_logging_obj.slack_alerting_instance.alerting_threshold or 300
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END AS combined_model_api_base, )
COUNT(*) AS num_requests, alerting_threshold = int(alerting_threshold)
AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) AS avg_latency_seconds
FROM sql_query = """
"LiteLLM_SpendLogs" SELECT
api_base,
COUNT(*) AS total_count,
SUM(CASE
WHEN ("endTime" - "startTime") >= (INTERVAL '1 SECOND' * CAST($1 AS INTEGER)) THEN 1
ELSE 0
END) AS slow_count
FROM
"LiteLLM_SpendLogs"
WHERE
"model" = $2
AND "cache_hit" != 'True'
GROUP BY
api_base
ORDER BY
slow_count DESC;
"""
db_response = await prisma_client.db.query_raw(
sql_query, alerting_threshold, _selected_model_group
)
if db_response is not None:
for row in db_response:
_api_base = row.get("api_base") or ""
if "/openai/" in _api_base:
_api_base = _api_base.split("/openai/")[0]
row["api_base"] = _api_base
return db_response
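
The slow_responses query above does the slow/total split in SQL against the alerting threshold. The same classification can be sketched in plain Python, assuming request logs carry `startTime`/`endTime` datetimes and the threshold is in seconds; `count_slow_requests` is a hypothetical helper, not part of the proxy.

from datetime import datetime, timedelta
from typing import List

def count_slow_requests(logs: List[dict], alerting_threshold: int) -> dict:
    # group request logs by api_base and count those slower than the threshold (seconds)
    out: dict = {}
    for log in logs:
        api_base = (log.get("api_base") or "").split("/openai/")[0]
        bucket = out.setdefault(api_base, {"total_count": 0, "slow_count": 0})
        bucket["total_count"] += 1
        duration = (log["endTime"] - log["startTime"]).total_seconds()
        if duration >= alerting_threshold:
            bucket["slow_count"] += 1
    return out

now = datetime.now()
print(
    count_slow_requests(
        [
            {"api_base": "https://api.openai.com/v1", "startTime": now, "endTime": now + timedelta(seconds=400)},
            {"api_base": "https://api.openai.com/v1", "startTime": now, "endTime": now + timedelta(seconds=2)},
        ],
        alerting_threshold=300,
    )
)
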
@router.get(
"/model/metrics/exceptions",
description="View number of failed requests per model on config.yaml",
tags=["model management"],
include_in_schema=False,
dependencies=[Depends(user_api_key_auth)],
)
async def model_metrics_exceptions(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
_selected_model_group: Optional[str] = None,
startTime: Optional[datetime] = None,
endTime: Optional[datetime] = None,
):
global prisma_client, llm_router
if prisma_client is None:
raise ProxyException(
message="Prisma Client is not initialized",
type="internal_error",
param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
startTime = startTime or datetime.now() - timedelta(days=30)
endTime = endTime or datetime.now()
"""
"""
sql_query = """
WITH cte AS (
SELECT
CASE WHEN api_base = '' THEN litellm_model_name ELSE CONCAT(litellm_model_name, '-', api_base) END AS combined_model_api_base,
exception_type,
COUNT(*) AS num_exceptions
FROM "LiteLLM_ErrorLogs"
WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
GROUP BY GROUP BY combined_model_api_base, exception_type
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END )
ORDER BY SELECT
num_requests DESC combined_model_api_base,
LIMIT 50; COUNT(*) AS total_exceptions,
""" json_object_agg(exception_type, num_exceptions) AS exception_counts
FROM cte
db_response = await prisma_client.db.query_raw(sql_query, startTime, endTime) GROUP BY combined_model_api_base
ORDER BY total_exceptions DESC
LIMIT 200;
"""
db_response = await prisma_client.db.query_raw(sql_query, startTime, endTime)
response: List[dict] = [] response: List[dict] = []
if response is not None: exception_types = set()
"""
Return Data
{
"combined_model_api_base": "gpt-3.5-turbo-https://api.openai.com/v1/,
"total_exceptions": 5,
"BadRequestException": 5,
"TimeoutException": 2
}
"""
if db_response is not None:
# loop through all models # loop through all models
for model_data in db_response: for model_data in db_response:
model = model_data.get("combined_model_api_base", "") model = model_data.get("combined_model_api_base", "")
num_requests = model_data.get("num_requests", 0) total_exceptions = model_data.get("total_exceptions", 0)
avg_latency_seconds = model_data.get("avg_latency_seconds", 0) exception_counts = model_data.get("exception_counts", {})
response.append( curr_row = {
{ "model": model,
"model": model, "total_exceptions": total_exceptions,
"num_requests": num_requests, }
"avg_latency_seconds": avg_latency_seconds, curr_row.update(exception_counts)
} response.append(curr_row)
) for k, v in exception_counts.items():
return response exception_types.add(k)
return {"data": response, "exception_types": list(exception_types)}
@router.get( @router.get(
@ -8453,6 +8706,13 @@ async def update_config(config_info: ConfigYAML):
_existing_settings = config["general_settings"] _existing_settings = config["general_settings"]
for k, v in updated_general_settings.items(): for k, v in updated_general_settings.items():
# overwrite existing settings with updated values # overwrite existing settings with updated values
if k == "alert_to_webhook_url":
# check if slack is already enabled. if not, enable it
if "slack" not in _existing_settings:
if "alerting" not in _existing_settings:
_existing_settings["alerting"] = ["slack"]
elif isinstance(_existing_settings["alerting"], list):
_existing_settings["alerting"].append("slack")
_existing_settings[k] = v _existing_settings[k] = v
config["general_settings"] = _existing_settings config["general_settings"] = _existing_settings
@ -8567,7 +8827,25 @@ async def get_config():
""" """
for _callback in _success_callbacks: for _callback in _success_callbacks:
if _callback == "langfuse": if _callback == "openmeter":
env_vars = [
"OPENMETER_API_KEY",
]
env_vars_dict = {}
for _var in env_vars:
env_variable = environment_variables.get(_var, None)
if env_variable is None:
env_vars_dict[_var] = None
else:
# decode + decrypt the value
decoded_b64 = base64.b64decode(env_variable)
_decrypted_value = decrypt_value(
value=decoded_b64, master_key=master_key
)
env_vars_dict[_var] = _decrypted_value
_data_to_return.append({"name": _callback, "variables": env_vars_dict})
elif _callback == "langfuse":
_langfuse_vars = [ _langfuse_vars = [
"LANGFUSE_PUBLIC_KEY", "LANGFUSE_PUBLIC_KEY",
"LANGFUSE_SECRET_KEY", "LANGFUSE_SECRET_KEY",
@ -8592,6 +8870,7 @@ async def get_config():
# Check if slack alerting is on # Check if slack alerting is on
_alerting = _general_settings.get("alerting", []) _alerting = _general_settings.get("alerting", [])
alerting_data = []
if "slack" in _alerting: if "slack" in _alerting:
_slack_vars = [ _slack_vars = [
"SLACK_WEBHOOK_URL", "SLACK_WEBHOOK_URL",
@ -8600,7 +8879,8 @@ async def get_config():
for _var in _slack_vars: for _var in _slack_vars:
env_variable = environment_variables.get(_var, None) env_variable = environment_variables.get(_var, None)
if env_variable is None: if env_variable is None:
_slack_env_vars[_var] = None _value = os.getenv("SLACK_WEBHOOK_URL", None)
_slack_env_vars[_var] = _value
else: else:
# decode + decrypt the value # decode + decrypt the value
decoded_b64 = base64.b64decode(env_variable) decoded_b64 = base64.b64decode(env_variable)
@ -8613,19 +8893,23 @@ async def get_config():
_all_alert_types = ( _all_alert_types = (
proxy_logging_obj.slack_alerting_instance._all_possible_alert_types() proxy_logging_obj.slack_alerting_instance._all_possible_alert_types()
) )
_data_to_return.append( _alerts_to_webhook = (
proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url
)
alerting_data.append(
{ {
"name": "slack", "name": "slack",
"variables": _slack_env_vars, "variables": _slack_env_vars,
"alerting_types": _alerting_types, "active_alerts": _alerting_types,
"all_alert_types": _all_alert_types, "alerts_to_webhook": _alerts_to_webhook,
} }
) )
_router_settings = llm_router.get_settings() _router_settings = llm_router.get_settings()
return { return {
"status": "success", "status": "success",
"data": _data_to_return, "callbacks": _data_to_return,
"alerts": alerting_data,
"router_settings": _router_settings, "router_settings": _router_settings,
} }
except Exception as e: except Exception as e:
@ -8701,9 +8985,9 @@ async def test_endpoint(request: Request):
) )
async def health_services_endpoint( async def health_services_endpoint(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
service: Literal["slack_budget_alerts", "langfuse", "slack"] = fastapi.Query( service: Literal[
description="Specify the service being hit." "slack_budget_alerts", "langfuse", "slack", "openmeter"
), ] = fastapi.Query(description="Specify the service being hit."),
): ):
""" """
Hidden endpoint. Hidden endpoint.
@ -8717,7 +9001,7 @@ async def health_services_endpoint(
raise HTTPException( raise HTTPException(
status_code=400, detail={"error": "Service must be specified."} status_code=400, detail={"error": "Service must be specified."}
) )
if service not in ["slack_budget_alerts", "langfuse", "slack"]: if service not in ["slack_budget_alerts", "langfuse", "slack", "openmeter"]:
raise HTTPException( raise HTTPException(
status_code=400, status_code=400,
detail={ detail={
@ -8725,6 +9009,18 @@ async def health_services_endpoint(
}, },
) )
if service == "openmeter":
_ = await litellm.acompletion(
model="openai/litellm-mock-response-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
user="litellm:/health/services",
mock_response="This is a mock response",
)
return {
"status": "success",
"message": "Mock LLM request made - check openmeter.",
}
if service == "langfuse": if service == "langfuse":
from litellm.integrations.langfuse import LangFuseLogger from litellm.integrations.langfuse import LangFuseLogger
@ -8741,27 +9037,73 @@ async def health_services_endpoint(
"message": "Mock LLM request made - check langfuse.", "message": "Mock LLM request made - check langfuse.",
} }
if "slack" in general_settings.get("alerting", []): if service == "slack" or service == "slack_budget_alerts":
test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` litellm-ui-test-alert \n`Expected Day of Error`: 28th March \n`Current Spend`: $100.00 \n`Projected Spend at end of month`: $1000.00 \n`Soft Limit`: $700""" if "slack" in general_settings.get("alerting", []):
await proxy_logging_obj.alerting_handler(message=test_message, level="Low") # test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` litellm-ui-test-alert \n`Expected Day of Error`: 28th March \n`Current Spend`: $100.00 \n`Projected Spend at end of month`: $1000.00 \n`Soft Limit`: $700"""
return { # check if user has opted into unique_alert_webhooks
"status": "success", if (
"message": "Mock Slack Alert sent, verify Slack Alert Received on your channel", proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url
} is not None
else: ):
raise HTTPException( for (
status_code=422, alert_type
detail={ ) in proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url:
"error": '"slack" not in proxy config: general_settings. Unable to test this.' """
}, "llm_exceptions",
) "llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
"""
# only test alert if it's in active alert types
if (
proxy_logging_obj.slack_alerting_instance.alert_types
is not None
and alert_type
not in proxy_logging_obj.slack_alerting_instance.alert_types
):
continue
test_message = "default test message"
if alert_type == "llm_exceptions":
test_message = f"LLM Exception test alert"
elif alert_type == "llm_too_slow":
test_message = f"LLM Too Slow test alert"
elif alert_type == "llm_requests_hanging":
test_message = f"LLM Requests Hanging test alert"
elif alert_type == "budget_alerts":
test_message = f"Budget Alert test alert"
elif alert_type == "db_exceptions":
test_message = f"DB Exception test alert"
await proxy_logging_obj.alerting_handler(
message=test_message, level="Low", alert_type=alert_type
)
else:
await proxy_logging_obj.alerting_handler(
message="This is a test slack alert message",
level="Low",
alert_type="budget_alerts",
)
return {
"status": "success",
"message": "Mock Slack Alert sent, verify Slack Alert Received on your channel",
}
else:
raise HTTPException(
status_code=422,
detail={
"error": '"{}" not in proxy config: general_settings. Unable to test this.'.format(
service
)
},
)
except Exception as e: except Exception as e:
if isinstance(e, HTTPException): if isinstance(e, HTTPException):
raise ProxyException( raise ProxyException(
message=getattr(e, "detail", f"Authentication Error({str(e)})"), message=getattr(e, "detail", f"Authentication Error({str(e)})"),
type="auth_error", type="auth_error",
param=getattr(e, "param", "None"), param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_401_UNAUTHORIZED), code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
) )
elif isinstance(e, ProxyException): elif isinstance(e, ProxyException):
raise e raise e
@ -8769,7 +9111,7 @@ async def health_services_endpoint(
message="Authentication Error, " + str(e), message="Authentication Error, " + str(e),
type="auth_error", type="auth_error",
param=getattr(e, "param", "None"), param=getattr(e, "param", "None"),
code=status.HTTP_401_UNAUTHORIZED, code=status.HTTP_500_INTERNAL_SERVER_ERROR,
) )

View file

@ -183,6 +183,21 @@ model LiteLLM_SpendLogs {
end_user String? end_user String?
} }
// View spend, model, api_key per request
model LiteLLM_ErrorLogs {
request_id String @id @default(uuid())
startTime DateTime // Assuming start_time is a DateTime field
endTime DateTime // Assuming end_time is a DateTime field
api_base String @default("")
model_group String @default("") // public model_name / model_group
litellm_model_name String @default("") // model passed to litellm
model_id String @default("") // ID of model in ProxyModelTable
request_kwargs Json @default("{}")
exception_type String @default("")
exception_string String @default("")
status_code String @default("")
}
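
As a rough guide to what the new error-tracking path stores, here is an illustrative Python mirror of one `LiteLLM_ErrorLogs` row. The `ErrorLogRow` dataclass is only a sketch of the column shapes and defaults from the schema above; the proxy itself writes these rows through Prisma, not through this class.

from dataclasses import dataclass, field
from datetime import datetime
import json
import uuid

@dataclass
class ErrorLogRow:
    # mirrors the LiteLLM_ErrorLogs model defined above
    startTime: datetime
    endTime: datetime
    request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    api_base: str = ""
    model_group: str = ""          # public model_name / model_group
    litellm_model_name: str = ""   # model passed to litellm
    model_id: str = ""             # ID of model in ProxyModelTable
    request_kwargs: str = "{}"     # stored as Json in the DB
    exception_type: str = ""
    exception_string: str = ""
    status_code: str = ""

row = ErrorLogRow(
    startTime=datetime.now(),
    endTime=datetime.now(),
    model_group="gpt-3.5-turbo",
    exception_type="RateLimitError",
    status_code="429",
    request_kwargs=json.dumps({"messages": [{"role": "user", "content": "hi"}]}),
)
print(row.request_id, row.exception_type)
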
// Beta - allow team members to request access to a model // Beta - allow team members to request access to a model
model LiteLLM_UserNotifications { model LiteLLM_UserNotifications {
request_id String @id request_id String @id

View file

@ -1,6 +1,6 @@
from typing import Optional, List, Any, Literal, Union from typing import Optional, List, Any, Literal, Union
import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx, time import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx, time
import litellm, backoff import litellm, backoff, traceback
from litellm.proxy._types import ( from litellm.proxy._types import (
UserAPIKeyAuth, UserAPIKeyAuth,
DynamoDBArgs, DynamoDBArgs,
@ -199,6 +199,33 @@ class ProxyLogging:
print_verbose(f"final data being sent to {call_type} call: {data}") print_verbose(f"final data being sent to {call_type} call: {data}")
return data return data
except Exception as e: except Exception as e:
if "litellm_logging_obj" in data:
logging_obj: litellm.utils.Logging = data["litellm_logging_obj"]
## ASYNC FAILURE HANDLER ##
error_message = ""
if isinstance(e, HTTPException):
if isinstance(e.detail, str):
error_message = e.detail
elif isinstance(e.detail, dict):
error_message = json.dumps(e.detail)
else:
error_message = str(e)
else:
error_message = str(e)
error_raised = Exception(f"{error_message}")
await logging_obj.async_failure_handler(
exception=error_raised,
traceback_exception=traceback.format_exc(),
)
## SYNC FAILURE HANDLER ##
try:
logging_obj.failure_handler(
error_raised, traceback.format_exc()
) # DO NOT MAKE THREADED - router retry fallback relies on this!
except Exception as error_val:
pass
raise e raise e
async def during_call_hook( async def during_call_hook(
@ -256,7 +283,16 @@ class ProxyLogging:
) )
async def alerting_handler( async def alerting_handler(
self, message: str, level: Literal["Low", "Medium", "High"] self,
message: str,
level: Literal["Low", "Medium", "High"],
alert_type: Literal[
"llm_exceptions",
"llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
],
): ):
""" """
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298 Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
@ -289,7 +325,7 @@ class ProxyLogging:
for client in self.alerting: for client in self.alerting:
if client == "slack": if client == "slack":
await self.slack_alerting_instance.send_alert( await self.slack_alerting_instance.send_alert(
message=message, level=level message=message, level=level, alert_type=alert_type
) )
elif client == "sentry": elif client == "sentry":
if litellm.utils.sentry_sdk_instance is not None: if litellm.utils.sentry_sdk_instance is not None:
@ -323,6 +359,7 @@ class ProxyLogging:
self.alerting_handler( self.alerting_handler(
message=f"DB read/write call failed: {error_message}", message=f"DB read/write call failed: {error_message}",
level="High", level="High",
alert_type="db_exceptions",
) )
) )
@ -354,7 +391,9 @@ class ProxyLogging:
return return
asyncio.create_task( asyncio.create_task(
self.alerting_handler( self.alerting_handler(
message=f"LLM API call failed: {str(original_exception)}", level="High" message=f"LLM API call failed: {str(original_exception)}",
level="High",
alert_type="llm_exceptions",
) )
) )
@ -1738,7 +1777,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
usage = response_obj["usage"] usage = response_obj["usage"]
if type(usage) == litellm.Usage: if type(usage) == litellm.Usage:
usage = dict(usage) usage = dict(usage)
id = response_obj.get("id", str(uuid.uuid4())) id = response_obj.get("id", kwargs.get("litellm_call_id"))
api_key = metadata.get("user_api_key", "") api_key = metadata.get("user_api_key", "")
if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"): if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
# hash the api_key # hash the api_key
@ -2010,6 +2049,11 @@ async def update_spend(
raise e raise e
### UPDATE KEY TABLE ### ### UPDATE KEY TABLE ###
verbose_proxy_logger.debug(
"KEY Spend transactions: {}".format(
len(prisma_client.key_list_transactons.keys())
)
)
if len(prisma_client.key_list_transactons.keys()) > 0: if len(prisma_client.key_list_transactons.keys()) > 0:
for i in range(n_retry_times + 1): for i in range(n_retry_times + 1):
start_time = time.time() start_time = time.time()

View file

@ -50,7 +50,6 @@ class Router:
model_names: List = [] model_names: List = []
cache_responses: Optional[bool] = False cache_responses: Optional[bool] = False
default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour
num_retries: int = 0
tenacity = None tenacity = None
leastbusy_logger: Optional[LeastBusyLoggingHandler] = None leastbusy_logger: Optional[LeastBusyLoggingHandler] = None
lowesttpm_logger: Optional[LowestTPMLoggingHandler] = None lowesttpm_logger: Optional[LowestTPMLoggingHandler] = None
@ -70,9 +69,11 @@ class Router:
] = None, # if you want to cache across model groups ] = None, # if you want to cache across model groups
client_ttl: int = 3600, # ttl for cached clients - will re-initialize after this time in seconds client_ttl: int = 3600, # ttl for cached clients - will re-initialize after this time in seconds
## RELIABILITY ## ## RELIABILITY ##
num_retries: int = 0, num_retries: Optional[int] = None,
timeout: Optional[float] = None, timeout: Optional[float] = None,
default_litellm_params={}, # default params for Router.chat.completion.create default_litellm_params: Optional[
dict
] = None, # default params for Router.chat.completion.create
default_max_parallel_requests: Optional[int] = None, default_max_parallel_requests: Optional[int] = None,
set_verbose: bool = False, set_verbose: bool = False,
debug_level: Literal["DEBUG", "INFO"] = "INFO", debug_level: Literal["DEBUG", "INFO"] = "INFO",
@ -158,6 +159,7 @@ class Router:
router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}]) router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}])
``` ```
""" """
if semaphore: if semaphore:
self.semaphore = semaphore self.semaphore = semaphore
self.set_verbose = set_verbose self.set_verbose = set_verbose
@ -229,7 +231,14 @@ class Router:
self.failed_calls = ( self.failed_calls = (
InMemoryCache() InMemoryCache()
) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown ) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
self.num_retries = num_retries or litellm.num_retries or 0
if num_retries is not None:
self.num_retries = num_retries
elif litellm.num_retries is not None:
self.num_retries = litellm.num_retries
else:
self.num_retries = openai.DEFAULT_MAX_RETRIES
self.timeout = timeout or litellm.request_timeout self.timeout = timeout or litellm.request_timeout
self.retry_after = retry_after self.retry_after = retry_after
@ -255,6 +264,7 @@ class Router:
) # dict to store aliases for router, ex. {"gpt-4": "gpt-3.5-turbo"}, all requests with gpt-4 -> get routed to gpt-3.5-turbo group ) # dict to store aliases for router, ex. {"gpt-4": "gpt-3.5-turbo"}, all requests with gpt-4 -> get routed to gpt-3.5-turbo group
# make Router.chat.completions.create compatible for openai.chat.completions.create # make Router.chat.completions.create compatible for openai.chat.completions.create
default_litellm_params = default_litellm_params or {}
self.chat = litellm.Chat(params=default_litellm_params, router_obj=self) self.chat = litellm.Chat(params=default_litellm_params, router_obj=self)
# default litellm args # default litellm args
@ -280,6 +290,21 @@ class Router:
} }
""" """
### ROUTING SETUP ### ### ROUTING SETUP ###
self.routing_strategy_init(
routing_strategy=routing_strategy,
routing_strategy_args=routing_strategy_args,
)
## COOLDOWNS ##
if isinstance(litellm.failure_callback, list):
litellm.failure_callback.append(self.deployment_callback_on_failure)
else:
litellm.failure_callback = [self.deployment_callback_on_failure]
print( # noqa
f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
) # noqa
self.routing_strategy_args = routing_strategy_args
def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict):
if routing_strategy == "least-busy": if routing_strategy == "least-busy":
self.leastbusy_logger = LeastBusyLoggingHandler( self.leastbusy_logger = LeastBusyLoggingHandler(
router_cache=self.cache, model_list=self.model_list router_cache=self.cache, model_list=self.model_list
@ -311,15 +336,6 @@ class Router:
) )
if isinstance(litellm.callbacks, list): if isinstance(litellm.callbacks, list):
litellm.callbacks.append(self.lowestlatency_logger) # type: ignore litellm.callbacks.append(self.lowestlatency_logger) # type: ignore
## COOLDOWNS ##
if isinstance(litellm.failure_callback, list):
litellm.failure_callback.append(self.deployment_callback_on_failure)
else:
litellm.failure_callback = [self.deployment_callback_on_failure]
verbose_router_logger.info(
f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
)
self.routing_strategy_args = routing_strategy_args
def print_deployment(self, deployment: dict): def print_deployment(self, deployment: dict):
""" """
@ -428,6 +444,7 @@ class Router:
kwargs["messages"] = messages kwargs["messages"] = messages
kwargs["original_function"] = self._acompletion kwargs["original_function"] = self._acompletion
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries) kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
timeout = kwargs.get("request_timeout", self.timeout) timeout = kwargs.get("request_timeout", self.timeout)
kwargs.setdefault("metadata", {}).update({"model_group": model}) kwargs.setdefault("metadata", {}).update({"model_group": model})
@ -469,6 +486,7 @@ class Router:
) )
kwargs["model_info"] = deployment.get("model_info", {}) kwargs["model_info"] = deployment.get("model_info", {})
data = deployment["litellm_params"].copy() data = deployment["litellm_params"].copy()
model_name = data["model"] model_name = data["model"]
for k, v in self.default_litellm_params.items(): for k, v in self.default_litellm_params.items():
if ( if (
@ -1415,10 +1433,12 @@ class Router:
context_window_fallbacks = kwargs.pop( context_window_fallbacks = kwargs.pop(
"context_window_fallbacks", self.context_window_fallbacks "context_window_fallbacks", self.context_window_fallbacks
) )
verbose_router_logger.debug(
f"async function w/ retries: original_function - {original_function}"
)
num_retries = kwargs.pop("num_retries") num_retries = kwargs.pop("num_retries")
verbose_router_logger.debug(
f"async function w/ retries: original_function - {original_function}, num_retries - {num_retries}"
)
try: try:
# if the function call is successful, no exception will be raised and we'll break out of the loop # if the function call is successful, no exception will be raised and we'll break out of the loop
response = await original_function(*args, **kwargs) response = await original_function(*args, **kwargs)
@ -1436,37 +1456,47 @@ class Router:
raise original_exception raise original_exception
### RETRY ### RETRY
#### check if it should retry + back-off if required #### check if it should retry + back-off if required
if "No models available" in str(e): # if "No models available" in str(
timeout = litellm._calculate_retry_after( # e
remaining_retries=num_retries, # ) or RouterErrors.no_deployments_available.value in str(e):
max_retries=num_retries, # timeout = litellm._calculate_retry_after(
min_timeout=self.retry_after, # remaining_retries=num_retries,
) # max_retries=num_retries,
await asyncio.sleep(timeout) # min_timeout=self.retry_after,
elif RouterErrors.user_defined_ratelimit_error.value in str(e): # )
raise e # don't wait to retry if deployment hits user-defined rate-limit # await asyncio.sleep(timeout)
elif hasattr(original_exception, "status_code") and litellm._should_retry( # elif RouterErrors.user_defined_ratelimit_error.value in str(e):
status_code=original_exception.status_code # raise e # don't wait to retry if deployment hits user-defined rate-limit
):
if hasattr(original_exception, "response") and hasattr(
original_exception.response, "headers"
):
timeout = litellm._calculate_retry_after(
remaining_retries=num_retries,
max_retries=num_retries,
response_headers=original_exception.response.headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=num_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
await asyncio.sleep(timeout)
else:
raise original_exception
# elif hasattr(original_exception, "status_code") and litellm._should_retry(
# status_code=original_exception.status_code
# ):
# if hasattr(original_exception, "response") and hasattr(
# original_exception.response, "headers"
# ):
# timeout = litellm._calculate_retry_after(
# remaining_retries=num_retries,
# max_retries=num_retries,
# response_headers=original_exception.response.headers,
# min_timeout=self.retry_after,
# )
# else:
# timeout = litellm._calculate_retry_after(
# remaining_retries=num_retries,
# max_retries=num_retries,
# min_timeout=self.retry_after,
# )
# await asyncio.sleep(timeout)
# else:
# raise original_exception
### RETRY
_timeout = self._router_should_retry(
e=original_exception,
remaining_retries=num_retries,
num_retries=num_retries,
)
await asyncio.sleep(_timeout)
## LOGGING ## LOGGING
if num_retries > 0: if num_retries > 0:
kwargs = self.log_retry(kwargs=kwargs, e=original_exception) kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
@ -1488,34 +1518,12 @@ class Router:
## LOGGING ## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=e) kwargs = self.log_retry(kwargs=kwargs, e=e)
remaining_retries = num_retries - current_attempt remaining_retries = num_retries - current_attempt
if "No models available" in str(e): _timeout = self._router_should_retry(
timeout = litellm._calculate_retry_after( e=original_exception,
remaining_retries=remaining_retries, remaining_retries=remaining_retries,
max_retries=num_retries, num_retries=num_retries,
min_timeout=self.retry_after, )
) await asyncio.sleep(_timeout)
await asyncio.sleep(timeout)
elif (
hasattr(e, "status_code")
and hasattr(e, "response")
and litellm._should_retry(status_code=e.status_code)
):
if hasattr(e.response, "headers"):
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
response_headers=e.response.headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
await asyncio.sleep(timeout)
else:
raise e
raise original_exception raise original_exception
def function_with_fallbacks(self, *args, **kwargs): def function_with_fallbacks(self, *args, **kwargs):
@ -1606,6 +1614,27 @@ class Router:
raise e raise e
raise original_exception raise original_exception
def _router_should_retry(
self, e: Exception, remaining_retries: int, num_retries: int
) -> Union[int, float]:
"""
Calculate back-off, then retry
"""
if hasattr(e, "response") and hasattr(e.response, "headers"):
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
response_headers=e.response.headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
return timeout
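
`_router_should_retry` defers the actual math to `litellm._calculate_retry_after`. A simplified stand-in that captures the general shape (honor a numeric `Retry-After` header when present, otherwise exponential back-off with jitter, never below `min_timeout`) might look like the sketch below; it is not the library's implementation.

import random
from typing import Mapping, Optional, Union

def calculate_retry_after(
    remaining_retries: int,
    max_retries: int,
    response_headers: Optional[Mapping[str, str]] = None,
    min_timeout: Union[int, float] = 0,
) -> float:
    # prefer a numeric Retry-After header if the provider sent one
    if response_headers and response_headers.get("retry-after", "").replace(".", "", 1).isdigit():
        return max(float(response_headers["retry-after"]), min_timeout)
    attempt = max_retries - remaining_retries   # 0 on the first retry
    backoff = min(0.5 * (2 ** attempt), 8.0)    # cap the exponential growth
    jitter = 1 - 0.25 * random.random()         # avoid thundering herds
    return max(backoff * jitter, min_timeout)

print(calculate_retry_after(remaining_retries=2, max_retries=3, min_timeout=0.5))
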
def function_with_retries(self, *args, **kwargs): def function_with_retries(self, *args, **kwargs):
""" """
Try calling the model 3 times. Shuffle between available deployments. Try calling the model 3 times. Shuffle between available deployments.
@ -1619,15 +1648,13 @@ class Router:
context_window_fallbacks = kwargs.pop( context_window_fallbacks = kwargs.pop(
"context_window_fallbacks", self.context_window_fallbacks "context_window_fallbacks", self.context_window_fallbacks
) )
try: try:
# if the function call is successful, no exception will be raised and we'll break out of the loop # if the function call is successful, no exception will be raised and we'll break out of the loop
response = original_function(*args, **kwargs) response = original_function(*args, **kwargs)
return response return response
except Exception as e: except Exception as e:
original_exception = e original_exception = e
verbose_router_logger.debug(
f"num retries in function with retries: {num_retries}"
)
### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR ### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR
if ( if (
isinstance(original_exception, litellm.ContextWindowExceededError) isinstance(original_exception, litellm.ContextWindowExceededError)
@ -1641,6 +1668,12 @@ class Router:
if num_retries > 0: if num_retries > 0:
kwargs = self.log_retry(kwargs=kwargs, e=original_exception) kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
### RETRY ### RETRY
_timeout = self._router_should_retry(
e=original_exception,
remaining_retries=num_retries,
num_retries=num_retries,
)
time.sleep(_timeout)
for current_attempt in range(num_retries): for current_attempt in range(num_retries):
verbose_router_logger.debug( verbose_router_logger.debug(
f"retrying request. Current attempt - {current_attempt}; retries left: {num_retries}" f"retrying request. Current attempt - {current_attempt}; retries left: {num_retries}"
@ -1654,34 +1687,12 @@ class Router:
## LOGGING ## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=e) kwargs = self.log_retry(kwargs=kwargs, e=e)
remaining_retries = num_retries - current_attempt remaining_retries = num_retries - current_attempt
if "No models available" in str(e): _timeout = self._router_should_retry(
timeout = litellm._calculate_retry_after( e=e,
remaining_retries=remaining_retries, remaining_retries=remaining_retries,
max_retries=num_retries, num_retries=num_retries,
min_timeout=self.retry_after, )
) time.sleep(_timeout)
time.sleep(timeout)
elif (
hasattr(e, "status_code")
and hasattr(e, "response")
and litellm._should_retry(status_code=e.status_code)
):
if hasattr(e.response, "headers"):
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
response_headers=e.response.headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
time.sleep(timeout)
else:
raise e
raise original_exception raise original_exception
### HELPER FUNCTIONS ### HELPER FUNCTIONS
@ -1715,10 +1726,11 @@ class Router:
) # i.e. azure ) # i.e. azure
metadata = kwargs.get("litellm_params", {}).get("metadata", None) metadata = kwargs.get("litellm_params", {}).get("metadata", None)
_model_info = kwargs.get("litellm_params", {}).get("model_info", {}) _model_info = kwargs.get("litellm_params", {}).get("model_info", {})
if isinstance(_model_info, dict): if isinstance(_model_info, dict):
deployment_id = _model_info.get("id", None) deployment_id = _model_info.get("id", None)
self._set_cooldown_deployments( self._set_cooldown_deployments(
deployment_id exception_status=exception_status, deployment=deployment_id
) # setting deployment_id in cooldown deployments ) # setting deployment_id in cooldown deployments
if custom_llm_provider: if custom_llm_provider:
model_name = f"{custom_llm_provider}/{model_name}" model_name = f"{custom_llm_provider}/{model_name}"
@ -1778,9 +1790,15 @@ class Router:
key=rpm_key, value=request_count, local_only=True key=rpm_key, value=request_count, local_only=True
) # don't change existing ttl ) # don't change existing ttl
def _set_cooldown_deployments(self, deployment: Optional[str] = None): def _set_cooldown_deployments(
self, exception_status: Union[str, int], deployment: Optional[str] = None
):
""" """
Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
or
the exception is not one that should be immediately retried (e.g. 401)
""" """
if deployment is None: if deployment is None:
return return
@ -1797,7 +1815,20 @@ class Router:
f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}" f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
) )
cooldown_time = self.cooldown_time or 1 cooldown_time = self.cooldown_time or 1
if updated_fails > self.allowed_fails:
if isinstance(exception_status, str):
try:
exception_status = int(exception_status)
except Exception as e:
verbose_router_logger.debug(
"Unable to cast exception status to int {}. Defaulting to status=500.".format(
exception_status
)
)
exception_status = 500
_should_retry = litellm._should_retry(status_code=exception_status)
if updated_fails > self.allowed_fails or _should_retry == False:
# get the current cooldown list for that minute # get the current cooldown list for that minute
cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
cached_value = self.cache.get_cache(key=cooldown_key) cached_value = self.cache.get_cache(key=cooldown_key)
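
The cooldown change above means a deployment is parked either after too many fails in the current window or immediately on a non-retryable status. A rough, self-contained approximation of that decision follows; the retryable-status set here (408/409/429/5xx) is the usual convention and an assumption, not litellm's exact `_should_retry`.

from typing import Union

def should_cooldown_deployment(
    exception_status: Union[str, int],
    updated_fails: int,
    allowed_fails: int,
) -> bool:
    # cool down when fails exceed the allowance, or when the error is not worth retrying
    try:
        status = int(exception_status)
    except (TypeError, ValueError):
        status = 500  # default to a retryable server error, as the router does
    retryable = status in (408, 409, 429) or status >= 500
    return updated_fails > allowed_fails or not retryable

print(should_cooldown_deployment("401", updated_fails=0, allowed_fails=3))  # True - auth errors cool down immediately
print(should_cooldown_deployment(429, updated_fails=1, allowed_fails=3))    # False - rate limits get retried first
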
@ -1929,6 +1960,7 @@ class Router:
) )
default_api_base = api_base default_api_base = api_base
default_api_key = api_key default_api_key = api_key
if ( if (
model_name in litellm.open_ai_chat_completion_models model_name in litellm.open_ai_chat_completion_models
or custom_llm_provider in litellm.openai_compatible_providers or custom_llm_provider in litellm.openai_compatible_providers
@ -1940,8 +1972,10 @@ class Router:
or "ft:gpt-3.5-turbo" in model_name or "ft:gpt-3.5-turbo" in model_name
or model_name in litellm.open_ai_embedding_models or model_name in litellm.open_ai_embedding_models
): ):
is_azure_ai_studio_model: bool = False
if custom_llm_provider == "azure": if custom_llm_provider == "azure":
if litellm.utils._is_non_openai_azure_model(model_name): if litellm.utils._is_non_openai_azure_model(model_name):
is_azure_ai_studio_model = True
custom_llm_provider = "openai" custom_llm_provider = "openai"
# remove azure prefx from model_name # remove azure prefx from model_name
model_name = model_name.replace("azure/", "") model_name = model_name.replace("azure/", "")
@ -1964,6 +1998,25 @@ class Router:
api_base = litellm.get_secret(api_base_env_name) api_base = litellm.get_secret(api_base_env_name)
litellm_params["api_base"] = api_base litellm_params["api_base"] = api_base
## AZURE AI STUDIO MISTRAL CHECK ##
"""
Make sure api base ends in /v1/
if not, add it - https://github.com/BerriAI/litellm/issues/2279
"""
if (
is_azure_ai_studio_model == True
and api_base is not None
and not api_base.endswith("/v1/")
):
# check if it ends with a trailing slash
if api_base.endswith("/"):
api_base += "v1/"
elif api_base.endswith("/v1"):
api_base += "/"
else:
api_base += "/v1/"
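
The Azure AI Studio check above just normalizes the api_base suffix so it always ends in `/v1/`. Pulled out as a standalone sketch (`ensure_v1_suffix` and the sample endpoint URLs are hypothetical):

def ensure_v1_suffix(api_base: str) -> str:
    # make sure an Azure AI Studio api_base ends in /v1/ - https://github.com/BerriAI/litellm/issues/2279
    if api_base.endswith("/v1/"):
        return api_base
    if api_base.endswith("/v1"):
        return api_base + "/"
    if api_base.endswith("/"):
        return api_base + "v1/"
    return api_base + "/v1/"

for base in (
    "https://my-mistral.westus2.inference.ai.azure.com",
    "https://my-mistral.westus2.inference.ai.azure.com/",
    "https://my-mistral.westus2.inference.ai.azure.com/v1",
):
    print(ensure_v1_suffix(base))
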
api_version = litellm_params.get("api_version") api_version = litellm_params.get("api_version")
if api_version and api_version.startswith("os.environ/"): if api_version and api_version.startswith("os.environ/"):
api_version_env_name = api_version.replace("os.environ/", "") api_version_env_name = api_version.replace("os.environ/", "")
@ -1986,7 +2039,9 @@ class Router:
stream_timeout = litellm.get_secret(stream_timeout_env_name) stream_timeout = litellm.get_secret(stream_timeout_env_name)
litellm_params["stream_timeout"] = stream_timeout litellm_params["stream_timeout"] = stream_timeout
max_retries = litellm_params.pop("max_retries", 2) max_retries = litellm_params.pop(
"max_retries", 0
) # router handles retry logic
if isinstance(max_retries, str) and max_retries.startswith("os.environ/"): if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
max_retries_env_name = max_retries.replace("os.environ/", "") max_retries_env_name = max_retries.replace("os.environ/", "")
max_retries = litellm.get_secret(max_retries_env_name) max_retries = litellm.get_secret(max_retries_env_name)
@ -2052,9 +2107,11 @@ class Router:
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2074,9 +2131,11 @@ class Router:
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2096,9 +2155,11 @@ class Router:
timeout=stream_timeout, timeout=stream_timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2118,9 +2179,11 @@ class Router:
timeout=stream_timeout, timeout=stream_timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2158,9 +2221,11 @@ class Router:
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2178,9 +2243,11 @@ class Router:
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( verify=litellm.ssl_verify,
max_connections=1000, max_keepalive_connections=100 limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2199,9 +2266,11 @@ class Router:
timeout=stream_timeout, timeout=stream_timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), ),
@ -2219,9 +2288,11 @@ class Router:
timeout=stream_timeout, timeout=stream_timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), ),
@ -2249,9 +2320,11 @@ class Router:
max_retries=max_retries, max_retries=max_retries,
organization=organization, organization=organization,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2271,9 +2344,11 @@ class Router:
max_retries=max_retries, max_retries=max_retries,
organization=organization, organization=organization,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2294,9 +2369,11 @@ class Router:
max_retries=max_retries, max_retries=max_retries,
organization=organization, organization=organization,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2317,9 +2394,11 @@ class Router:
max_retries=max_retries, max_retries=max_retries,
organization=organization, organization=organization,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), # type: ignore ), # type: ignore
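
All of the client-construction changes in this hunk follow the same pattern: the connection limits and the new `litellm.ssl_verify` flag move onto the transport instead of the client. A minimal sketch using stock httpx transports in place of litellm's custom ones, with the flag hard-coded for illustration:

import httpx

ssl_verify = True  # litellm.ssl_verify would normally drive this

transport = httpx.HTTPTransport(
    verify=ssl_verify,
    limits=httpx.Limits(max_connections=1000, max_keepalive_connections=100),
)
client = httpx.Client(transport=transport)
# the OpenAI / AzureOpenAI clients cached by the router are then constructed with http_client=client
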
@ -2550,6 +2629,11 @@ class Router:
for var in vars_to_include: for var in vars_to_include:
if var in _all_vars: if var in _all_vars:
_settings_to_return[var] = _all_vars[var] _settings_to_return[var] = _all_vars[var]
if (
var == "routing_strategy_args"
and self.routing_strategy == "latency-based-routing"
):
_settings_to_return[var] = self.lowestlatency_logger.routing_args.json()
return _settings_to_return return _settings_to_return
def update_settings(self, **kwargs): def update_settings(self, **kwargs):
@ -2581,6 +2665,13 @@ class Router:
_casted_value = int(kwargs[var]) _casted_value = int(kwargs[var])
setattr(self, var, _casted_value) setattr(self, var, _casted_value)
else: else:
if var == "routing_strategy":
self.routing_strategy_init(
routing_strategy=kwargs[var],
routing_strategy_args=kwargs.get(
"routing_strategy_args", {}
),
)
setattr(self, var, kwargs[var]) setattr(self, var, kwargs[var])
else: else:
verbose_router_logger.debug("Setting {} is not allowed".format(var)) verbose_router_logger.debug("Setting {} is not allowed".format(var))
@ -2717,7 +2808,10 @@ class Router:
self.cache.get_cache(key=model_id, local_only=True) or 0 self.cache.get_cache(key=model_id, local_only=True) or 0
) )
### get usage based cache ### ### get usage based cache ###
if isinstance(model_group_cache, dict): if (
isinstance(model_group_cache, dict)
and self.routing_strategy != "usage-based-routing-v2"
):
model_group_cache[model_id] = model_group_cache.get(model_id, 0) model_group_cache[model_id] = model_group_cache.get(model_id, 0)
current_request = max( current_request = max(
@ -2745,7 +2839,7 @@ class Router:
if _rate_limit_error == True: # allow generic fallback logic to take place if _rate_limit_error == True: # allow generic fallback logic to take place
raise ValueError( raise ValueError(
f"No deployments available for selected model, passed model={model}" f"{RouterErrors.no_deployments_available.value}, passed model={model}"
) )
elif _context_window_error == True: elif _context_window_error == True:
raise litellm.ContextWindowExceededError( raise litellm.ContextWindowExceededError(
@ -2883,6 +2977,11 @@ class Router:
model=model, healthy_deployments=healthy_deployments, messages=messages model=model, healthy_deployments=healthy_deployments, messages=messages
) )
if len(healthy_deployments) == 0:
raise ValueError(
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
)
if ( if (
self.routing_strategy == "usage-based-routing-v2" self.routing_strategy == "usage-based-routing-v2"
and self.lowesttpm_logger_v2 is not None and self.lowesttpm_logger_v2 is not None
@ -2938,7 +3037,7 @@ class Router:
f"get_available_deployment for model: {model}, No deployment available" f"get_available_deployment for model: {model}, No deployment available"
) )
raise ValueError( raise ValueError(
f"No deployments available for selected model, passed model={model}" f"{RouterErrors.no_deployments_available.value}, passed model={model}"
) )
verbose_router_logger.info( verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}" f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
@ -3068,7 +3167,7 @@ class Router:
f"get_available_deployment for model: {model}, No deployment available" f"get_available_deployment for model: {model}, No deployment available"
) )
raise ValueError( raise ValueError(
f"No deployments available for selected model, passed model={model}" f"{RouterErrors.no_deployments_available.value}, passed model={model}"
) )
verbose_router_logger.info( verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}" f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"

View file

@ -4,6 +4,7 @@ from pydantic import BaseModel, Extra, Field, root_validator
import dotenv, os, requests, random import dotenv, os, requests, random
from typing import Optional, Union, List, Dict from typing import Optional, Union, List, Dict
from datetime import datetime, timedelta from datetime import datetime, timedelta
import random
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv
import traceback import traceback
@ -29,6 +30,7 @@ class LiteLLMBase(BaseModel):
class RoutingArgs(LiteLLMBase): class RoutingArgs(LiteLLMBase):
ttl: int = 1 * 60 * 60 # 1 hour ttl: int = 1 * 60 * 60 # 1 hour
lowest_latency_buffer: float = 0
class LowestLatencyLoggingHandler(CustomLogger): class LowestLatencyLoggingHandler(CustomLogger):
@ -312,6 +314,14 @@ class LowestLatencyLoggingHandler(CustomLogger):
except: except:
input_tokens = 0 input_tokens = 0
            # randomly sample from all_deployments, in case all deployments have latency=0.0
_items = all_deployments.items()
all_deployments = random.sample(list(_items), len(_items))
all_deployments = dict(all_deployments)
### GET AVAILABLE DEPLOYMENTS ### filter out any deployments > tpm/rpm limits
potential_deployments = []
for item, item_map in all_deployments.items(): for item, item_map in all_deployments.items():
## get the item from model list ## get the item from model list
_deployment = None _deployment = None
@ -345,23 +355,48 @@ class LowestLatencyLoggingHandler(CustomLogger):
if isinstance(_call_latency, float): if isinstance(_call_latency, float):
total += _call_latency total += _call_latency
item_latency = total / len(item_latency) item_latency = total / len(item_latency)
if item_latency == 0:
deployment = _deployment # -------------- #
break # Debugging Logic
elif ( # -------------- #
# We use _latency_per_deployment to log to langfuse, slack - this is not used to make a decision on routing
                    # this helps a user to debug why the router picked a specific deployment      #
_deployment_api_base = _deployment.get("litellm_params", {}).get(
"api_base", ""
)
if _deployment_api_base is not None:
_latency_per_deployment[_deployment_api_base] = item_latency
# -------------- #
# End of Debugging Logic
# -------------- #
if (
item_tpm + input_tokens > _deployment_tpm item_tpm + input_tokens > _deployment_tpm
or item_rpm + 1 > _deployment_rpm or item_rpm + 1 > _deployment_rpm
): # if user passed in tpm / rpm in the model_list ): # if user passed in tpm / rpm in the model_list
continue continue
elif item_latency < lowest_latency: else:
lowest_latency = item_latency potential_deployments.append((_deployment, item_latency))
deployment = _deployment
if len(potential_deployments) == 0:
return None
# Sort potential deployments by latency
sorted_deployments = sorted(potential_deployments, key=lambda x: x[1])
# Find lowest latency deployment
lowest_latency = sorted_deployments[0][1]
# Find deployments within buffer of lowest latency
buffer = self.routing_args.lowest_latency_buffer * lowest_latency
valid_deployments = [
x for x in sorted_deployments if x[1] <= lowest_latency + buffer
]
# Pick a random deployment from valid deployments
random_valid_deployment = random.choice(valid_deployments)
deployment = random_valid_deployment[0]
        # _latency_per_deployment is used for debugging
_deployment_api_base = _deployment.get("litellm_params", {}).get(
"api_base", ""
)
_latency_per_deployment[_deployment_api_base] = item_latency
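
The lowest-latency strategy no longer hard-picks the single fastest deployment: anything within `lowest_latency_buffer` of the best latency is a candidate, and one is chosen at random. A toy version of that selection rule, with made-up deployment names and latencies:

import random
from typing import Dict, List, Tuple

def pick_deployment(latencies: Dict[str, float], lowest_latency_buffer: float = 0.0) -> str:
    # anything within (1 + buffer) * lowest latency is fair game, chosen at random
    ranked: List[Tuple[str, float]] = sorted(latencies.items(), key=lambda x: x[1])
    lowest = ranked[0][1]
    cutoff = lowest + lowest_latency_buffer * lowest
    candidates = [name for name, latency in ranked if latency <= cutoff]
    return random.choice(candidates)

latencies = {"azure/gpt-4-eastus": 0.41, "azure/gpt-4-westus": 0.44, "openai/gpt-4": 0.90}
print(pick_deployment(latencies, lowest_latency_buffer=0.1))  # eastus or westus, never the 0.90s one
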
if request_kwargs is not None and "metadata" in request_kwargs: if request_kwargs is not None and "metadata" in request_kwargs:
request_kwargs["metadata"][ request_kwargs["metadata"][
"_latency_per_deployment" "_latency_per_deployment"

View file

@ -206,7 +206,7 @@ class LowestTPMLoggingHandler(CustomLogger):
if item_tpm + input_tokens > _deployment_tpm: if item_tpm + input_tokens > _deployment_tpm:
continue continue
elif (rpm_dict is not None and item in rpm_dict) and ( elif (rpm_dict is not None and item in rpm_dict) and (
rpm_dict[item] + 1 > _deployment_rpm rpm_dict[item] + 1 >= _deployment_rpm
): ):
continue continue
elif item_tpm < lowest_tpm: elif item_tpm < lowest_tpm:

View file

@ -333,7 +333,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
tpm_dict[tpm_key] = 0 tpm_dict[tpm_key] = 0
all_deployments = tpm_dict all_deployments = tpm_dict
deployment = None potential_deployments = [] # if multiple deployments have the same low value
for item, item_tpm in all_deployments.items(): for item, item_tpm in all_deployments.items():
## get the item from model list ## get the item from model list
_deployment = None _deployment = None
@ -343,6 +343,8 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
_deployment = m _deployment = m
if _deployment is None: if _deployment is None:
continue # skip to next one continue # skip to next one
elif item_tpm is None:
continue # skip if unhealthy deployment
_deployment_tpm = None _deployment_tpm = None
if _deployment_tpm is None: if _deployment_tpm is None:
@ -366,14 +368,20 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
if item_tpm + input_tokens > _deployment_tpm: if item_tpm + input_tokens > _deployment_tpm:
continue continue
elif (rpm_dict is not None and item in rpm_dict) and ( elif (rpm_dict is not None and item in rpm_dict) and (
rpm_dict[item] + 1 > _deployment_rpm rpm_dict[item] + 1 >= _deployment_rpm
): ):
continue continue
elif item_tpm == lowest_tpm:
potential_deployments.append(_deployment)
elif item_tpm < lowest_tpm: elif item_tpm < lowest_tpm:
lowest_tpm = item_tpm lowest_tpm = item_tpm
deployment = _deployment potential_deployments = [_deployment]
print_verbose("returning picked lowest tpm/rpm deployment.") print_verbose("returning picked lowest tpm/rpm deployment.")
return deployment
if len(potential_deployments) > 0:
return random.choice(potential_deployments)
else:
return None
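
The v2 TPM strategy now keeps every deployment tied at the lowest usage and picks among them at random, skipping unhealthy (`None`) entries and anything that would exceed its TPM limit with the incoming tokens. A self-contained sketch of that selection rule; names and numbers are illustrative.

import random
from typing import Dict, List, Optional

def pick_lowest_tpm(
    usage: Dict[str, Optional[int]],
    tpm_limits: Dict[str, float],
    input_tokens: int,
) -> Optional[str]:
    # lowest current TPM wins; ties are broken randomly; unhealthy / over-limit deployments are skipped
    lowest = float("inf")
    candidates: List[str] = []
    for deployment, tpm in usage.items():
        if tpm is None:  # unhealthy deployment
            continue
        if tpm + input_tokens > tpm_limits.get(deployment, float("inf")):
            continue
        if tpm == lowest:
            candidates.append(deployment)
        elif tpm < lowest:
            lowest = tpm
            candidates = [deployment]
    return random.choice(candidates) if candidates else None

usage = {"azure-1": 1200, "azure-2": 1200, "azure-3": None}
print(pick_lowest_tpm(usage, tpm_limits={"azure-1": 100000, "azure-2": 100000}, input_tokens=50))
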
async def async_get_available_deployments( async def async_get_available_deployments(
self, self,
@ -394,6 +402,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
tpm_keys = []
rpm_keys = []
for m in healthy_deployments:
@ -416,7 +425,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
tpm_values = combined_tpm_rpm_values[: len(tpm_keys)]
rpm_values = combined_tpm_rpm_values[len(tpm_keys) :]
- return self._common_checks_available_deployment(
+ deployment = self._common_checks_available_deployment(
model_group=model_group,
healthy_deployments=healthy_deployments,
tpm_keys=tpm_keys,
@ -427,6 +436,61 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
input=input,
)
try:
assert deployment is not None
return deployment
except Exception as e:
### GET THE DICT OF TPM / RPM + LIMITS PER DEPLOYMENT ###
deployment_dict = {}
for index, _deployment in enumerate(healthy_deployments):
if isinstance(_deployment, dict):
id = _deployment.get("model_info", {}).get("id")
### GET DEPLOYMENT TPM LIMIT ###
_deployment_tpm = None
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("tpm", None)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("litellm_params", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("model_info", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = float("inf")
### GET CURRENT TPM ###
current_tpm = tpm_values[index]
### GET DEPLOYMENT TPM LIMIT ###
_deployment_rpm = None
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("rpm", None)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("litellm_params", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("model_info", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = float("inf")
### GET CURRENT RPM ###
current_rpm = rpm_values[index]
deployment_dict[id] = {
"current_tpm": current_tpm,
"tpm_limit": _deployment_tpm,
"current_rpm": current_rpm,
"rpm_limit": _deployment_rpm,
}
raise ValueError(
f"{RouterErrors.no_deployments_available.value}. Passed model={model_group}. Deployments={deployment_dict}"
)
def get_available_deployments( def get_available_deployments(
self, self,
model_group: str, model_group: str,
@ -464,7 +528,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
keys=rpm_keys
) # [1, 2, None, ..]
- return self._common_checks_available_deployment(
+ deployment = self._common_checks_available_deployment(
model_group=model_group,
healthy_deployments=healthy_deployments,
tpm_keys=tpm_keys,
@ -474,3 +538,58 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
messages=messages,
input=input,
)
try:
assert deployment is not None
return deployment
except Exception as e:
### GET THE DICT OF TPM / RPM + LIMITS PER DEPLOYMENT ###
deployment_dict = {}
for index, _deployment in enumerate(healthy_deployments):
if isinstance(_deployment, dict):
id = _deployment.get("model_info", {}).get("id")
### GET DEPLOYMENT TPM LIMIT ###
_deployment_tpm = None
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("tpm", None)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("litellm_params", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("model_info", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = float("inf")
### GET CURRENT TPM ###
current_tpm = tpm_values[index]
### GET DEPLOYMENT TPM LIMIT ###
_deployment_rpm = None
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("rpm", None)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("litellm_params", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("model_info", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = float("inf")
### GET CURRENT RPM ###
current_rpm = rpm_values[index]
deployment_dict[id] = {
"current_tpm": current_tpm,
"tpm_limit": _deployment_tpm,
"current_rpm": current_rpm,
"rpm_limit": _deployment_rpm,
}
raise ValueError(
f"{RouterErrors.no_deployments_available.value}. Passed model={model_group}. Deployments={deployment_dict}"
)


@ -19,6 +19,7 @@ def setup_and_teardown():
0, os.path.abspath("../..")
) # Adds the project directory to the system path
import litellm
from litellm import Router
importlib.reload(litellm)
import asyncio


@ -119,7 +119,9 @@ def test_multiple_deployments_parallel():
# test_multiple_deployments_parallel() # test_multiple_deployments_parallel()
def test_cooldown_same_model_name(): @pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_cooldown_same_model_name(sync_mode):
# users could have the same model with different api_base # users could have the same model with different api_base
# example # example
# azure/chatgpt, api_base: 1234 # azure/chatgpt, api_base: 1234
@ -161,22 +163,40 @@ def test_cooldown_same_model_name():
num_retries=3, num_retries=3,
) # type: ignore ) # type: ignore
response = router.completion( if sync_mode:
model="gpt-3.5-turbo", response = router.completion(
messages=[{"role": "user", "content": "hello this request will pass"}], model="gpt-3.5-turbo",
) messages=[{"role": "user", "content": "hello this request will pass"}],
print(router.model_list) )
model_ids = [] print(router.model_list)
for model in router.model_list: model_ids = []
model_ids.append(model["model_info"]["id"]) for model in router.model_list:
print("\n litellm model ids ", model_ids) model_ids.append(model["model_info"]["id"])
print("\n litellm model ids ", model_ids)
# example litellm_model_names ['azure/chatgpt-v-2-ModelID-64321', 'azure/chatgpt-v-2-ModelID-63960'] # example litellm_model_names ['azure/chatgpt-v-2-ModelID-64321', 'azure/chatgpt-v-2-ModelID-63960']
assert ( assert (
model_ids[0] != model_ids[1] model_ids[0] != model_ids[1]
) # ensure both models have a uuid added, and they have different names ) # ensure both models have a uuid added, and they have different names
print("\ngot response\n", response) print("\ngot response\n", response)
else:
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "hello this request will pass"}],
)
print(router.model_list)
model_ids = []
for model in router.model_list:
model_ids.append(model["model_info"]["id"])
print("\n litellm model ids ", model_ids)
# example litellm_model_names ['azure/chatgpt-v-2-ModelID-64321', 'azure/chatgpt-v-2-ModelID-63960']
assert (
model_ids[0] != model_ids[1]
) # ensure both models have a uuid added, and they have different names
print("\ngot response\n", response)
except Exception as e: except Exception as e:
pytest.fail(f"Got unexpected exception on router! - {e}") pytest.fail(f"Got unexpected exception on router! - {e}")


@ -161,40 +161,56 @@ async def make_async_calls():
return total_time return total_time
# def test_langfuse_logging_async_text_completion(): @pytest.mark.asyncio
# try: @pytest.mark.parametrize("stream", [False, True])
# pre_langfuse_setup() async def test_langfuse_logging_without_request_response(stream):
# litellm.set_verbose = False try:
# litellm.success_callback = ["langfuse"] import uuid
# async def _test_langfuse(): _unique_trace_name = f"litellm-test-{str(uuid.uuid4())}"
# response = await litellm.atext_completion( litellm.set_verbose = True
# model="gpt-3.5-turbo-instruct", litellm.turn_off_message_logging = True
# prompt="this is a test", litellm.success_callback = ["langfuse"]
# max_tokens=5, response = await litellm.acompletion(
# temperature=0.7, model="gpt-3.5-turbo",
# timeout=5, mock_response="It's simple to use and easy to get started",
# user="test_user", messages=[{"role": "user", "content": "Hi 👋 - i'm claude"}],
# stream=True max_tokens=10,
# ) temperature=0.2,
# async for chunk in response: stream=stream,
# print() metadata={"trace_id": _unique_trace_name},
# print(chunk) )
# await asyncio.sleep(1) print(response)
# return response if stream:
async for chunk in response:
print(chunk)
# response = asyncio.run(_test_langfuse()) await asyncio.sleep(3)
# print(f"response: {response}")
# # # check langfuse.log to see if there was a failed response import langfuse
# search_logs("langfuse.log")
# except litellm.Timeout as e:
# pass
# except Exception as e:
# pytest.fail(f"An exception occurred - {e}")
langfuse_client = langfuse.Langfuse(
public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
secret_key=os.environ["LANGFUSE_SECRET_KEY"],
)
# test_langfuse_logging_async_text_completion() # get trace with _unique_trace_name
trace = langfuse_client.get_generations(trace_id=_unique_trace_name)
print("trace_from_langfuse", trace)
_trace_data = trace.data
assert _trace_data[0].input == {"messages": "redacted-by-litellm"}
assert _trace_data[0].output == {
"role": "assistant",
"content": "redacted-by-litellm",
"function_call": None,
"tool_calls": None,
}
except Exception as e:
pytest.fail(f"An exception occurred - {e}")
@pytest.mark.skip(reason="beta test - checking langfuse output") @pytest.mark.skip(reason="beta test - checking langfuse output")
@ -334,6 +350,220 @@ def test_langfuse_logging_function_calling():
# test_langfuse_logging_function_calling()
def test_langfuse_existing_trace_id():
"""
When existing trace id is passed, don't set trace params -> prevents overwriting the trace
Pass 1 logging object with a trace
Pass 2nd logging object with the trace id
Assert no changes to the trace
"""
# Test - if the logs were sent to the correct team on langfuse
import litellm, datetime
from litellm.integrations.langfuse import LangFuseLogger
langfuse_Logger = LangFuseLogger(
langfuse_public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
langfuse_secret=os.getenv("LANGFUSE_PROJECT2_SECRET"),
)
litellm.success_callback = ["langfuse"]
# langfuse_args = {'kwargs': { 'start_time': 'end_time': datetime.datetime(2024, 5, 1, 7, 31, 29, 903685), 'user_id': None, 'print_verbose': <function print_verbose at 0x109d1f420>, 'level': 'DEFAULT', 'status_message': None}
response_obj = litellm.ModelResponse(
id="chatcmpl-9K5HUAbVRqFrMZKXL0WoC295xhguY",
choices=[
litellm.Choices(
finish_reason="stop",
index=0,
message=litellm.Message(
content="I'm sorry, I am an AI assistant and do not have real-time information. I recommend checking a reliable weather website or app for the most up-to-date weather information in Boston.",
role="assistant",
),
)
],
created=1714573888,
model="gpt-3.5-turbo-0125",
object="chat.completion",
system_fingerprint="fp_3b956da36b",
usage=litellm.Usage(completion_tokens=37, prompt_tokens=14, total_tokens=51),
)
### NEW TRACE ###
message = [{"role": "user", "content": "what's the weather in boston"}]
langfuse_args = {
"response_obj": response_obj,
"kwargs": {
"model": "gpt-3.5-turbo",
"litellm_params": {
"acompletion": False,
"api_key": None,
"force_timeout": 600,
"logger_fn": None,
"verbose": False,
"custom_llm_provider": "openai",
"api_base": "https://api.openai.com/v1/",
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
"model_alias_map": {},
"completion_call_id": None,
"metadata": None,
"model_info": None,
"proxy_server_request": None,
"preset_cache_key": None,
"no-log": False,
"stream_response": {},
},
"messages": message,
"optional_params": {"temperature": 0.1, "extra_body": {}},
"start_time": "2024-05-01 07:31:27.986164",
"stream": False,
"user": None,
"call_type": "completion",
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
"completion_start_time": "2024-05-01 07:31:29.903685",
"temperature": 0.1,
"extra_body": {},
"input": [{"role": "user", "content": "what's the weather in boston"}],
"api_key": "my-api-key",
"additional_args": {
"complete_input_dict": {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "what's the weather in boston"}
],
"temperature": 0.1,
"extra_body": {},
}
},
"log_event_type": "successful_api_call",
"end_time": "2024-05-01 07:31:29.903685",
"cache_hit": None,
"response_cost": 6.25e-05,
},
"start_time": datetime.datetime(2024, 5, 1, 7, 31, 27, 986164),
"end_time": datetime.datetime(2024, 5, 1, 7, 31, 29, 903685),
"user_id": None,
"print_verbose": litellm.print_verbose,
"level": "DEFAULT",
"status_message": None,
}
langfuse_response_object = langfuse_Logger.log_event(**langfuse_args)
import langfuse
langfuse_client = langfuse.Langfuse(
public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
secret_key=os.getenv("LANGFUSE_PROJECT2_SECRET"),
)
trace_id = langfuse_response_object["trace_id"]
langfuse_client.flush()
time.sleep(2)
print(langfuse_client.get_trace(id=trace_id))
initial_langfuse_trace = langfuse_client.get_trace(id=trace_id)
### EXISTING TRACE ###
new_metadata = {"existing_trace_id": trace_id}
new_messages = [{"role": "user", "content": "What do you know?"}]
new_response_obj = litellm.ModelResponse(
id="chatcmpl-9K5HUAbVRqFrMZKXL0WoC295xhguY",
choices=[
litellm.Choices(
finish_reason="stop",
index=0,
message=litellm.Message(
content="What do I know?",
role="assistant",
),
)
],
created=1714573888,
model="gpt-3.5-turbo-0125",
object="chat.completion",
system_fingerprint="fp_3b956da36b",
usage=litellm.Usage(completion_tokens=37, prompt_tokens=14, total_tokens=51),
)
langfuse_args = {
"response_obj": new_response_obj,
"kwargs": {
"model": "gpt-3.5-turbo",
"litellm_params": {
"acompletion": False,
"api_key": None,
"force_timeout": 600,
"logger_fn": None,
"verbose": False,
"custom_llm_provider": "openai",
"api_base": "https://api.openai.com/v1/",
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
"model_alias_map": {},
"completion_call_id": None,
"metadata": new_metadata,
"model_info": None,
"proxy_server_request": None,
"preset_cache_key": None,
"no-log": False,
"stream_response": {},
},
"messages": new_messages,
"optional_params": {"temperature": 0.1, "extra_body": {}},
"start_time": "2024-05-01 07:31:27.986164",
"stream": False,
"user": None,
"call_type": "completion",
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
"completion_start_time": "2024-05-01 07:31:29.903685",
"temperature": 0.1,
"extra_body": {},
"input": [{"role": "user", "content": "what's the weather in boston"}],
"api_key": "my-api-key",
"additional_args": {
"complete_input_dict": {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "what's the weather in boston"}
],
"temperature": 0.1,
"extra_body": {},
}
},
"log_event_type": "successful_api_call",
"end_time": "2024-05-01 07:31:29.903685",
"cache_hit": None,
"response_cost": 6.25e-05,
},
"start_time": datetime.datetime(2024, 5, 1, 7, 31, 27, 986164),
"end_time": datetime.datetime(2024, 5, 1, 7, 31, 29, 903685),
"user_id": None,
"print_verbose": litellm.print_verbose,
"level": "DEFAULT",
"status_message": None,
}
langfuse_response_object = langfuse_Logger.log_event(**langfuse_args)
new_trace_id = langfuse_response_object["trace_id"]
assert new_trace_id == trace_id
langfuse_client.flush()
time.sleep(2)
print(langfuse_client.get_trace(id=trace_id))
new_langfuse_trace = langfuse_client.get_trace(id=trace_id)
assert dict(initial_langfuse_trace) == dict(new_langfuse_trace)
def test_langfuse_logging_tool_calling():
litellm.set_verbose = True


@ -68,6 +68,7 @@ async def test_get_api_base():
await _pl.alerting_handler(
message=slow_message + request_info,
level="Low",
alert_type="llm_too_slow",
)
print("passed test_get_api_base")


@ -394,6 +394,8 @@ async def test_async_vertexai_response():
pass
except litellm.Timeout as e:
pass
except litellm.APIError as e:
pass
except Exception as e:
pytest.fail(f"An exception occurred: {e}")
@ -636,7 +638,10 @@ def test_gemini_pro_function_calling():
# gemini_pro_function_calling() # gemini_pro_function_calling()
def test_gemini_pro_function_calling_streaming(): @pytest.mark.parametrize("stream", [False, True])
@pytest.mark.parametrize("sync_mode", [False, True])
@pytest.mark.asyncio
async def test_gemini_pro_function_calling_streaming(stream, sync_mode):
load_vertex_ai_credentials() load_vertex_ai_credentials()
litellm.set_verbose = True litellm.set_verbose = True
tools = [ tools = [
@ -665,19 +670,41 @@ def test_gemini_pro_function_calling_streaming():
"content": "What's the weather like in Boston today in fahrenheit?", "content": "What's the weather like in Boston today in fahrenheit?",
} }
] ]
optional_params = {
"tools": tools,
"tool_choice": "auto",
"n": 1,
"stream": stream,
"temperature": 0.1,
}
try: try:
completion = litellm.completion( if sync_mode == True:
model="gemini-pro", response = litellm.completion(
messages=messages, model="gemini-pro", messages=messages, **optional_params
tools=tools, )
tool_choice="auto", print(f"completion: {response}")
stream=True,
) if stream == True:
print(f"completion: {completion}") # assert completion.choices[0].message.content is None
# assert completion.choices[0].message.content is None # assert len(completion.choices[0].message.tool_calls) == 1
# assert len(completion.choices[0].message.tool_calls) == 1 for chunk in response:
for chunk in completion: assert isinstance(chunk, litellm.ModelResponse)
print(f"chunk: {chunk}") else:
assert isinstance(response, litellm.ModelResponse)
else:
response = await litellm.acompletion(
model="gemini-pro", messages=messages, **optional_params
)
print(f"completion: {response}")
if stream == True:
# assert completion.choices[0].message.content is None
# assert len(completion.choices[0].message.tool_calls) == 1
async for chunk in response:
print(f"chunk: {chunk}")
assert isinstance(chunk, litellm.ModelResponse)
else:
assert isinstance(response, litellm.ModelResponse)
except litellm.APIError as e: except litellm.APIError as e:
pass pass
except litellm.RateLimitError as e: except litellm.RateLimitError as e:


@ -57,7 +57,7 @@ def test_completion_custom_provider_model_name():
messages=messages,
logger_fn=logger_fn,
)
- # Add any assertions here to, check the response
+ # Add any assertions here to,check the response
print(response)
print(response["choices"][0]["finish_reason"])
except litellm.Timeout as e: except litellm.Timeout as e:
@ -231,6 +231,76 @@ def test_completion_claude_3_function_call():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_cohere_command_r_plus_function_call():
litellm.set_verbose = True
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
messages = [
{
"role": "user",
"content": "What's the weather like in Boston today in Fahrenheit?",
}
]
try:
# test without max tokens
response = completion(
model="command-r-plus",
messages=messages,
tools=tools,
tool_choice="auto",
)
# Add any assertions, here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
messages.append(
response.choices[0].message.model_dump()
) # Add assistant tool invokes
tool_result = (
'{"location": "Boston", "temperature": "72", "unit": "fahrenheit"}'
)
# Add user submitted tool results in the OpenAI format
messages.append(
{
"tool_call_id": response.choices[0].message.tool_calls[0].id,
"role": "tool",
"name": response.choices[0].message.tool_calls[0].function.name,
"content": tool_result,
}
)
# In the second response, Cohere should deduce answer from tool results
second_response = completion(
model="command-r-plus",
messages=messages,
tools=tools,
tool_choice="auto",
)
print(second_response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_parse_xml_params():
from litellm.llms.prompt_templates.factory import parse_xml_params
@ -1291,6 +1361,7 @@ def test_completion_logprobs_stream():
for chunk in response:
# check if at least one chunk has log probs
print(chunk)
print(f"chunk.choices[0]: {chunk.choices[0]}")
if "logprobs" in chunk.choices[0]:
# assert we got a valid logprob in the choices
assert len(chunk.choices[0].logprobs.content[0].top_logprobs) == 3
@ -1781,7 +1852,6 @@ def test_completion_replicate_llama3():
print("RESPONSE STRING\n", response_str) print("RESPONSE STRING\n", response_str)
if type(response_str) != str: if type(response_str) != str:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
raise Exception("it worked!")
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@ -2655,6 +2725,88 @@ def test_completion_palm_stream():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_watsonx():
litellm.set_verbose = True
model_name = "watsonx/ibm/granite-13b-chat-v2"
try:
response = completion(
model=model_name,
messages=messages,
stop=["stop"],
max_tokens=20,
)
# Add any assertions here to check the response
print(response)
except litellm.APIError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@pytest.mark.parametrize(
"provider, model, project, region_name, token",
[
("azure", "chatgpt-v-2", None, None, "test-token"),
("vertex_ai", "anthropic-claude-3", "adroit-crow-1", "us-east1", None),
("watsonx", "ibm/granite", "96946574", "dallas", "1234"),
("bedrock", "anthropic.claude-3", None, "us-east-1", None),
],
)
def test_unified_auth_params(provider, model, project, region_name, token):
"""
Check if params = ["project", "region_name", "token"]
are correctly translated for = ["azure", "vertex_ai", "watsonx", "aws"]
tests get_optional_params
"""
data = {
"project": project,
"region_name": region_name,
"token": token,
"custom_llm_provider": provider,
"model": model,
}
translated_optional_params = litellm.utils.get_optional_params(**data)
if provider == "azure":
special_auth_params = (
litellm.AzureOpenAIConfig().get_mapped_special_auth_params()
)
elif provider == "bedrock":
special_auth_params = (
litellm.AmazonBedrockGlobalConfig().get_mapped_special_auth_params()
)
elif provider == "vertex_ai":
special_auth_params = litellm.VertexAIConfig().get_mapped_special_auth_params()
elif provider == "watsonx":
special_auth_params = (
litellm.IBMWatsonXAIConfig().get_mapped_special_auth_params()
)
for param, value in special_auth_params.items():
assert param in data
assert value in translated_optional_params
@pytest.mark.asyncio
async def test_acompletion_watsonx():
litellm.set_verbose = True
model_name = "watsonx/ibm/granite-13b-chat-v2"
print("testing watsonx")
try:
response = await litellm.acompletion(
model=model_name,
messages=messages,
temperature=0.2,
max_tokens=80,
)
# Add any assertions here to check the response
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_palm_stream()
# test_completion_deep_infra()


@ -328,3 +328,56 @@ def test_dalle_3_azure_cost_tracking():
completion_response=response, call_type="image_generation"
)
assert cost > 0
def test_replicate_llama3_cost_tracking():
litellm.set_verbose = True
model = "replicate/meta/meta-llama-3-8b-instruct"
litellm.register_model(
{
"replicate/meta/meta-llama-3-8b-instruct": {
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
}
}
)
response = litellm.ModelResponse(
id="chatcmpl-cad7282f-7f68-41e7-a5ab-9eb33ae301dc",
choices=[
litellm.utils.Choices(
finish_reason="stop",
index=0,
message=litellm.utils.Message(
content="I'm doing well, thanks for asking! I'm here to help you with any questions or tasks you may have. How can I assist you today?",
role="assistant",
),
)
],
created=1714401369,
model="replicate/meta/meta-llama-3-8b-instruct",
object="chat.completion",
system_fingerprint=None,
usage=litellm.utils.Usage(
prompt_tokens=48, completion_tokens=31, total_tokens=79
),
)
cost = litellm.completion_cost(
completion_response=response,
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(f"cost: {cost}")
cost = round(cost, 5)
expected_cost = round(
litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
"input_cost_per_token"
]
* 48
+ litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
"output_cost_per_token"
]
* 31,
5,
)
assert cost == expected_cost


@ -26,6 +26,9 @@ class DBModel(BaseModel):
model_info: dict model_info: dict
litellm_params: dict litellm_params: dict
class Config:
protected_namespaces = ()
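Side note on the `protected_namespaces = ()` addition above: pydantic v2 warns when field names such as `model_info` collide with its reserved `model_` namespace, and the empty tuple disables that check (the new test further down asserts the warning is gone). A minimal sketch of the pattern — field names copied from the hunk, everything else assumed:

```python
from pydantic import BaseModel

class DBModel(BaseModel):
    model_info: dict
    litellm_params: dict

    class Config:
        # Without this, pydantic v2 emits a "conflict with protected namespace 'model_'"
        # warning for `model_info` at class definition time.
        protected_namespaces = ()
```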
@pytest.mark.asyncio
async def test_delete_deployment():


@ -529,6 +529,7 @@ def test_chat_bedrock_stream():
@pytest.mark.asyncio
async def test_async_chat_bedrock_stream():
try:
litellm.set_verbose = True
customHandler = CompletionCustomHandler()
litellm.callbacks = [customHandler]
response = await litellm.acompletion(


@ -484,6 +484,20 @@ def test_mistral_embeddings():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@pytest.mark.skip(reason="local test")
def test_watsonx_embeddings():
try:
litellm.set_verbose = True
response = litellm.embedding(
model="watsonx/ibm/slate-30m-english-rtrvr",
input=["good morning from litellm"],
)
print(f"response: {response}")
assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_mistral_embeddings()


@ -25,7 +25,7 @@ def test_empty_content():
pass
function_setup(
- original_function=completion,
+ original_function="completion",
rules_obj=rules_obj,
start_time=datetime.now(),
messages=[],


@ -136,8 +136,8 @@ def test_image_generation_bedrock():
litellm.set_verbose = True
response = litellm.image_generation(
prompt="A cute baby sea otter",
- model="bedrock/stability.stable-diffusion-xl-v0",
+ model="bedrock/stability.stable-diffusion-xl-v1",
- aws_region_name="us-east-1",
+ aws_region_name="us-west-2",
)
print(f"response: {response}")
except litellm.RateLimitError as e:
@ -156,8 +156,8 @@ async def test_aimage_generation_bedrock_with_optional_params():
try:
response = await litellm.aimage_generation(
prompt="A cute baby sea otter",
- model="bedrock/stability.stable-diffusion-xl-v0",
+ model="bedrock/stability.stable-diffusion-xl-v1",
- size="128x128",
+ size="256x256",
)
print(f"response: {response}")
except litellm.RateLimitError as e:


@ -201,6 +201,7 @@ async def test_router_atext_completion_streaming():
@pytest.mark.asyncio
async def test_router_completion_streaming():
litellm.set_verbose = True
messages = [
{"role": "user", "content": "Hello, can you generate a 500 words poem?"}
]
@ -219,9 +220,9 @@ async def test_router_completion_streaming():
{
"model_name": "azure-model",
"litellm_params": {
- "model": "azure/gpt-35-turbo",
+ "model": "azure/gpt-turbo",
- "api_key": "os.environ/AZURE_EUROPE_API_KEY",
+ "api_key": "os.environ/AZURE_FRANCE_API_KEY",
- "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
+ "api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 2},
@ -229,9 +230,9 @@ async def test_router_completion_streaming():
{
"model_name": "azure-model",
"litellm_params": {
- "model": "azure/gpt-35-turbo",
+ "model": "azure/gpt-turbo",
- "api_key": "os.environ/AZURE_CANADA_API_KEY",
+ "api_key": "os.environ/AZURE_FRANCE_API_KEY",
- "api_base": "https://my-endpoint-canada-berri992.openai.azure.com",
+ "api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 3},
@ -262,4 +263,4 @@ async def test_router_completion_streaming():
## check if calls equally distributed
cache_dict = router.cache.get_cache(key=cache_key)
for k, v in cache_dict.items():
- assert v == 1
+ assert v == 1, f"Failed. K={k} called v={v} times, cache_dict={cache_dict}"


@ -555,3 +555,171 @@ async def test_lowest_latency_routing_with_timeouts():
# ALL the Requests should have been routed to the fast-endpoint # ALL the Requests should have been routed to the fast-endpoint
assert deployments["fast-endpoint"] == 10 assert deployments["fast-endpoint"] == 10
@pytest.mark.asyncio
async def test_lowest_latency_routing_first_pick():
"""
PROD Test:
- When all deployments are latency=0, it should randomly pick a deployment
- IT SHOULD NEVER PICK THE Very First deployment everytime all deployment latencies are 0
- This ensures that after the ttl window resets it randomly picks a deployment
"""
import litellm
litellm.set_verbose = True
router = Router(
model_list=[
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/fast-endpoint",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"api_key": "fake-key",
},
"model_info": {"id": "fast-endpoint"},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/fast-endpoint-2",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"api_key": "fake-key",
},
"model_info": {"id": "fast-endpoint-2"},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/fast-endpoint-2",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"api_key": "fake-key",
},
"model_info": {"id": "fast-endpoint-3"},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/fast-endpoint-2",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"api_key": "fake-key",
},
"model_info": {"id": "fast-endpoint-4"},
},
],
routing_strategy="latency-based-routing",
routing_strategy_args={"ttl": 0.0000000001},
set_verbose=True,
debug_level="DEBUG",
) # type: ignore
deployments = {}
for _ in range(5):
response = await router.acompletion(
model="azure-model", messages=[{"role": "user", "content": "hello"}]
)
print(response)
_picked_model_id = response._hidden_params["model_id"]
if _picked_model_id not in deployments:
deployments[_picked_model_id] = 1
else:
deployments[_picked_model_id] += 1
await asyncio.sleep(0.000000000005)
print("deployments", deployments)
# assert that len(deployments) >1
assert len(deployments) > 1
@pytest.mark.parametrize("buffer", [0, 1])
@pytest.mark.asyncio
async def test_lowest_latency_routing_buffer(buffer):
"""
Allow shuffling calls within a certain latency buffer
"""
model_list = [
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 1440,
},
"model_info": {"id": 1},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-35-turbo",
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 2},
},
]
router = Router(
model_list=model_list,
routing_strategy="latency-based-routing",
set_verbose=False,
num_retries=3,
routing_strategy_args={"lowest_latency_buffer": buffer},
) # type: ignore
## DEPLOYMENT 1 ##
deployment_id = 1
kwargs = {
"litellm_params": {
"metadata": {
"model_group": "azure-model",
},
"model_info": {"id": 1},
}
}
start_time = time.time()
response_obj = {"usage": {"total_tokens": 50}}
time.sleep(3)
end_time = time.time()
router.lowestlatency_logger.log_success_event(
response_obj=response_obj,
kwargs=kwargs,
start_time=start_time,
end_time=end_time,
)
## DEPLOYMENT 2 ##
deployment_id = 2
kwargs = {
"litellm_params": {
"metadata": {
"model_group": "azure-model",
},
"model_info": {"id": 2},
}
}
start_time = time.time()
response_obj = {"usage": {"total_tokens": 20}}
time.sleep(2)
end_time = time.time()
router.lowestlatency_logger.log_success_event(
response_obj=response_obj,
kwargs=kwargs,
start_time=start_time,
end_time=end_time,
)
## CHECK WHAT'S SELECTED ##
# print(router.lowesttpm_logger.get_available_deployments(model_group="azure-model"))
selected_deployments = {}
for _ in range(50):
print(router.get_available_deployment(model="azure-model"))
selected_deployments[
router.get_available_deployment(model="azure-model")["model_info"]["id"]
] = 1
if buffer == 0:
assert len(selected_deployments.keys()) == 1
else:
assert len(selected_deployments.keys()) == 2


@ -0,0 +1,10 @@
import warnings
import pytest
def test_namespace_conflict_warning():
with warnings.catch_warnings(record=True) as recorded_warnings:
warnings.simplefilter("always") # Capture all warnings
import litellm
# Check that no warning with the specific message was raised
assert not any("conflict with protected namespace" in str(w.message) for w in recorded_warnings), "Test failed: 'conflict with protected namespace' warning was encountered!"


@ -1,7 +1,7 @@
#### What this tests ####
# This tests litellm router
- import sys, os, time
+ import sys, os, time, openai
import traceback, asyncio
import pytest
@ -14,10 +14,169 @@ from litellm.router import Deployment, LiteLLM_Params, ModelInfo
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from dotenv import load_dotenv
import os, httpx
load_dotenv()
@pytest.mark.parametrize("num_retries", [None, 2])
@pytest.mark.parametrize("max_retries", [None, 4])
def test_router_num_retries_init(num_retries, max_retries):
"""
- test when num_retries set v/s not
- test client value when max retries set v/s not
"""
router = Router(
model_list=[
{
"model_name": "gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": "bad-key",
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"max_retries": max_retries,
},
"model_info": {"id": 12345},
},
],
num_retries=num_retries,
)
if num_retries is not None:
assert router.num_retries == num_retries
else:
assert router.num_retries == openai.DEFAULT_MAX_RETRIES
model_client = router._get_client(
{"model_info": {"id": 12345}}, client_type="async", kwargs={}
)
if max_retries is not None:
assert getattr(model_client, "max_retries") == max_retries
else:
assert getattr(model_client, "max_retries") == 0
@pytest.mark.parametrize(
"timeout", [10, 1.0, httpx.Timeout(timeout=300.0, connect=20.0)]
)
@pytest.mark.parametrize("ssl_verify", [True, False])
def test_router_timeout_init(timeout, ssl_verify):
"""
Allow user to pass httpx.Timeout
related issue - https://github.com/BerriAI/litellm/issues/3162
"""
litellm.ssl_verify = ssl_verify
router = Router(
model_list=[
{
"model_name": "test-model",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": os.getenv("AZURE_API_BASE"),
"api_version": os.getenv("AZURE_API_VERSION"),
"timeout": timeout,
},
"model_info": {"id": 1234},
}
]
)
model_client = router._get_client(
deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
)
assert getattr(model_client, "timeout") == timeout
print(f"vars model_client: {vars(model_client)}")
http_client = getattr(model_client, "_client")
print(f"http client: {vars(http_client)}, ssl_Verify={ssl_verify}")
if ssl_verify == False:
assert http_client._transport._pool._ssl_context.verify_mode.name == "CERT_NONE"
else:
assert (
http_client._transport._pool._ssl_context.verify_mode.name
== "CERT_REQUIRED"
)
@pytest.mark.parametrize("sync_mode", [False, True])
@pytest.mark.asyncio
async def test_router_retries(sync_mode):
"""
- make sure retries work as expected
"""
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {"model": "gpt-3.5-turbo", "api_key": "bad-key"},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": os.getenv("AZURE_API_BASE"),
"api_version": os.getenv("AZURE_API_VERSION"),
},
},
]
router = Router(model_list=model_list, num_retries=2)
if sync_mode:
router.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
else:
await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
@pytest.mark.parametrize(
"mistral_api_base",
[
"os.environ/AZURE_MISTRAL_API_BASE",
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1/",
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1",
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/",
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com",
],
)
def test_router_azure_ai_studio_init(mistral_api_base):
router = Router(
model_list=[
{
"model_name": "test-model",
"litellm_params": {
"model": "azure/mistral-large-latest",
"api_key": "os.environ/AZURE_MISTRAL_API_KEY",
"api_base": mistral_api_base,
},
"model_info": {"id": 1234},
}
]
)
model_client = router._get_client(
deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
)
url = getattr(model_client, "_base_url")
uri_reference = str(getattr(url, "_uri_reference"))
print(f"uri_reference: {uri_reference}")
assert "/v1/" in uri_reference
assert uri_reference.count("v1") == 1
def test_exception_raising():
# this tests if the router raises an exception when invalid params are set
# in this test both deployments have bad keys - Keep this test. It validates if the router raises the most recent exception
@ -995,6 +1154,7 @@ def test_consistent_model_id():
assert id1 == id2
@pytest.mark.skip(reason="local test")
def test_reading_keys_os_environ():
import openai
@ -1094,6 +1254,7 @@ def test_reading_keys_os_environ():
# test_reading_keys_os_environ()
@pytest.mark.skip(reason="local test")
def test_reading_openai_keys_os_environ():
import openai


@ -46,6 +46,7 @@ def test_async_fallbacks(caplog):
router = Router( router = Router(
model_list=model_list, model_list=model_list,
fallbacks=[{"gpt-3.5-turbo": ["azure/gpt-3.5-turbo"]}], fallbacks=[{"gpt-3.5-turbo": ["azure/gpt-3.5-turbo"]}],
num_retries=1,
) )
user_message = "Hello, how are you?" user_message = "Hello, how are you?"
@ -81,8 +82,8 @@ def test_async_fallbacks(caplog):
# Define the expected log messages # Define the expected log messages
# - error request, falling back notice, success notice # - error request, falling back notice, success notice
expected_logs = [ expected_logs = [
"Intialized router with Routing strategy: simple-shuffle\n\nRouting fallbacks: [{'gpt-3.5-turbo': ['azure/gpt-3.5-turbo']}]\n\nRouting context window fallbacks: None\n\nRouter Redis Caching=None",
"litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m", "litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m",
"litellm.acompletion(model=None)\x1b[31m Exception No deployments available for selected model, passed model=gpt-3.5-turbo\x1b[0m",
"Falling back to model_group = azure/gpt-3.5-turbo", "Falling back to model_group = azure/gpt-3.5-turbo",
"litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m", "litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m",
] ]


@ -22,10 +22,10 @@ class MyCustomHandler(CustomLogger):
def log_pre_api_call(self, model, messages, kwargs): def log_pre_api_call(self, model, messages, kwargs):
print(f"Pre-API Call") print(f"Pre-API Call")
print( print(
f"previous_models: {kwargs['litellm_params']['metadata']['previous_models']}" f"previous_models: {kwargs['litellm_params']['metadata'].get('previous_models', None)}"
) )
self.previous_models += len( self.previous_models = len(
kwargs["litellm_params"]["metadata"]["previous_models"] kwargs["litellm_params"]["metadata"].get("previous_models", [])
) # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]} ) # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
print(f"self.previous_models: {self.previous_models}") print(f"self.previous_models: {self.previous_models}")
@ -127,7 +127,7 @@ def test_sync_fallbacks():
response = router.completion(**kwargs) response = router.completion(**kwargs)
print(f"response: {response}") print(f"response: {response}")
time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4
print("Passed ! Test router_fallbacks: test_sync_fallbacks()") print("Passed ! Test router_fallbacks: test_sync_fallbacks()")
router.reset() router.reset()
@ -140,7 +140,7 @@ def test_sync_fallbacks():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_async_fallbacks(): async def test_async_fallbacks():
litellm.set_verbose = False litellm.set_verbose = True
model_list = [ model_list = [
{ # list of model deployments { # list of model deployments
"model_name": "azure/gpt-3.5-turbo", # openai model name "model_name": "azure/gpt-3.5-turbo", # openai model name
@ -209,12 +209,13 @@ async def test_async_fallbacks():
user_message = "Hello, how are you?" user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}] messages = [{"content": user_message, "role": "user"}]
try: try:
kwargs["model"] = "azure/gpt-3.5-turbo"
response = await router.acompletion(**kwargs) response = await router.acompletion(**kwargs)
print(f"customHandler.previous_models: {customHandler.previous_models}") print(f"customHandler.previous_models: {customHandler.previous_models}")
await asyncio.sleep( await asyncio.sleep(
0.05 0.05
) # allow a delay as success_callbacks are on a separate thread ) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except litellm.Timeout as e: except litellm.Timeout as e:
pass pass
@ -268,7 +269,7 @@ def test_sync_fallbacks_embeddings():
response = router.embedding(**kwargs) response = router.embedding(**kwargs)
print(f"customHandler.previous_models: {customHandler.previous_models}") print(f"customHandler.previous_models: {customHandler.previous_models}")
time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except litellm.Timeout as e: except litellm.Timeout as e:
pass pass
@ -322,7 +323,7 @@ async def test_async_fallbacks_embeddings():
await asyncio.sleep( await asyncio.sleep(
0.05 0.05
) # allow a delay as success_callbacks are on a separate thread ) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except litellm.Timeout as e: except litellm.Timeout as e:
pass pass
@ -401,7 +402,7 @@ def test_dynamic_fallbacks_sync():
response = router.completion(**kwargs) response = router.completion(**kwargs)
print(f"response: {response}") print(f"response: {response}")
time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred - {e}") pytest.fail(f"An exception occurred - {e}")
@ -487,7 +488,7 @@ async def test_dynamic_fallbacks_async():
await asyncio.sleep( await asyncio.sleep(
0.05 0.05
) # allow a delay as success_callbacks are on a separate thread ) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred - {e}") pytest.fail(f"An exception occurred - {e}")
@ -572,7 +573,7 @@ async def test_async_fallbacks_streaming():
await asyncio.sleep( await asyncio.sleep(
0.05 0.05
) # allow a delay as success_callbacks are on a separate thread ) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except litellm.Timeout as e: except litellm.Timeout as e:
pass pass
@ -751,7 +752,7 @@ async def test_async_fallbacks_max_retries_per_request():
router.reset() router.reset()
def test_usage_based_routing_fallbacks(): def test_ausage_based_routing_fallbacks():
try: try:
# [Prod Test] # [Prod Test]
# IT tests Usage Based Routing with fallbacks # IT tests Usage Based Routing with fallbacks
@ -765,10 +766,10 @@ def test_usage_based_routing_fallbacks():
load_dotenv() load_dotenv()
# Constants for TPM and RPM allocation # Constants for TPM and RPM allocation
AZURE_FAST_TPM = 3 AZURE_FAST_RPM = 1
AZURE_BASIC_TPM = 4 AZURE_BASIC_RPM = 1
OPENAI_TPM = 400 OPENAI_RPM = 2
ANTHROPIC_TPM = 100000 ANTHROPIC_RPM = 100000
def get_azure_params(deployment_name: str): def get_azure_params(deployment_name: str):
params = { params = {
@ -797,22 +798,26 @@ def test_usage_based_routing_fallbacks():
{ {
"model_name": "azure/gpt-4-fast", "model_name": "azure/gpt-4-fast",
"litellm_params": get_azure_params("chatgpt-v-2"), "litellm_params": get_azure_params("chatgpt-v-2"),
"tpm": AZURE_FAST_TPM, "model_info": {"id": 1},
"rpm": AZURE_FAST_RPM,
}, },
{ {
"model_name": "azure/gpt-4-basic", "model_name": "azure/gpt-4-basic",
"litellm_params": get_azure_params("chatgpt-v-2"), "litellm_params": get_azure_params("chatgpt-v-2"),
"tpm": AZURE_BASIC_TPM, "model_info": {"id": 2},
"rpm": AZURE_BASIC_RPM,
}, },
{ {
"model_name": "openai-gpt-4", "model_name": "openai-gpt-4",
"litellm_params": get_openai_params("gpt-3.5-turbo"), "litellm_params": get_openai_params("gpt-3.5-turbo"),
"tpm": OPENAI_TPM, "model_info": {"id": 3},
"rpm": OPENAI_RPM,
}, },
{ {
"model_name": "anthropic-claude-instant-1.2", "model_name": "anthropic-claude-instant-1.2",
"litellm_params": get_anthropic_params("claude-instant-1.2"), "litellm_params": get_anthropic_params("claude-instant-1.2"),
"tpm": ANTHROPIC_TPM, "model_info": {"id": 4},
"rpm": ANTHROPIC_RPM,
}, },
] ]
# litellm.set_verbose=True # litellm.set_verbose=True
@ -830,6 +835,7 @@ def test_usage_based_routing_fallbacks():
routing_strategy="usage-based-routing", routing_strategy="usage-based-routing",
redis_host=os.environ["REDIS_HOST"], redis_host=os.environ["REDIS_HOST"],
redis_port=os.environ["REDIS_PORT"], redis_port=os.environ["REDIS_PORT"],
num_retries=0,
) )
messages = [ messages = [
@ -842,10 +848,10 @@ def test_usage_based_routing_fallbacks():
mock_response="very nice to meet you", mock_response="very nice to meet you",
) )
print("response: ", response) print("response: ", response)
print("response._hidden_params: ", response._hidden_params) print(f"response._hidden_params: {response._hidden_params}")
# in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass # in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass
# the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM # the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM
assert response._hidden_params["custom_llm_provider"] == "openai" assert response._hidden_params["model_id"] == "1"
# now make 100 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2 # now make 100 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2
for i in range(20): for i in range(20):
@ -859,7 +865,7 @@ def test_usage_based_routing_fallbacks():
print("response._hidden_params: ", response._hidden_params) print("response._hidden_params: ", response._hidden_params)
if i == 19: if i == 19:
# by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2 # by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2
assert response._hidden_params["custom_llm_provider"] == "anthropic" assert response._hidden_params["model_id"] == "4"
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred {e}") pytest.fail(f"An exception occurred {e}")


@ -203,7 +203,7 @@ def test_timeouts_router():
},
},
]
- router = Router(model_list=model_list)
+ router = Router(model_list=model_list, num_retries=0)
print("PASSED !")
@ -396,7 +396,9 @@ def test_router_init_gpt_4_vision_enhancements():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_openai_with_organization(): @pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_openai_with_organization(sync_mode):
try: try:
print("Testing OpenAI with organization") print("Testing OpenAI with organization")
model_list = [ model_list = [
@ -418,32 +420,65 @@ def test_openai_with_organization():
print(router.model_list) print(router.model_list)
print(router.model_list[0]) print(router.model_list[0])
openai_client = router._get_client( if sync_mode:
deployment=router.model_list[0], openai_client = router._get_client(
kwargs={"input": ["hello"], "model": "openai-bad-org"}, deployment=router.model_list[0],
) kwargs={"input": ["hello"], "model": "openai-bad-org"},
print(vars(openai_client))
assert openai_client.organization == "org-ikDc4ex8NB"
# bad org raises error
try:
response = router.completion(
model="openai-bad-org",
messages=[{"role": "user", "content": "this is a test"}],
) )
pytest.fail("Request should have failed - This organization does not exist") print(vars(openai_client))
except Exception as e:
print("Got exception: " + str(e))
assert "No such organization: org-ikDc4ex8NB" in str(e)
# good org works assert openai_client.organization == "org-ikDc4ex8NB"
response = router.completion(
model="openai-good-org", # bad org raises error
messages=[{"role": "user", "content": "this is a test"}],
max_tokens=5, try:
) response = router.completion(
model="openai-bad-org",
messages=[{"role": "user", "content": "this is a test"}],
)
pytest.fail(
"Request should have failed - This organization does not exist"
)
except Exception as e:
print("Got exception: " + str(e))
assert "No such organization: org-ikDc4ex8NB" in str(e)
# good org works
response = router.completion(
model="openai-good-org",
messages=[{"role": "user", "content": "this is a test"}],
max_tokens=5,
)
else:
openai_client = router._get_client(
deployment=router.model_list[0],
kwargs={"input": ["hello"], "model": "openai-bad-org"},
client_type="async",
)
print(vars(openai_client))
assert openai_client.organization == "org-ikDc4ex8NB"
# bad org raises error
try:
response = await router.acompletion(
model="openai-bad-org",
messages=[{"role": "user", "content": "this is a test"}],
)
pytest.fail(
"Request should have failed - This organization does not exist"
)
except Exception as e:
print("Got exception: " + str(e))
assert "No such organization: org-ikDc4ex8NB" in str(e)
# good org works
response = await router.acompletion(
model="openai-good-org",
messages=[{"role": "user", "content": "this is a test"}],
max_tokens=5,
)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")


@ -0,0 +1,121 @@
#### What this tests ####
# This tests calling router with fallback models
import sys, os, time
import traceback, asyncio
import pytest
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger
class MyCustomHandler(CustomLogger):
success: bool = False
failure: bool = False
previous_models: int = 0
def log_pre_api_call(self, model, messages, kwargs):
print(f"Pre-API Call")
print(
f"previous_models: {kwargs['litellm_params']['metadata'].get('previous_models', None)}"
)
self.previous_models = len(
kwargs["litellm_params"]["metadata"].get("previous_models", [])
) # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
print(f"self.previous_models: {self.previous_models}")
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
print(
f"Post-API Call - response object: {response_obj}; model: {kwargs['model']}"
)
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Stream")
def async_log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Stream")
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Success")
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Success")
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Failure")
"""
Test sync + async
- Authorization Errors
- Random API Error
"""
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.parametrize("error_type", ["Authorization Error", "API Error"])
@pytest.mark.asyncio
async def test_router_retries_errors(sync_mode, error_type):
"""
- Auth Error -> 0 retries
- API Error -> 2 retries
"""
_api_key = (
"bad-key" if error_type == "Authorization Error" else os.getenv("AZURE_API_KEY")
)
print(f"_api_key: {_api_key}")
model_list = [
{
"model_name": "azure/gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": _api_key,
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
"tpm": 240000,
"rpm": 1800,
},
]
router = Router(model_list=model_list, allowed_fails=3)
customHandler = MyCustomHandler()
litellm.callbacks = [customHandler]
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]
kwargs = {
"model": "azure/gpt-3.5-turbo",
"messages": messages,
"mock_response": (
None
if error_type == "Authorization Error"
else Exception("Invalid Request")
),
}
try:
if sync_mode:
response = router.completion(**kwargs)
else:
response = await router.acompletion(**kwargs)
except Exception as e:
pass
await asyncio.sleep(
0.05
) # allow a delay as success_callbacks are on a separate thread
print(f"customHandler.previous_models: {customHandler.previous_models}")
if error_type == "Authorization Error":
assert customHandler.previous_models == 0 # 0 retries
else:
assert customHandler.previous_models == 2 # 2 retries


@ -57,6 +57,7 @@ def test_router_timeouts():
redis_password=os.getenv("REDIS_PASSWORD"), redis_password=os.getenv("REDIS_PASSWORD"),
redis_port=int(os.getenv("REDIS_PORT")), redis_port=int(os.getenv("REDIS_PORT")),
timeout=10, timeout=10,
num_retries=0,
) )
print("***** TPM SETTINGS *****") print("***** TPM SETTINGS *****")
@ -89,15 +90,15 @@ def test_router_timeouts():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_router_timeouts_bedrock(): async def test_router_timeouts_bedrock():
import openai import openai, uuid
# Model list for OpenAI and Anthropic models # Model list for OpenAI and Anthropic models
model_list = [ _model_list = [
{ {
"model_name": "bedrock", "model_name": "bedrock",
"litellm_params": { "litellm_params": {
"model": "bedrock/anthropic.claude-instant-v1", "model": "bedrock/anthropic.claude-instant-v1",
"timeout": 0.001, "timeout": 0.00001,
}, },
"tpm": 80000, "tpm": 80000,
}, },
@ -105,17 +106,18 @@ async def test_router_timeouts_bedrock():
# Configure router # Configure router
router = Router( router = Router(
model_list=model_list, model_list=_model_list,
routing_strategy="usage-based-routing", routing_strategy="usage-based-routing",
debug_level="DEBUG", debug_level="DEBUG",
set_verbose=True, set_verbose=True,
num_retries=0,
) )
litellm.set_verbose = True litellm.set_verbose = True
try: try:
response = await router.acompletion( response = await router.acompletion(
model="bedrock", model="bedrock",
messages=[{"role": "user", "content": "hello, who are u"}], messages=[{"role": "user", "content": f"hello, who are u {uuid.uuid4()}"}],
) )
print(response) print(response)
pytest.fail("Did not raise error `openai.APITimeoutError`") pytest.fail("Did not raise error `openai.APITimeoutError`")


@ -518,7 +518,7 @@ async def test_acompletion_gemini_stream():
litellm.set_verbose = True litellm.set_verbose = True
print("Streaming gemini response") print("Streaming gemini response")
messages = [ messages = [
{"role": "system", "content": "You are a helpful assistant."}, # {"role": "system", "content": "You are a helpful assistant."},
{ {
"role": "user", "role": "user",
"content": "What do you know?", "content": "What do you know?",
@ -1271,6 +1271,33 @@ def test_completion_sagemaker_stream():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_watsonx_stream():
litellm.set_verbose = True
try:
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=messages,
temperature=0.5,
max_tokens=20,
stream=True,
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_sagemaker_stream() # test_completion_sagemaker_stream()
@ -2446,6 +2473,34 @@ class ModelResponseIterator:
return self.model_response return self.model_response
class ModelResponseListIterator:
def __init__(self, model_responses):
self.model_responses = model_responses
self.index = 0
# Sync iterator
def __iter__(self):
return self
def __next__(self):
if self.index >= len(self.model_responses):
raise StopIteration
model_response = self.model_responses[self.index]
self.index += 1
return model_response
# Async iterator
def __aiter__(self):
return self
async def __anext__(self):
if self.index >= len(self.model_responses):
raise StopAsyncIteration
model_response = self.model_responses[self.index]
self.index += 1
return model_response
def test_unit_test_custom_stream_wrapper(): def test_unit_test_custom_stream_wrapper():
""" """
Test if last streaming chunk ends with '?', if the message repeats itself. Test if last streaming chunk ends with '?', if the message repeats itself.
@ -2486,3 +2541,268 @@ def test_unit_test_custom_stream_wrapper():
if "How are you?" in chunk.choices[0].delta.content: if "How are you?" in chunk.choices[0].delta.content:
freq += 1 freq += 1
assert freq == 1 assert freq == 1
def test_aamazing_unit_test_custom_stream_wrapper_n():
"""
Test if the translated output maps exactly to the received openai input
Relevant issue: https://github.com/BerriAI/litellm/issues/3276
"""
chunks = [
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": "It"},
"logprobs": {
"content": [
{
"token": "It",
"logprob": -1.5952516,
"bytes": [73, 116],
"top_logprobs": [
{
"token": "Brown",
"logprob": -0.7358765,
"bytes": [66, 114, 111, 119, 110],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 1,
"delta": {"content": "Brown"},
"logprobs": {
"content": [
{
"token": "Brown",
"logprob": -0.7358765,
"bytes": [66, 114, 111, 119, 110],
"top_logprobs": [
{
"token": "Brown",
"logprob": -0.7358765,
"bytes": [66, 114, 111, 119, 110],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": "'s"},
"logprobs": {
"content": [
{
"token": "'s",
"logprob": -0.006786893,
"bytes": [39, 115],
"top_logprobs": [
{
"token": "'s",
"logprob": -0.006786893,
"bytes": [39, 115],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": " impossible"},
"logprobs": {
"content": [
{
"token": " impossible",
"logprob": -0.06528423,
"bytes": [
32,
105,
109,
112,
111,
115,
115,
105,
98,
108,
101,
],
"top_logprobs": [
{
"token": " impossible",
"logprob": -0.06528423,
"bytes": [
32,
105,
109,
112,
111,
115,
115,
105,
98,
108,
101,
],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": "—even"},
"logprobs": {
"content": [
{
"token": "—even",
"logprob": -9999.0,
"bytes": [226, 128, 148, 101, 118, 101, 110],
"top_logprobs": [
{
"token": " to",
"logprob": -0.12302828,
"bytes": [32, 116, 111],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "length"}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{"index": 1, "delta": {}, "logprobs": None, "finish_reason": "stop"}
],
},
]
litellm.set_verbose = True
chunk_list = []
for chunk in chunks:
new_chunk = litellm.ModelResponse(stream=True, id=chunk["id"])
if "choices" in chunk and isinstance(chunk["choices"], list):
print("INSIDE CHUNK CHOICES!")
new_choices = []
for choice in chunk["choices"]:
if isinstance(choice, litellm.utils.StreamingChoices):
_new_choice = choice
elif isinstance(choice, dict):
_new_choice = litellm.utils.StreamingChoices(**choice)
new_choices.append(_new_choice)
new_chunk.choices = new_choices
chunk_list.append(new_chunk)
completion_stream = ModelResponseListIterator(model_responses=chunk_list)
response = litellm.CustomStreamWrapper(
completion_stream=completion_stream,
model="gpt-4-0613",
custom_llm_provider="cached_response",
logging_obj=litellm.Logging(
model="gpt-4-0613",
messages=[{"role": "user", "content": "Hey"}],
stream=True,
call_type="completion",
start_time=time.time(),
litellm_call_id="12345",
function_id="1245",
),
)
for idx, chunk in enumerate(response):
chunk_dict = {}
try:
chunk_dict = chunk.model_dump(exclude_none=True)
except:
chunk_dict = chunk.dict(exclude_none=True)
chunk_dict.pop("created")
chunks[idx].pop("created")
if chunks[idx]["system_fingerprint"] is None:
chunks[idx].pop("system_fingerprint", None)
if idx == 0:
for choice in chunk_dict["choices"]:
if "role" in choice["delta"]:
choice["delta"].pop("role")
for choice in chunks[idx]["choices"]:
# ignore finish reason None - since our pydantic object is set to exclude_none = true
if "finish_reason" in choice and choice["finish_reason"] is None:
choice.pop("finish_reason")
if "logprobs" in choice and choice["logprobs"] is None:
choice.pop("logprobs")
assert (
chunk_dict == chunks[idx]
), f"idx={idx} translated chunk = {chunk_dict} != openai chunk = {chunks[idx]}"


@ -78,7 +78,8 @@ def test_hanging_request_azure():
"model_name": "openai-gpt", "model_name": "openai-gpt",
"litellm_params": {"model": "gpt-3.5-turbo"}, "litellm_params": {"model": "gpt-3.5-turbo"},
}, },
] ],
num_retries=0,
) )
encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0] encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@ -131,7 +132,8 @@ def test_hanging_request_openai():
"model_name": "openai-gpt", "model_name": "openai-gpt",
"litellm_params": {"model": "gpt-3.5-turbo"}, "litellm_params": {"model": "gpt-3.5-turbo"},
}, },
] ],
num_retries=0,
) )
encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0] encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@ -189,6 +191,7 @@ def test_timeout_streaming():
# test_timeout_streaming() # test_timeout_streaming()
@pytest.mark.skip(reason="local test")
def test_timeout_ollama(): def test_timeout_ollama():
# this Will Raise a timeout # this Will Raise a timeout
import litellm import litellm


@ -282,6 +282,64 @@ def test_router_skip_rate_limited_deployments():
print(f"An exception occurred! {str(e)}") print(f"An exception occurred! {str(e)}")
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_multiple_potential_deployments(sync_mode):
"""
If multiple deployments have the same tpm value
call 5 times, test if deployments are shuffled.
-> prevents single deployment from being overloaded in high-concurrency scenario
"""
model_list = [
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"tpm": 1440,
},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo-2",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"tpm": 1440,
},
},
]
router = Router(
model_list=model_list,
routing_strategy="usage-based-routing-v2",
set_verbose=False,
num_retries=3,
) # type: ignore
model_ids = set()
for _ in range(1000):
if sync_mode:
deployment = router.get_available_deployment(
model="azure-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
else:
deployment = await router.async_get_available_deployment(
model="azure-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
## get id ##
id = deployment.get("model_info", {}).get("id")
model_ids.add(id)
assert len(model_ids) == 2
def test_single_deployment_tpm_zero(): def test_single_deployment_tpm_zero():
import litellm import litellm
import os import os


@ -1,5 +1,5 @@
from typing import List, Optional, Union, Dict, Tuple, Literal from typing import List, Optional, Union, Dict, Tuple, Literal
import httpx
from pydantic import BaseModel, validator from pydantic import BaseModel, validator
from .completion import CompletionRequest from .completion import CompletionRequest
from .embedding import EmbeddingRequest from .embedding import EmbeddingRequest
@ -104,11 +104,13 @@ class LiteLLM_Params(BaseModel):
api_key: Optional[str] = None api_key: Optional[str] = None
api_base: Optional[str] = None api_base: Optional[str] = None
api_version: Optional[str] = None api_version: Optional[str] = None
timeout: Optional[Union[float, str]] = None # if str, pass in as os.environ/ timeout: Optional[Union[float, str, httpx.Timeout]] = (
None # if str, pass in as os.environ/
)
stream_timeout: Optional[Union[float, str]] = ( stream_timeout: Optional[Union[float, str]] = (
None # timeout when making stream=True calls, if str, pass in as os.environ/ None # timeout when making stream=True calls, if str, pass in as os.environ/
) )
max_retries: int = 2 # follows openai default of 2 max_retries: Optional[int] = None
organization: Optional[str] = None # for openai orgs organization: Optional[str] = None # for openai orgs
## VERTEX AI ## ## VERTEX AI ##
vertex_project: Optional[str] = None vertex_project: Optional[str] = None
@ -146,14 +148,13 @@ class LiteLLM_Params(BaseModel):
args.pop("self", None) args.pop("self", None)
args.pop("params", None) args.pop("params", None)
args.pop("__class__", None) args.pop("__class__", None)
if max_retries is None: if max_retries is not None and isinstance(max_retries, str):
max_retries = 2
elif isinstance(max_retries, str):
max_retries = int(max_retries) # cast to int max_retries = int(max_retries) # cast to int
super().__init__(max_retries=max_retries, **args, **params) super().__init__(max_retries=max_retries, **args, **params)
class Config: class Config:
extra = "allow" extra = "allow"
arbitrary_types_allowed = True
def __contains__(self, key): def __contains__(self, key):
# Define custom behavior for the 'in' operator # Define custom behavior for the 'in' operator
@ -201,6 +202,9 @@ class updateDeployment(BaseModel):
litellm_params: Optional[updateLiteLLMParams] = None litellm_params: Optional[updateLiteLLMParams] = None
model_info: Optional[ModelInfo] = None model_info: Optional[ModelInfo] = None
class Config:
protected_namespaces = ()
class Deployment(BaseModel): class Deployment(BaseModel):
model_name: str model_name: str
@ -259,3 +263,4 @@ class RouterErrors(enum.Enum):
""" """
user_defined_ratelimit_error = "Deployment over user-defined ratelimit." user_defined_ratelimit_error = "Deployment over user-defined ratelimit."
no_deployments_available = "No deployments available for selected model"


@ -19,6 +19,7 @@ from functools import wraps
import datetime, time import datetime, time
import tiktoken import tiktoken
import uuid import uuid
from pydantic import BaseModel
import aiohttp import aiohttp
import textwrap import textwrap
import logging import logging
@ -69,6 +70,7 @@ from .integrations.langsmith import LangsmithLogger
from .integrations.weights_biases import WeightsBiasesLogger from .integrations.weights_biases import WeightsBiasesLogger
from .integrations.custom_logger import CustomLogger from .integrations.custom_logger import CustomLogger
from .integrations.langfuse import LangFuseLogger from .integrations.langfuse import LangFuseLogger
from .integrations.openmeter import OpenMeterLogger
from .integrations.datadog import DataDogLogger from .integrations.datadog import DataDogLogger
from .integrations.prometheus import PrometheusLogger from .integrations.prometheus import PrometheusLogger
from .integrations.prometheus_services import PrometheusServicesLogger from .integrations.prometheus_services import PrometheusServicesLogger
@ -105,7 +107,7 @@ try:
except Exception as e: except Exception as e:
verbose_logger.debug(f"Exception import enterprise features {str(e)}") verbose_logger.debug(f"Exception import enterprise features {str(e)}")
from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO, Iterable
from .caching import Cache from .caching import Cache
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
@ -129,6 +131,7 @@ langsmithLogger = None
weightsBiasesLogger = None weightsBiasesLogger = None
customLogger = None customLogger = None
langFuseLogger = None langFuseLogger = None
openMeterLogger = None
dataDogLogger = None dataDogLogger = None
prometheusLogger = None prometheusLogger = None
dynamoLogger = None dynamoLogger = None
@ -219,6 +222,61 @@ def map_finish_reason(
return finish_reason return finish_reason
class TopLogprob(OpenAIObject):
token: str
"""The token."""
bytes: Optional[List[int]] = None
"""A list of integers representing the UTF-8 bytes representation of the token.
Useful in instances where characters are represented by multiple tokens and
their byte representations must be combined to generate the correct text
representation. Can be `null` if there is no bytes representation for the token.
"""
logprob: float
"""The log probability of this token, if it is within the top 20 most likely
tokens.
Otherwise, the value `-9999.0` is used to signify that the token is very
unlikely.
"""
class ChatCompletionTokenLogprob(OpenAIObject):
token: str
"""The token."""
bytes: Optional[List[int]] = None
"""A list of integers representing the UTF-8 bytes representation of the token.
Useful in instances where characters are represented by multiple tokens and
their byte representations must be combined to generate the correct text
representation. Can be `null` if there is no bytes representation for the token.
"""
logprob: float
"""The log probability of this token, if it is within the top 20 most likely
tokens.
Otherwise, the value `-9999.0` is used to signify that the token is very
unlikely.
"""
top_logprobs: List[TopLogprob]
"""List of the most likely tokens and their log probability, at this token
position.
In rare cases, there may be fewer than the number of requested `top_logprobs`
returned.
"""
class ChoiceLogprobs(OpenAIObject):
content: Optional[List[ChatCompletionTokenLogprob]] = None
"""A list of message content tokens with log probability information."""
class FunctionCall(OpenAIObject): class FunctionCall(OpenAIObject):
arguments: str arguments: str
name: Optional[str] = None name: Optional[str] = None
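# A small sketch of how a raw OpenAI-style logprobs dict (same shape as the chunks in the
# streaming tests above) hydrates into the TopLogprob / ChatCompletionTokenLogprob /
# ChoiceLogprobs classes added above; token values are illustrative.
_raw_logprobs = {
    "content": [
        {
            "token": "It",
            "logprob": -1.5952516,
            "bytes": [73, 116],
            "top_logprobs": [
                {"token": "Brown", "logprob": -0.7358765, "bytes": [66, 114, 111, 119, 110]}
            ],
        }
    ]
}
# Nested dicts are coerced into the typed objects by pydantic, exactly as Message and
# StreamingChoices do below via ChoiceLogprobs(**logprobs).
choice_logprobs = ChoiceLogprobs(**_raw_logprobs)
assert choice_logprobs.content[0].top_logprobs[0].token == "Brown"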
@ -320,19 +378,19 @@ class Message(OpenAIObject):
super(Message, self).__init__(**params) super(Message, self).__init__(**params)
self.content = content self.content = content
self.role = role self.role = role
self.tool_calls = None
self.function_call = None
if function_call is not None: if function_call is not None:
self.function_call = FunctionCall(**function_call) self.function_call = FunctionCall(**function_call)
if tool_calls is not None: if tool_calls is not None:
self.tool_calls = [] self.tool_calls = [
for tool_call in tool_calls: ChatCompletionMessageToolCall(**tool_call) for tool_call in tool_calls
if isinstance(tool_call, dict): ]
self.tool_calls.append(ChatCompletionMessageToolCall(**tool_call))
else:
self.tool_calls.append(tool_call)
if logprobs is not None: if logprobs is not None:
self._logprobs = logprobs self._logprobs = ChoiceLogprobs(**logprobs)
def get(self, key, default=None): def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist # Custom .get() method to access attributes with a default value if the attribute doesn't exist
@ -355,12 +413,20 @@ class Message(OpenAIObject):
class Delta(OpenAIObject): class Delta(OpenAIObject):
tool_calls: Optional[List[ChatCompletionDeltaToolCall]] = None
def __init__( def __init__(
self, content=None, role=None, function_call=None, tool_calls=None, **params self,
content=None,
role=None,
function_call=None,
tool_calls=None,
**params,
): ):
super(Delta, self).__init__(**params) super(Delta, self).__init__(**params)
self.content = content self.content = content
self.role = role self.role = role
if function_call is not None and isinstance(function_call, dict): if function_call is not None and isinstance(function_call, dict):
self.function_call = FunctionCall(**function_call) self.function_call = FunctionCall(**function_call)
else: else:
@ -410,7 +476,7 @@ class Choices(OpenAIObject):
) # set finish_reason for all responses ) # set finish_reason for all responses
self.index = index self.index = index
if message is None: if message is None:
self.message = Message(content=None) self.message = Message()
else: else:
if isinstance(message, Message): if isinstance(message, Message):
self.message = message self.message = message
@ -492,7 +558,11 @@ class StreamingChoices(OpenAIObject):
self.delta = Delta() self.delta = Delta()
if enhancements is not None: if enhancements is not None:
self.enhancements = enhancements self.enhancements = enhancements
self.logprobs = logprobs
if logprobs is not None and isinstance(logprobs, dict):
self.logprobs = ChoiceLogprobs(**logprobs)
else:
self.logprobs = logprobs # type: ignore
def __contains__(self, key): def __contains__(self, key):
# Define custom behavior for the 'in' operator # Define custom behavior for the 'in' operator
@ -1139,7 +1209,14 @@ class Logging:
if verbose_logger.level == 0: if verbose_logger.level == 0:
# this means verbose logger was not switched on - user is in litellm.set_verbose=True # this means verbose logger was not switched on - user is in litellm.set_verbose=True
print_verbose(f"\033[92m{curl_command}\033[0m\n") print_verbose(f"\033[92m{curl_command}\033[0m\n")
verbose_logger.info(f"\033[92m{curl_command}\033[0m\n")
if litellm.json_logs:
verbose_logger.info(
"POST Request Sent from LiteLLM",
extra={"api_base": {api_base}, **masked_headers},
)
else:
verbose_logger.info(f"\033[92m{curl_command}\033[0m\n")
if self.logger_fn and callable(self.logger_fn): if self.logger_fn and callable(self.logger_fn):
try: try:
self.logger_fn( self.logger_fn(
@ -1149,7 +1226,6 @@ class Logging:
print_verbose( print_verbose(
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
) )
# Input Integration Logging -> If you want to log the fact that an attempt to call the model was made # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
callbacks = litellm.input_callback + self.dynamic_input_callbacks callbacks = litellm.input_callback + self.dynamic_input_callbacks
for callback in callbacks: for callback in callbacks:
@ -1166,29 +1242,20 @@ class Logging:
litellm_call_id=self.litellm_params["litellm_call_id"], litellm_call_id=self.litellm_params["litellm_call_id"],
print_verbose=print_verbose, print_verbose=print_verbose,
) )
elif callback == "lite_debugger":
print_verbose(
f"reaches litedebugger for logging! - model_call_details {self.model_call_details}"
)
model = self.model_call_details["model"]
messages = self.model_call_details["input"]
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
liteDebuggerClient.input_log_event(
model=model,
messages=messages,
end_user=self.model_call_details.get("user", "default"),
litellm_call_id=self.litellm_params["litellm_call_id"],
litellm_params=self.model_call_details["litellm_params"],
optional_params=self.model_call_details["optional_params"],
print_verbose=print_verbose,
call_type=self.call_type,
)
elif callback == "sentry" and add_breadcrumb: elif callback == "sentry" and add_breadcrumb:
print_verbose("reaches sentry breadcrumbing") try:
details_to_log = copy.deepcopy(self.model_call_details)
except:
details_to_log = self.model_call_details
if litellm.turn_off_message_logging:
# make a copy of the _model_Call_details and log it
details_to_log.pop("messages", None)
details_to_log.pop("input", None)
details_to_log.pop("prompt", None)
add_breadcrumb( add_breadcrumb(
category="litellm.llm_call", category="litellm.llm_call",
message=f"Model Call Details pre-call: {self.model_call_details}", message=f"Model Call Details pre-call: {details_to_log}",
level="info", level="info",
) )
elif isinstance(callback, CustomLogger): # custom logger class elif isinstance(callback, CustomLogger): # custom logger class
@ -1252,7 +1319,7 @@ class Logging:
print_verbose( print_verbose(
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
) )
self.redact_message_input_output_from_logging(result=original_response)
# Input Integration Logging -> If you want to log the fact that an attempt to call the model was made # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
callbacks = litellm.input_callback + self.dynamic_input_callbacks callbacks = litellm.input_callback + self.dynamic_input_callbacks
@ -1270,9 +1337,19 @@ class Logging:
) )
elif callback == "sentry" and add_breadcrumb: elif callback == "sentry" and add_breadcrumb:
print_verbose("reaches sentry breadcrumbing") print_verbose("reaches sentry breadcrumbing")
try:
details_to_log = copy.deepcopy(self.model_call_details)
except:
details_to_log = self.model_call_details
if litellm.turn_off_message_logging:
# make a copy of the _model_Call_details and log it
details_to_log.pop("messages", None)
details_to_log.pop("input", None)
details_to_log.pop("prompt", None)
add_breadcrumb( add_breadcrumb(
category="litellm.llm_call", category="litellm.llm_call",
message=f"Model Call Details post-call: {self.model_call_details}", message=f"Model Call Details post-call: {details_to_log}",
level="info", level="info",
) )
elif isinstance(callback, CustomLogger): # custom logger class elif isinstance(callback, CustomLogger): # custom logger class
@ -1464,6 +1541,8 @@ class Logging:
else: else:
callbacks = litellm.success_callback callbacks = litellm.success_callback
self.redact_message_input_output_from_logging(result=result)
for callback in callbacks: for callback in callbacks:
try: try:
litellm_params = self.model_call_details.get("litellm_params", {}) litellm_params = self.model_call_details.get("litellm_params", {})
@ -1850,6 +1929,51 @@ class Logging:
end_time=end_time, end_time=end_time,
print_verbose=print_verbose, print_verbose=print_verbose,
) )
if (
callback == "openmeter"
and self.model_call_details.get("litellm_params", {}).get(
"acompletion", False
)
== False
and self.model_call_details.get("litellm_params", {}).get(
"aembedding", False
)
== False
and self.model_call_details.get("litellm_params", {}).get(
"aimage_generation", False
)
== False
and self.model_call_details.get("litellm_params", {}).get(
"atranscription", False
)
== False
):
global openMeterLogger
if openMeterLogger is None:
print_verbose("Instantiates openmeter client")
openMeterLogger = OpenMeterLogger()
if self.stream and complete_streaming_response is None:
openMeterLogger.log_stream_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
else:
if self.stream and complete_streaming_response:
self.model_call_details["complete_response"] = (
self.model_call_details.get(
"complete_streaming_response", {}
)
)
result = self.model_call_details["complete_response"]
openMeterLogger.log_success_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
if ( if (
isinstance(callback, CustomLogger) isinstance(callback, CustomLogger)
and self.model_call_details.get("litellm_params", {}).get( and self.model_call_details.get("litellm_params", {}).get(
@ -2008,7 +2132,9 @@ class Logging:
callbacks.append(callback) callbacks.append(callback)
else: else:
callbacks = litellm._async_success_callback callbacks = litellm._async_success_callback
print_verbose(f"Async success callbacks: {callbacks}")
self.redact_message_input_output_from_logging(result=result)
for callback in callbacks: for callback in callbacks:
# check if callback can run for this request # check if callback can run for this request
litellm_params = self.model_call_details.get("litellm_params", {}) litellm_params = self.model_call_details.get("litellm_params", {})
@ -2046,6 +2172,35 @@ class Logging:
await litellm.cache.async_add_cache(result, **kwargs) await litellm.cache.async_add_cache(result, **kwargs)
else: else:
litellm.cache.add_cache(result, **kwargs) litellm.cache.add_cache(result, **kwargs)
if callback == "openmeter":
global openMeterLogger
if self.stream == True:
if (
"async_complete_streaming_response"
in self.model_call_details
):
await openMeterLogger.async_log_success_event(
kwargs=self.model_call_details,
response_obj=self.model_call_details[
"async_complete_streaming_response"
],
start_time=start_time,
end_time=end_time,
)
else:
await openMeterLogger.async_log_stream_event( # [TODO]: move this to being an async log stream event function
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
else:
await openMeterLogger.async_log_success_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
if isinstance(callback, CustomLogger): # custom logger class if isinstance(callback, CustomLogger): # custom logger class
if self.stream == True: if self.stream == True:
if ( if (
@ -2169,7 +2324,10 @@ class Logging:
start_time=start_time, start_time=start_time,
end_time=end_time, end_time=end_time,
) )
result = None # result sent to all loggers, init this to None incase it's not created result = None # result sent to all loggers, init this to None incase it's not created
self.redact_message_input_output_from_logging(result=result)
for callback in litellm.failure_callback: for callback in litellm.failure_callback:
try: try:
if callback == "lite_debugger": if callback == "lite_debugger":
@ -2354,6 +2512,39 @@ class Logging:
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}" f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}"
) )
def redact_message_input_output_from_logging(self, result):
"""
Removes messages, prompts, input, response from logging. This modifies the data in-place
only redacts when litellm.turn_off_message_logging == True
"""
# check if user opted out of logging message/response to callbacks
if litellm.turn_off_message_logging == True:
# remove messages, prompts, input, response from logging
self.model_call_details["messages"] = "redacted-by-litellm"
self.model_call_details["prompt"] = ""
self.model_call_details["input"] = ""
# response cleaning
# ChatCompletion Responses
if self.stream and "complete_streaming_response" in self.model_call_details:
_streaming_response = self.model_call_details[
"complete_streaming_response"
]
for choice in _streaming_response.choices:
if isinstance(choice, litellm.Choices):
choice.message.content = "redacted-by-litellm"
elif isinstance(choice, litellm.utils.StreamingChoices):
choice.delta.content = "redacted-by-litellm"
else:
if result is not None:
if isinstance(result, litellm.ModelResponse):
if hasattr(result, "choices") and result.choices is not None:
for choice in result.choices:
if isinstance(choice, litellm.Choices):
choice.message.content = "redacted-by-litellm"
elif isinstance(choice, litellm.utils.StreamingChoices):
choice.delta.content = "redacted-by-litellm"
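# Usage sketch for the redaction flag read above (the callback choice is illustrative):
import litellm

litellm.turn_off_message_logging = True   # loggers/callbacks receive redacted content
litellm.success_callback = ["langfuse"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "some sensitive text"}],
)
# The caller still gets the real response; only the copy sent to logging integrations
# has messages/prompt/input and choice contents replaced with "redacted-by-litellm".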
def exception_logging( def exception_logging(
additional_args={}, additional_args={},
@ -2436,7 +2627,7 @@ class Rules:
####### CLIENT ################### ####### CLIENT ###################
# make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking # make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking
def function_setup( def function_setup(
original_function, rules_obj, start_time, *args, **kwargs original_function: str, rules_obj, start_time, *args, **kwargs
): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc. ): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc.
try: try:
global callback_list, add_breadcrumb, user_logger_fn, Logging global callback_list, add_breadcrumb, user_logger_fn, Logging
@ -2460,10 +2651,12 @@ def function_setup(
len(litellm.input_callback) > 0 len(litellm.input_callback) > 0
or len(litellm.success_callback) > 0 or len(litellm.success_callback) > 0
or len(litellm.failure_callback) > 0 or len(litellm.failure_callback) > 0
) and len(callback_list) == 0: ) and len(
callback_list # type: ignore
) == 0: # type: ignore
callback_list = list( callback_list = list(
set( set(
litellm.input_callback litellm.input_callback # type: ignore
+ litellm.success_callback + litellm.success_callback
+ litellm.failure_callback + litellm.failure_callback
) )
@ -2472,7 +2665,7 @@ def function_setup(
## ASYNC CALLBACKS ## ASYNC CALLBACKS
if len(litellm.input_callback) > 0: if len(litellm.input_callback) > 0:
removed_async_items = [] removed_async_items = []
for index, callback in enumerate(litellm.input_callback): for index, callback in enumerate(litellm.input_callback): # type: ignore
if inspect.iscoroutinefunction(callback): if inspect.iscoroutinefunction(callback):
litellm._async_input_callback.append(callback) litellm._async_input_callback.append(callback)
removed_async_items.append(index) removed_async_items.append(index)
@ -2483,11 +2676,11 @@ def function_setup(
if len(litellm.success_callback) > 0: if len(litellm.success_callback) > 0:
removed_async_items = [] removed_async_items = []
for index, callback in enumerate(litellm.success_callback): for index, callback in enumerate(litellm.success_callback): # type: ignore
if inspect.iscoroutinefunction(callback): if inspect.iscoroutinefunction(callback):
litellm._async_success_callback.append(callback) litellm._async_success_callback.append(callback)
removed_async_items.append(index) removed_async_items.append(index)
elif callback == "dynamodb": elif callback == "dynamodb" or callback == "openmeter":
# dynamo is an async callback, it's used for the proxy and needs to be async # dynamo is an async callback, it's used for the proxy and needs to be async
# we only support async dynamo db logging for acompletion/aembedding since that's used on proxy # we only support async dynamo db logging for acompletion/aembedding since that's used on proxy
litellm._async_success_callback.append(callback) litellm._async_success_callback.append(callback)
@ -2499,7 +2692,7 @@ def function_setup(
if len(litellm.failure_callback) > 0: if len(litellm.failure_callback) > 0:
removed_async_items = [] removed_async_items = []
for index, callback in enumerate(litellm.failure_callback): for index, callback in enumerate(litellm.failure_callback): # type: ignore
if inspect.iscoroutinefunction(callback): if inspect.iscoroutinefunction(callback):
litellm._async_failure_callback.append(callback) litellm._async_failure_callback.append(callback)
removed_async_items.append(index) removed_async_items.append(index)
@ -2533,16 +2726,26 @@ def function_setup(
dynamic_success_callbacks = kwargs.pop("success_callback") dynamic_success_callbacks = kwargs.pop("success_callback")
if add_breadcrumb: if add_breadcrumb:
try:
details_to_log = copy.deepcopy(kwargs)
except:
details_to_log = kwargs
if litellm.turn_off_message_logging:
# make a copy of the _model_Call_details and log it
details_to_log.pop("messages", None)
details_to_log.pop("input", None)
details_to_log.pop("prompt", None)
add_breadcrumb( add_breadcrumb(
category="litellm.llm_call", category="litellm.llm_call",
message=f"Positional Args: {args}, Keyword Args: {kwargs}", message=f"Positional Args: {args}, Keyword Args: {details_to_log}",
level="info", level="info",
) )
if "logger_fn" in kwargs: if "logger_fn" in kwargs:
user_logger_fn = kwargs["logger_fn"] user_logger_fn = kwargs["logger_fn"]
# INIT LOGGER - for user-specified integrations # INIT LOGGER - for user-specified integrations
model = args[0] if len(args) > 0 else kwargs.get("model", None) model = args[0] if len(args) > 0 else kwargs.get("model", None)
call_type = original_function.__name__ call_type = original_function
if ( if (
call_type == CallTypes.completion.value call_type == CallTypes.completion.value
or call_type == CallTypes.acompletion.value or call_type == CallTypes.acompletion.value
@ -2724,7 +2927,7 @@ def client(original_function):
try: try:
if logging_obj is None: if logging_obj is None:
logging_obj, kwargs = function_setup( logging_obj, kwargs = function_setup(
original_function, rules_obj, start_time, *args, **kwargs original_function.__name__, rules_obj, start_time, *args, **kwargs
) )
kwargs["litellm_logging_obj"] = logging_obj kwargs["litellm_logging_obj"] = logging_obj
@ -3033,7 +3236,7 @@ def client(original_function):
try: try:
if logging_obj is None: if logging_obj is None:
logging_obj, kwargs = function_setup( logging_obj, kwargs = function_setup(
original_function, rules_obj, start_time, *args, **kwargs original_function.__name__, rules_obj, start_time, *args, **kwargs
) )
kwargs["litellm_logging_obj"] = logging_obj kwargs["litellm_logging_obj"] = logging_obj
@ -3540,12 +3743,12 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
a100_80gb_price_per_second_public = ( a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now 0.001400 # assume all calls sent to A100 80GB for now
) )
if total_time == 0.0: if total_time == 0.0: # total time is in ms
start_time = completion_response["created"] start_time = completion_response["created"]
end_time = completion_response["ended"] end_time = completion_response["ended"]
total_time = end_time - start_time total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time return a100_80gb_price_per_second_public * total_time / 1000
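# Worked example of the corrected math, assuming the A100 80GB rate above and a
# Replicate prediction that ran for 5000 ms:
#   total_time = 5000 ms -> 5000 / 1000 = 5 s of compute
#   cost = 0.001400 $/s * 5 s = $0.007
# Without the division by 1000, the same call would have been priced as $7.00.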
def _select_tokenizer(model: str): def _select_tokenizer(model: str):
@ -3567,7 +3770,7 @@ def _select_tokenizer(model: str):
tokenizer = Tokenizer.from_str(json_str) tokenizer = Tokenizer.from_str(json_str)
return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
# llama2 # llama2
elif "llama-2" in model.lower(): elif "llama-2" in model.lower() or "replicate" in model.lower():
tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer") tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
# default - tiktoken # default - tiktoken
@ -4168,7 +4371,10 @@ def completion_cost(
model = get_model_params_and_category(model) model = get_model_params_and_category(model)
# replicate llms are calculate based on time for request running # replicate llms are calculate based on time for request running
# see https://replicate.com/pricing # see https://replicate.com/pricing
elif model in litellm.replicate_models or "replicate" in model: elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time) return get_replicate_completion_pricing(completion_response, total_time)
( (
@ -4554,7 +4760,36 @@ def get_optional_params(
k.startswith("vertex_") and custom_llm_provider != "vertex_ai" k.startswith("vertex_") and custom_llm_provider != "vertex_ai"
): # allow dynamically setting vertex ai init logic ): # allow dynamically setting vertex ai init logic
continue continue
passed_params[k] = v passed_params[k] = v
optional_params = {}
common_auth_dict = litellm.common_cloud_provider_auth_params
if custom_llm_provider in common_auth_dict["providers"]:
"""
Check if params = ["project", "region_name", "token"]
and correctly translate for = ["azure", "vertex_ai", "watsonx", "aws"]
"""
if custom_llm_provider == "azure":
optional_params = litellm.AzureOpenAIConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
elif custom_llm_provider == "bedrock":
optional_params = (
litellm.AmazonBedrockGlobalConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
)
elif custom_llm_provider == "vertex_ai":
optional_params = litellm.VertexAIConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
elif custom_llm_provider == "watsonx":
optional_params = litellm.IBMWatsonXAIConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
default_params = { default_params = {
"functions": None, "functions": None,
"function_call": None, "function_call": None,
@ -4590,7 +4825,7 @@ def get_optional_params(
and v != default_params[k] and v != default_params[k]
) )
} }
optional_params = {}
## raise exception if function calling passed in for a provider that doesn't support it ## raise exception if function calling passed in for a provider that doesn't support it
if ( if (
"functions" in non_default_params "functions" in non_default_params
@ -5268,7 +5503,8 @@ def get_optional_params(
optional_params["tools"] = tools optional_params["tools"] = tools
if tool_choice is not None: if tool_choice is not None:
optional_params["tool_choice"] = tool_choice optional_params["tool_choice"] = tool_choice
if response_format is not None:
optional_params["response_format"] = response_format
# check safe_mode, random_seed: https://docs.mistral.ai/api/#operation/createChatCompletion # check safe_mode, random_seed: https://docs.mistral.ai/api/#operation/createChatCompletion
safe_mode = passed_params.pop("safe_mode", None) safe_mode = passed_params.pop("safe_mode", None)
random_seed = passed_params.pop("random_seed", None) random_seed = passed_params.pop("random_seed", None)
@ -5280,6 +5516,7 @@ def get_optional_params(
optional_params["extra_body"] = ( optional_params["extra_body"] = (
extra_body # openai client supports `extra_body` param extra_body # openai client supports `extra_body` param
) )
elif custom_llm_provider == "groq": elif custom_llm_provider == "groq":
supported_params = get_supported_openai_params( supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider model=model, custom_llm_provider=custom_llm_provider
@ -5360,6 +5597,49 @@ def get_optional_params(
optional_params["extra_body"] = ( optional_params["extra_body"] = (
extra_body # openai client supports `extra_body` param extra_body # openai client supports `extra_body` param
) )
elif custom_llm_provider == "watsonx":
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if max_tokens is not None:
optional_params["max_new_tokens"] = max_tokens
if stream:
optional_params["stream"] = stream
if temperature is not None:
optional_params["temperature"] = temperature
if top_p is not None:
optional_params["top_p"] = top_p
if frequency_penalty is not None:
optional_params["repetition_penalty"] = frequency_penalty
if seed is not None:
optional_params["random_seed"] = seed
if stop is not None:
optional_params["stop_sequences"] = stop
# WatsonX-only parameters
extra_body = {}
if "decoding_method" in passed_params:
extra_body["decoding_method"] = passed_params.pop("decoding_method")
if "min_tokens" in passed_params or "min_new_tokens" in passed_params:
extra_body["min_new_tokens"] = passed_params.pop(
"min_tokens", passed_params.pop("min_new_tokens")
)
if "top_k" in passed_params:
extra_body["top_k"] = passed_params.pop("top_k")
if "truncate_input_tokens" in passed_params:
extra_body["truncate_input_tokens"] = passed_params.pop(
"truncate_input_tokens"
)
if "length_penalty" in passed_params:
extra_body["length_penalty"] = passed_params.pop("length_penalty")
if "time_limit" in passed_params:
extra_body["time_limit"] = passed_params.pop("time_limit")
if "return_options" in passed_params:
extra_body["return_options"] = passed_params.pop("return_options")
optional_params["extra_body"] = (
extra_body # openai client supports `extra_body` param
)
else: # assume passing in params for openai/azure openai else: # assume passing in params for openai/azure openai
print_verbose( print_verbose(
f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE - model={model}, custom_llm_provider={custom_llm_provider}" f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE - model={model}, custom_llm_provider={custom_llm_provider}"
@ -5762,6 +6042,8 @@ def get_supported_openai_params(model: str, custom_llm_provider: str):
"frequency_penalty", "frequency_penalty",
"presence_penalty", "presence_penalty",
] ]
elif custom_llm_provider == "watsonx":
return litellm.IBMWatsonXAIConfig().get_supported_openai_params()
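# A call sketch showing how the watsonx parameter mapping above plays out; parameter
# values are illustrative and watsonx credentials are assumed to already be configured
# via the provider's environment variables.
from litellm import completion

response = completion(
    model="watsonx/ibm/granite-13b-chat-v2",
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=20,               # -> max_new_tokens
    temperature=0.5,
    frequency_penalty=1.1,       # -> repetition_penalty
    stop=["\n\n"],               # -> stop_sequences
    decoding_method="sample",    # watsonx-only param, forwarded via extra_body
    truncate_input_tokens=2048,  # watsonx-only param, forwarded via extra_body
)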
def get_formatted_prompt( def get_formatted_prompt(
@ -5989,6 +6271,8 @@ def get_llm_provider(
model in litellm.bedrock_models or model in litellm.bedrock_embedding_models model in litellm.bedrock_models or model in litellm.bedrock_embedding_models
): ):
custom_llm_provider = "bedrock" custom_llm_provider = "bedrock"
elif model in litellm.watsonx_models:
custom_llm_provider = "watsonx"
# openai embeddings # openai embeddings
elif model in litellm.open_ai_embedding_models: elif model in litellm.open_ai_embedding_models:
custom_llm_provider = "openai" custom_llm_provider = "openai"
@ -6453,7 +6737,7 @@ def validate_environment(model: Optional[str] = None) -> dict:
if "VERTEXAI_PROJECT" in os.environ and "VERTEXAI_LOCATION" in os.environ: if "VERTEXAI_PROJECT" in os.environ and "VERTEXAI_LOCATION" in os.environ:
keys_in_environment = True keys_in_environment = True
else: else:
missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_PROJECT"]) missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_LOCATION"])
elif custom_llm_provider == "huggingface": elif custom_llm_provider == "huggingface":
if "HUGGINGFACE_API_KEY" in os.environ: if "HUGGINGFACE_API_KEY" in os.environ:
keys_in_environment = True keys_in_environment = True
@ -6579,11 +6863,11 @@ def validate_environment(model: Optional[str] = None) -> dict:
def set_callbacks(callback_list, function_id=None): def set_callbacks(callback_list, function_id=None):
global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger
try: try:
for callback in callback_list: for callback in callback_list:
print_verbose(f"callback: {callback}") print_verbose(f"init callback list: {callback}")
if callback == "sentry": if callback == "sentry":
try: try:
import sentry_sdk import sentry_sdk
@ -6646,6 +6930,8 @@ def set_callbacks(callback_list, function_id=None):
promptLayerLogger = PromptLayerLogger() promptLayerLogger = PromptLayerLogger()
elif callback == "langfuse": elif callback == "langfuse":
langFuseLogger = LangFuseLogger() langFuseLogger = LangFuseLogger()
elif callback == "openmeter":
openMeterLogger = OpenMeterLogger()
elif callback == "datadog": elif callback == "datadog":
dataDogLogger = DataDogLogger() dataDogLogger = DataDogLogger()
elif callback == "prometheus": elif callback == "prometheus":
@ -6982,6 +7268,7 @@ def convert_to_model_response_object(
end_time=None, end_time=None,
hidden_params: Optional[dict] = None, hidden_params: Optional[dict] = None,
): ):
received_args = locals()
try: try:
if response_type == "completion" and ( if response_type == "completion" and (
model_response_object is None model_response_object is None
@ -6993,6 +7280,11 @@ def convert_to_model_response_object(
# for returning cached responses, we need to yield a generator # for returning cached responses, we need to yield a generator
return convert_to_streaming_response(response_object=response_object) return convert_to_streaming_response(response_object=response_object)
choice_list = [] choice_list = []
assert response_object["choices"] is not None and isinstance(
response_object["choices"], Iterable
)
for idx, choice in enumerate(response_object["choices"]): for idx, choice in enumerate(response_object["choices"]):
message = Message( message = Message(
content=choice["message"].get("content", None), content=choice["message"].get("content", None),
@ -7036,9 +7328,10 @@ def convert_to_model_response_object(
model_response_object.model = response_object["model"] model_response_object.model = response_object["model"]
if start_time is not None and end_time is not None: if start_time is not None and end_time is not None:
model_response_object._response_ms = ( # type: ignore if isinstance(start_time, type(end_time)):
end_time - start_time model_response_object._response_ms = ( # type: ignore
).total_seconds() * 1000 end_time - start_time
).total_seconds() * 1000
if hidden_params is not None: if hidden_params is not None:
model_response_object._hidden_params = hidden_params model_response_object._hidden_params = hidden_params
@ -7113,7 +7406,9 @@ def convert_to_model_response_object(
model_response_object._hidden_params = hidden_params model_response_object._hidden_params = hidden_params
return model_response_object return model_response_object
except Exception as e: except Exception as e:
raise Exception(f"Invalid response object {traceback.format_exc()}") raise Exception(
f"Invalid response object {traceback.format_exc()}\n\nreceived_args={received_args}"
)
def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call
@ -7940,7 +8235,10 @@ def exception_type(
llm_provider="vertex_ai", llm_provider="vertex_ai",
response=original_exception.response, response=original_exception.response,
) )
elif "None Unknown Error." in error_str: elif (
"None Unknown Error." in error_str
or "Content has no parts." in error_str
):
exception_mapping_worked = True exception_mapping_worked = True
raise APIError( raise APIError(
message=f"VertexAIException - {error_str}", message=f"VertexAIException - {error_str}",
@ -9393,9 +9691,14 @@ class CustomStreamWrapper:
is_finished = True is_finished = True
finish_reason = str_line.choices[0].finish_reason finish_reason = str_line.choices[0].finish_reason
if finish_reason == "content_filter": if finish_reason == "content_filter":
error_message = json.dumps( if hasattr(str_line.choices[0], "content_filter_result"):
str_line.choices[0].content_filter_result error_message = json.dumps(
) str_line.choices[0].content_filter_result
)
else:
error_message = "Azure Response={}".format(
str(dict(str_line))
)
raise litellm.AzureOpenAIError( raise litellm.AzureOpenAIError(
status_code=400, message=error_message status_code=400, message=error_message
) )
@ -9683,6 +9986,39 @@ class CustomStreamWrapper:
"finish_reason": finish_reason, "finish_reason": finish_reason,
} }
def handle_watsonx_stream(self, chunk):
try:
if isinstance(chunk, dict):
parsed_response = chunk
elif isinstance(chunk, (str, bytes)):
if isinstance(chunk, bytes):
chunk = chunk.decode("utf-8")
if "generated_text" in chunk:
response = chunk.replace("data: ", "").strip()
parsed_response = json.loads(response)
else:
return {"text": "", "is_finished": False}
else:
print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
raise ValueError(
f"Unable to parse response. Original response: {chunk}"
)
results = parsed_response.get("results", [])
if len(results) > 0:
text = results[0].get("generated_text", "")
finish_reason = results[0].get("stop_reason")
is_finished = finish_reason != "not_finished"
return {
"text": text,
"is_finished": is_finished,
"finish_reason": finish_reason,
"prompt_tokens": results[0].get("input_token_count", None),
"completion_tokens": results[0].get("generated_token_count", None),
}
return {"text": "", "is_finished": False}
except Exception as e:
raise e
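# Reconstructed example of the SSE payload this handler expects (values illustrative):
_example_chunk = (
    'data: {"results": [{"generated_text": " Hello", "generated_token_count": 3, '
    '"input_token_count": 12, "stop_reason": "not_finished"}]}'
)
# handle_watsonx_stream strips the "data: " prefix, JSON-decodes the payload, and returns
# {"text": " Hello", "is_finished": False, "finish_reason": "not_finished",
#  "prompt_tokens": 12, "completion_tokens": 3}; any stop_reason other than
# "not_finished" marks the final chunk of the stream.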
def model_response_creator(self): def model_response_creator(self):
model_response = ModelResponse(stream=True, model=self.model) model_response = ModelResponse(stream=True, model=self.model)
if self.response_id is not None: if self.response_id is not None:
@ -9938,6 +10274,11 @@ class CustomStreamWrapper:
print_verbose(f"completion obj content: {completion_obj['content']}") print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]: if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"] self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "watsonx":
response_obj = self.handle_watsonx_stream(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "text-completion-openai": elif self.custom_llm_provider == "text-completion-openai":
response_obj = self.handle_openai_text_completion_chunk(chunk) response_obj = self.handle_openai_text_completion_chunk(chunk)
completion_obj["content"] = response_obj["text"] completion_obj["content"] = response_obj["text"]
@ -10123,12 +10464,23 @@ class CustomStreamWrapper:
model_response.id = original_chunk.id model_response.id = original_chunk.id
self.response_id = original_chunk.id self.response_id = original_chunk.id
if len(original_chunk.choices) > 0: if len(original_chunk.choices) > 0:
try: choices = []
delta = dict(original_chunk.choices[0].delta) for idx, choice in enumerate(original_chunk.choices):
print_verbose(f"original delta: {delta}") try:
model_response.choices[0].delta = Delta(**delta) if isinstance(choice, BaseModel):
except Exception as e: try:
model_response.choices[0].delta = Delta() choice_json = choice.model_dump()
except Exception as e:
choice_json = choice.dict()
choice_json.pop(
"finish_reason", None
) # for mistral etc. which return a value in their last chunk (not-openai compatible).
print_verbose(f"choice_json: {choice_json}")
choices.append(StreamingChoices(**choice_json))
except Exception as e:
choices.append(StreamingChoices())
print_verbose(f"choices in streaming: {choices}")
model_response.choices = choices
else: else:
return return
model_response.system_fingerprint = ( model_response.system_fingerprint = (
@ -10173,11 +10525,11 @@ class CustomStreamWrapper:
) )
self.holding_chunk = "" self.holding_chunk = ""
# if delta is None # if delta is None
is_delta_empty = self.is_delta_empty( _is_delta_empty = self.is_delta_empty(
delta=model_response.choices[0].delta delta=model_response.choices[0].delta
) )
if is_delta_empty: if _is_delta_empty:
# get any function call arguments # get any function call arguments
model_response.choices[0].finish_reason = map_finish_reason( model_response.choices[0].finish_reason = map_finish_reason(
finish_reason=self.received_finish_reason finish_reason=self.received_finish_reason


@ -1418,6 +1418,123 @@
"litellm_provider": "replicate", "litellm_provider": "replicate",
"mode": "chat" "mode": "chat"
}, },
"replicate/meta/llama-2-13b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-13b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-instruct-v0.2": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.000001,
"litellm_provider": "replicate",
"mode": "chat"
},
"openrouter/openai/gpt-3.5-turbo": { "openrouter/openai/gpt-3.5-turbo": {
"max_tokens": 4095, "max_tokens": 4095,
"input_cost_per_token": 0.0000015, "input_cost_per_token": 0.0000015,
@ -1455,6 +1572,17 @@
"litellm_provider": "openrouter", "litellm_provider": "openrouter",
"mode": "chat" "mode": "chat"
}, },
"openrouter/anthropic/claude-3-opus": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"litellm_provider": "openrouter",
"mode": "chat",
"supports_function_calling": true,
"tool_use_system_prompt_tokens": 395
},
"openrouter/google/palm-2-chat-bison": { "openrouter/google/palm-2-chat-bison": {
"max_tokens": 8000, "max_tokens": 8000,
"input_cost_per_token": 0.0000005, "input_cost_per_token": 0.0000005,
@ -2379,6 +2507,24 @@
"litellm_provider": "bedrock", "litellm_provider": "bedrock",
"mode": "chat" "mode": "chat"
}, },
"meta.llama3-8b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000006,
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-70b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000265,
"output_cost_per_token": 0.0000035,
"litellm_provider": "bedrock",
"mode": "chat"
},
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
"max_tokens": 77, "max_tokens": 77,
"max_input_tokens": 77, "max_input_tokens": 77,

View file

@@ -61,14 +61,14 @@ model_list:
       api_key: my-fake-key
       api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
       stream_timeout: 0.001
-      rpm: 10
+      rpm: 100
   - model_name: fake-openai-endpoint-3
     litellm_params:
       model: openai/my-fake-model-2
       api_key: my-fake-key
       api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
       stream_timeout: 0.001
-      rpm: 10
+      rpm: 100
   - model_name: "*"
     litellm_params:
       model: openai/*
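For reference, rpm on a deployment is the requests-per-minute ceiling the litellm router uses when spreading load across deployments; raising it from 10 to 100 lets these fake test endpoints absorb more traffic. A minimal Python equivalent of the same idea, with the placeholder names and keys taken from the config above:

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint-3",
            "litellm_params": {
                "model": "openai/my-fake-model-2",
                "api_key": "my-fake-key",
                "api_base": "https://openai-function-calling-workers.tasslexyz.workers.dev/",
                "rpm": 100,  # requests-per-minute cap for this deployment
            },
        }
    ]
)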

View file

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.35.27"
+version = "1.35.36"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -80,7 +80,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.35.27"
+version = "1.35.36"
 version_files = [
     "pyproject.toml:^version"
 ]

Some files were not shown because too many files have changed in this diff.