Merge branch 'main' into litellm_aws_kms_fixes

This commit is contained in:
Krish Dholakia 2024-06-10 20:17:34 -07:00 committed by GitHub
commit 4475d2e5b5
34 changed files with 1293 additions and 483 deletions

View file

@ -150,7 +150,7 @@ $ litellm --config /path/to/config.yaml
```bash
curl "http://0.0.0.0:4000/v1/assistants?order=desc&limit=20" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-H "Authorization: Bearer sk-1234"
```
**Create a Thread**
@ -162,6 +162,14 @@ curl http://0.0.0.0:4000/v1/threads \
-d ''
```
**Get a Thread**
```bash
curl http://0.0.0.0:4000/v1/threads/{thread_id} \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234"
```
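The same call can be made from the OpenAI Python SDK pointed at the proxy. A minimal sketch, assuming the proxy is running locally on port 4000 with the `sk-1234` key from the examples above; the thread id is a placeholder:
```python
import openai

# Point the OpenAI client at the LiteLLM proxy
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# Retrieve a thread by id (placeholder id shown)
thread = client.beta.threads.retrieve(thread_id="thread_abc123")
print(thread)
```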
**Add Messages to the Thread**
```bash

View file

@ -212,6 +212,94 @@ If you run the code two times, response1 will use the cache from the first run t
</TabItem>
</Tabs>
## Switch Cache On / Off Per LiteLLM Call
LiteLLM supports 4 cache-controls:
- `no-cache`: *Optional(bool)* When `True`, will not return a cached response; instead, the actual endpoint is called.
- `no-store`: *Optional(bool)* When `True`, will not cache the response.
- `ttl`: *Optional(int)* Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: *Optional(int)* Will only accept cached responses that are within the user-defined age (in seconds).
[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)
<Tabs>
<TabItem value="no-cache" label="No-Cache">
Example usage `no-cache` - When `True`, will not return a cached response
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"no-cache": True},
)
```
</TabItem>
<TabItem value="no-store" label="No-Store">
Example usage `no-store` - When `True`, will not cache the response.
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"no-store": True},
)
```
</TabItem>
<TabItem value="ttl" label="ttl">
Example usage `ttl` - cache the response for 10 seconds
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"ttl": 10},
)
```
</TabItem>
<TabItem value="s-maxage" label="s-maxage">
Example usage `s-maxage` - Will only accept cached responses that are at most 60 seconds old
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"s-maxage": 60},
)
```
</TabItem>
</Tabs>
## Cache Context Manager - Enable, Disable, Update Cache

View file

@ -1,3 +1,5 @@
# llmcord.py
llmcord.py lets you and your friends chat with LLMs directly in your Discord server. It works with practically any LLM, remote or locally hosted.
GitHub: https://github.com/jakobdylanc/discord-llm-chatbot

View file

@ -138,14 +138,22 @@ Navigate to the Usage Tab on the LiteLLM UI (found on https://your-proxy-endpoin
<Image img={require('../../img/admin_ui_spend.png')} />
## API Endpoints to get Spend
#### Getting Spend Reports - To Charge Other Teams, API Keys
#### Getting Spend Reports - To Charge Other Teams, Customers
Use the `/global/spend/report` endpoint to get daily spend per team, with a breakdown of spend per API Key, Model
Use the `/global/spend/report` endpoint to get a daily spend report per
- team
- customer [this is the `user` passed to the `/chat/completions` request](#how-to-track-spend-with-litellm)
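The same report can be pulled programmatically. A minimal Python sketch, assuming the proxy runs at `localhost:4000` and `sk-1234` is an admin key; the date range mirrors the curl examples below:
```python
import requests

resp = requests.get(
    "http://localhost:4000/global/spend/report",
    params={
        "start_date": "2024-04-01",
        "end_date": "2024-06-30",
        "group_by": "customer",  # matches the per-customer response shape shown below
    },
    headers={"Authorization": "Bearer sk-1234"},
)

# Iterate over daily spend per customer
for day in resp.json():
    for customer in day["customers"]:
        print(day["group_by_day"], customer["customer"], customer["total_spend"])
```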
<Tabs>
<TabItem value="per team" label="Spend Per Team">
##### Example Request
👉 Key Change: Specify `group_by=team`
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=team' \
-H 'Authorization: Bearer sk-1234'
```
@ -254,6 +262,69 @@ Output from script
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="per customer" label="Spend Per Customer">
##### Example Request
👉 Key Change: Specify `group_by=customer`
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=customer' \
-H 'Authorization: Bearer sk-1234'
```
##### Example Response
```shell
[
{
"group_by_day": "2024-04-30T00:00:00+00:00",
"customers": [
{
"customer": "palantir",
"total_spend": 0.0015265,
"metadata": [ # see the spend by unique(key + model)
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "88dc28.." # the hashed api key
},
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "a73dc2.." # the hashed api key
},
{
"model": "chatgpt-v-2",
"spend": 0.000214,
"total_tokens": 122,
"api_key": "898c28.." # the hashed api key
},
{
"model": "gpt-3.5-turbo",
"spend": 0.0000825,
"total_tokens": 85,
"api_key": "84dc28.." # the hashed api key
}
]
}
]
}
]
```
</TabItem>
</Tabs>

View file

@ -42,6 +42,14 @@ Set `JSON_LOGS="True"` in your env:
```bash
export JSON_LOGS="True"
```
**OR**
Set `json_logs: true` in your yaml:
```yaml
litellm_settings:
json_logs: true
```
Start proxy
@ -50,3 +58,34 @@ $ litellm
```
The proxy will now output all logs in JSON format.
## Control Log Output
Turn off FastAPI's default 'INFO' logs
1. Turn on 'json logs'
```yaml
litellm_settings:
json_logs: true
```
2. Set `LITELLM_LOG` to 'ERROR'
Only get logs if an error occurs.
```bash
LITELLM_LOG="ERROR"
```
3. Start proxy
```bash
$ litellm
```
Expected Output:
```bash
# no info statements
```

View file

@ -2,11 +2,21 @@
Call management endpoints on behalf of a user. (Useful when connecting the proxy to your development platform.)
:::info
Requires Enterprise License for usage.
:::tip
Requires an Enterprise License. Get in touch with us [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Set `LiteLLM-Changed-By` in request headers
## 1. Switch on audit Logs
Add `store_audit_logs` to your litellm config.yaml and then start the proxy
```shell
litellm_settings:
store_audit_logs: true
```
## 2. Set `LiteLLM-Changed-By` in request headers
Set the 'user_id' in the request headers when calling a management endpoint. [View Full List](https://litellm-api.up.railway.app/#/team%20management).
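A minimal Python sketch of the same pattern, assuming the proxy runs locally with admin key `sk-1234`; the team id, changed-by user id, and update payload are placeholders:
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/team/update",
    headers={
        "Authorization": "Bearer sk-1234",
        "LiteLLM-Changed-By": "changed-by-user@example.com",  # 'user_id' of the user making the change
        "Content-Type": "application/json",
    },
    json={"team_id": "my-team-id", "max_budget": 50},  # placeholder update
)
print(resp.json())
```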
@ -26,7 +36,7 @@ curl -X POST 'http://0.0.0.0:4000/team/update' \
}'
```
## Emitted Audit Log
## 3. Emitted Audit Log
```bash
{

View file

@ -21,6 +21,7 @@ general_settings:
litellm_settings:
set_verbose: False # Switch off Debug Logging, ensure your logs do not have any debugging on
json_logs: true # Get debug logs in json format
```
Set slack webhook url in your env
@ -28,6 +29,11 @@ Set slack webhook url in your env
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/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"
```
Turn off FastAPI's default INFO logs
```bash
export LITELLM_LOG="ERROR"
```
:::info
Need help or want dedicated support? Talk to a founder [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat).

View file

@ -2,18 +2,13 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🔥 Fallbacks, Retries, Timeouts, Load Balancing
# 🔥 Load Balancing, Fallbacks, Retries, Timeouts
Retry calls with multiple instances of the same model.
If a call fails after num_retries, fall back to another model group.
If the error is a context window exceeded error, fall back to a larger model group (if given).
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
- Quick Start [load balancing](#test---load-balancing)
- Quick Start [client side fallbacks](#test---client-side-fallbacks)
## Quick Start - Load Balancing
### Step 1 - Set deployments on config
#### Step 1 - Set deployments on config
**Example config below**. Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
```yaml
@ -38,14 +33,49 @@ model_list:
rpm: 1440
```
### Step 2: Start Proxy with config
#### Step 2: Start Proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
### Step 3: Use proxy - Call a model group [Load Balancing]
Curl Command
### Test - Load Balancing
Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
👉 Key Change: `model="gpt-3.5-turbo"`
**Check the `model_id` in Response Headers to make sure the requests are being load balanced**
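One way to inspect those headers from Python is the SDK's raw-response interface. A minimal sketch; the `x-litellm-model-id` header name is an assumption, check the response headers returned by your proxy for the exact key:
```python
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000")

raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "this is a test request, write a short poem"}],
)

print(raw.headers.get("x-litellm-model-id"))  # which deployment served this call (assumed header name)
print(raw.parse())  # the usual ChatCompletion object
```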
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
]
)
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -56,32 +86,167 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
"role": "user",
"content": "what llm are you"
}
],
}
'
]
}'
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model="gpt-3.5-turbo",
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
### Usage - Call a specific model deployment
If you want to call a specific model defined in the `config.yaml`, you can call the `litellm_params: model`
</TabItem>
In this example it will call `azure/gpt-turbo-small-ca`. Defined in the config on Step 1
</Tabs>
```bash
### Test - Client Side Fallbacks
In this request the following will occur:
1. The request to `model="zephyr-beta"` will fail
2. litellm proxy will loop through all the model_groups specified in `fallbacks=["gpt-3.5-turbo"]`
3. The request to `model="gpt-3.5-turbo"` will succeed and the client making the request will get a response from gpt-3.5-turbo
👉 Key Change: `"fallbacks": ["gpt-3.5-turbo"]`
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="zephyr-beta",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
}
)
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "azure/gpt-turbo-small-ca",
"model": "zephyr-beta"",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
'
}'
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model="zephyr-beta",
extra_body={
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
}
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
## Fallbacks + Retries + Timeouts + Cooldowns
</TabItem>
</Tabs>
<!--
### Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "zephyr-beta", # 👈 MODEL NAME to fallback from
"messages": [
{"role": "user", "content": "what color is red"}
],
"mock_testing_fallbacks": true
}'
``` -->
## Advanced
### Fallbacks + Retries + Timeouts + Cooldowns
**Set via config**
```yaml
@ -114,44 +279,7 @@ litellm_settings:
context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
allowed_fails: 3 # cooldown model if it fails > 3 calls in a minute.
```
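The same settings can also be applied on the Python `Router` directly. A minimal sketch, assuming `model_list` holds the deployments from the config above:
```python
from litellm import Router

router = Router(
    model_list=model_list,  # deployments defined as in the config above
    num_retries=2,
    fallbacks=[{"zephyr-beta": ["gpt-3.5-turbo"]}],
    context_window_fallbacks=[
        {"zephyr-beta": ["gpt-3.5-turbo-16k"]},
        {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]},
    ],
    allowed_fails=3,  # cooldown a deployment after it exceeds the allowed failures in a minute
)
```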
**Set dynamically**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "zephyr-beta",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
"context_window_fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
"num_retries": 2,
"timeout": 10
}
'
```
### Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "zephyr-beta", # 👈 MODEL NAME to fallback from
"messages": [
{"role": "user", "content": "what color is red"}
],
"mock_testing_fallbacks": true
}'
```
## Advanced - Context Window Fallbacks (Pre-Call Checks + Fallbacks)
### Context Window Fallbacks (Pre-Call Checks + Fallbacks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.
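The same check can be switched on when using the Python `Router` directly. A minimal sketch, assuming `model_list` holds your deployments:
```python
from litellm import Router

# Before routing, filter out deployments whose context window is too small for the request
router = Router(model_list=model_list, enable_pre_call_checks=True)
```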
@ -287,7 +415,7 @@ print(response)
</Tabs>
## Advanced - EU-Region Filtering (Pre-Call Checks)
### EU-Region Filtering (Pre-Call Checks)
**Before call is made** check if the request can be routed to an EU-region deployment with **`enable_pre_call_checks: true`**.
@ -350,7 +478,7 @@ print(response)
print(f"response.headers.get('x-litellm-model-api-base')")
```
## Advanced - Custom Timeouts, Stream Timeouts - Per Model
### Custom Timeouts, Stream Timeouts - Per Model
For each model you can set `timeout` & `stream_timeout` under `litellm_params`
```yaml
model_list:
@ -379,7 +507,7 @@ $ litellm --config /path/to/config.yaml
```
## Advanced - Setting Dynamic Timeouts - Per Request
### Setting Dynamic Timeouts - Per Request
LiteLLM Proxy supports setting a `timeout` per request
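A minimal sketch of sending a per-request timeout through the OpenAI SDK; forwarding it via `extra_body` is an assumption here, mirror whatever request-body field your proxy version expects:
```python
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "what llm are you"}],
    extra_body={"timeout": 1},  # assumed request-body field: give this call 1 second
)
print(response)
```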

View file

@ -255,6 +255,7 @@ const sidebars = {
"projects/GPT Migrate",
"projects/YiVal",
"projects/LiteLLM Proxy",
"projects/llm_cord",
],
},
],

View file

@ -709,6 +709,7 @@ all_embedding_models = (
openai_image_generation_models = ["dall-e-2", "dall-e-3"]
from .timeout import timeout
from .cost_calculator import completion_cost
from .utils import (
client,
exception_type,
@ -718,7 +719,6 @@ from .utils import (
create_pretrained_tokenizer,
create_tokenizer,
cost_per_token,
completion_cost,
supports_function_calling,
supports_parallel_function_calling,
supports_vision,

View file

@ -1,6 +1,7 @@
# What is this?
## File for 'response_cost' calculation in Logging
from typing import Optional, Union, Literal
from typing import Optional, Union, Literal, List
import litellm._logging
from litellm.utils import (
ModelResponse,
EmbeddingResponse,
@ -8,10 +9,281 @@ from litellm.utils import (
TranscriptionResponse,
TextCompletionResponse,
CallTypes,
completion_cost,
cost_per_token,
print_verbose,
CostPerToken,
token_counter,
)
import litellm
from litellm import verbose_logger
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name) -> str:
"""
Helper function for calculating together ai pricing.
Returns
- str - model pricing category if mapped else received model name
"""
import re
model_name = model_name.lower()
re_params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if re_params_match is not None:
params_match = str(re_params_match.group(1))
params_match = params_match.replace("b", "")
if params_match is not None:
params_billion = float(params_match)
else:
return model_name
# Determine the category based on the number of parameters
if params_billion <= 4.0:
category = "together-ai-up-to-4b"
elif params_billion <= 8.0:
category = "together-ai-4.1b-8b"
elif params_billion <= 21.0:
category = "together-ai-8.1b-21b"
elif params_billion <= 41.0:
category = "together-ai-21.1b-41b"
elif params_billion <= 80.0:
category = "together-ai-41.1b-80b"
elif params_billion <= 110.0:
category = "together-ai-81.1b-110b"
if category is not None:
return category
return model_name
def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
# see https://replicate.com/pricing
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now
)
if total_time == 0.0: # total time is in ms
start_time = completion_response["created"]
end_time = getattr(completion_response, "ended", time.time())
total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time / 1000
def completion_cost(
completion_response=None,
model: Optional[str] = None,
prompt="",
messages: List = [],
completion="",
total_time=0.0, # used for replicate, sagemaker
call_type: Literal[
"embedding",
"aembedding",
"completion",
"acompletion",
"atext_completion",
"text_completion",
"image_generation",
"aimage_generation",
"moderation",
"amoderation",
"atranscription",
"transcription",
"aspeech",
"speech",
] = "completion",
### REGION ###
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ###
size=None,
quality=None,
n=None, # number of images
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> float:
"""
Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, any litellm supported llm.
Parameters:
completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request.
[OPTIONAL PARAMS]
model (str): Optional. The name of the language model used in the completion calls
prompt (str): Optional. The input prompt passed to the llm
completion (str): Optional. The output completion text from the llm
total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
Returns:
float: The cost in USD dollars for the completion based on the provided parameters.
Exceptions:
Raises exception if model not in the litellm model cost map. Register model, via custom pricing or PR - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
Note:
- If completion_response is provided, the function extracts token information and the model name from it.
- If completion_response is not provided, the function calculates token counts based on the model and input text.
- The cost is calculated based on the model, prompt tokens, and completion tokens.
- For certain models containing "togethercomputer" in the name, prices are based on the model size.
- For un-mapped Replicate models, the cost is calculated based on the total time used for the request.
"""
try:
if (
(call_type == "aimage_generation" or call_type == "image_generation")
and model is not None
and isinstance(model, str)
and len(model) == 0
and custom_llm_provider == "azure"
):
model = "dall-e-2" # for dall-e-2, azure expects an empty model name
# Handle Inputs to completion_cost
prompt_tokens = 0
completion_tokens = 0
custom_llm_provider = None
if completion_response is not None:
# get input/output tokens from completion_response
prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0)
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
total_time = completion_response.get("_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
)
model = model or completion_response.get(
"model", None
) # check if user passed an override for model, if it's none check completion_response['model']
if hasattr(completion_response, "_hidden_params"):
if (
completion_response._hidden_params.get("model", None) is not None
and len(completion_response._hidden_params["model"]) > 0
):
model = completion_response._hidden_params.get("model", model)
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", ""
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
)
size = completion_response._hidden_params.get(
"optional_params", {}
).get(
"size", "1024-x-1024"
) # openai default
quality = completion_response._hidden_params.get(
"optional_params", {}
).get(
"quality", "standard"
) # openai default
n = completion_response._hidden_params.get("optional_params", {}).get(
"n", 1
) # openai default
else:
if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages)
elif len(prompt) > 0:
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if (
call_type == CallTypes.image_generation.value
or call_type == CallTypes.aimage_generation.value
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
# https://cloud.google.com/vertex-ai/generative-ai/pricing
# Vertex Charges Flat $0.20 per image
return 0.020
# fix size to match naming convention
if "x" in size and "-x-" not in size:
size = size.replace("x", "-x-")
image_gen_model_name = f"{size}/{model}"
image_gen_model_name_with_quality = image_gen_model_name
if quality is not None:
image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}"
size = size.split("-x-")
height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024
width = int(size[1])
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
verbose_logger.debug(
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
)
if image_gen_model_name in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"]
* height
* width
* n
)
elif image_gen_model_name_with_quality in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name_with_quality][
"input_cost_per_pixel"
]
* height
* width
* n
)
else:
raise Exception(
f"Model={image_gen_model_name} not found in completion cost model map"
)
# Calculate cost based on prompt_tokens, completion_tokens
if (
"togethercomputer" in model
or "together_ai" in model
or custom_llm_provider == "together_ai"
):
# together ai prices based on size of llm
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
model = get_model_params_and_category(model)
# replicate llms are calculate based on time for request running
# see https://replicate.com/pricing
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time)
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = cost_per_token(
model=model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider,
response_time_ms=total_time,
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e:
raise e
def response_cost_calculator(
@ -47,7 +319,7 @@ def response_cost_calculator(
) -> Optional[float]:
try:
response_cost: float = 0.0
if cache_hit is not None and cache_hit == True:
if cache_hit is not None and cache_hit is True:
response_cost = 0.0
else:
response_object._hidden_params["optional_params"] = optional_params
@ -62,9 +334,11 @@ def response_cost_calculator(
if (
model in litellm.model_cost
and custom_pricing is not None
and custom_llm_provider == True
and custom_llm_provider is True
): # override defaults if custom pricing is set
base_model = model
elif base_model is None:
base_model = model
# base_model defaults to None if not set on model_info
response_cost = completion_cost(
completion_response=response_object,

View file

@ -20,7 +20,7 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore
message,
llm_provider,
model,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -32,8 +32,14 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
self.response = response or httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
), # mock request object
)
super().__init__(
self.message, response=response, body=None
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -60,7 +66,7 @@ class NotFoundError(openai.NotFoundError): # type: ignore
message,
model,
llm_provider,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -72,8 +78,14 @@ class NotFoundError(openai.NotFoundError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
self.response = response or httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
), # mock request object
)
super().__init__(
self.message, response=response, body=None
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -262,7 +274,7 @@ class RateLimitError(openai.RateLimitError): # type: ignore
message,
llm_provider,
model,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -274,8 +286,18 @@ class RateLimitError(openai.RateLimitError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=429,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=response, body=None
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -421,7 +443,7 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
message,
llm_provider,
model,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -433,8 +455,18 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=response, body=None
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -460,7 +492,7 @@ class InternalServerError(openai.InternalServerError): # type: ignore
message,
llm_provider,
model,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -472,8 +504,18 @@ class InternalServerError(openai.InternalServerError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=response, body=None
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):

View file

@ -366,8 +366,6 @@ class OpenTelemetry(CustomLogger):
)
message = choice.get("message")
if not isinstance(message, dict):
message = message.dict()
tool_calls = message.get("tool_calls")
if tool_calls:
span.set_attribute(

View file

@ -3,6 +3,7 @@
from functools import partial
import os, types
import traceback
import json
from enum import Enum
import requests, copy # type: ignore
@ -242,12 +243,12 @@ class PredibaseChatCompletion(BaseLLM):
"details" in completion_response
and "tokens" in completion_response["details"]
):
model_response.choices[0].finish_reason = completion_response[
"details"
]["finish_reason"]
model_response.choices[0].finish_reason = map_finish_reason(
completion_response["details"]["finish_reason"]
)
sum_logprob = 0
for token in completion_response["details"]["tokens"]:
if token["logprob"] != None:
if token["logprob"] is not None:
sum_logprob += token["logprob"]
model_response["choices"][0][
"message"
@ -265,7 +266,7 @@ class PredibaseChatCompletion(BaseLLM):
):
sum_logprob = 0
for token in item["tokens"]:
if token["logprob"] != None:
if token["logprob"] is not None:
sum_logprob += token["logprob"]
if len(item["generated_text"]) > 0:
message_obj = Message(
@ -275,7 +276,7 @@ class PredibaseChatCompletion(BaseLLM):
else:
message_obj = Message(content=None)
choice_obj = Choices(
finish_reason=item["finish_reason"],
finish_reason=map_finish_reason(item["finish_reason"]),
index=idx + 1,
message=message_obj,
)
@ -285,10 +286,8 @@ class PredibaseChatCompletion(BaseLLM):
## CALCULATING USAGE
prompt_tokens = 0
try:
prompt_tokens = len(
encoding.encode(model_response["choices"][0]["message"]["content"])
) ##[TODO] use a model-specific tokenizer here
except:
prompt_tokens = litellm.token_counter(messages=messages)
except Exception:
# this should remain non blocking we should not block a response returning if calculating usage fails
pass
output_text = model_response["choices"][0]["message"].get("content", "")
@ -331,6 +330,7 @@ class PredibaseChatCompletion(BaseLLM):
logging_obj,
optional_params: dict,
tenant_id: str,
timeout: Union[float, httpx.Timeout],
acompletion=None,
litellm_params=None,
logger_fn=None,
@ -340,6 +340,7 @@ class PredibaseChatCompletion(BaseLLM):
completion_url = ""
input_text = ""
base_url = "https://serving.app.predibase.com"
if "https" in model:
completion_url = model
elif api_base:
@ -349,7 +350,7 @@ class PredibaseChatCompletion(BaseLLM):
completion_url = f"{base_url}/{tenant_id}/deployments/v2/llms/{model}"
if optional_params.get("stream", False) == True:
if optional_params.get("stream", False) is True:
completion_url += "/generate_stream"
else:
completion_url += "/generate"
@ -393,9 +394,9 @@ class PredibaseChatCompletion(BaseLLM):
},
)
## COMPLETION CALL
if acompletion == True:
if acompletion is True:
### ASYNC STREAMING
if stream == True:
if stream is True:
return self.async_streaming(
model=model,
messages=messages,
@ -410,6 +411,7 @@ class PredibaseChatCompletion(BaseLLM):
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
else:
### ASYNC COMPLETION
@ -428,10 +430,11 @@ class PredibaseChatCompletion(BaseLLM):
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
### SYNC STREAMING
if stream == True:
if stream is True:
response = requests.post(
completion_url,
headers=headers,
@ -452,7 +455,6 @@ class PredibaseChatCompletion(BaseLLM):
headers=headers,
data=json.dumps(data),
)
return self.process_response(
model=model,
response=response,
@ -480,23 +482,26 @@ class PredibaseChatCompletion(BaseLLM):
stream,
data: dict,
optional_params: dict,
timeout: Union[float, httpx.Timeout],
litellm_params=None,
logger_fn=None,
headers={},
) -> ModelResponse:
self.async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
async_handler = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=timeout))
try:
response = await self.async_handler.post(
response = await async_handler.post(
api_base, headers=headers, data=json.dumps(data)
)
except httpx.HTTPStatusError as e:
raise PredibaseError(
status_code=e.response.status_code, message=e.response.text
status_code=e.response.status_code,
message="HTTPStatusError - {}".format(e.response.text),
)
except Exception as e:
raise PredibaseError(status_code=500, message=str(e))
raise PredibaseError(
status_code=500, message="{}\n{}".format(str(e), traceback.format_exc())
)
return self.process_response(
model=model,
response=response,
@ -522,6 +527,7 @@ class PredibaseChatCompletion(BaseLLM):
api_key,
logging_obj,
data: dict,
timeout: Union[float, httpx.Timeout],
optional_params=None,
litellm_params=None,
logger_fn=None,

View file

@ -432,9 +432,9 @@ def mock_completion(
if isinstance(mock_response, openai.APIError):
raise mock_response
raise litellm.APIError(
status_code=500, # type: ignore
message=str(mock_response),
llm_provider="openai", # type: ignore
status_code=getattr(mock_response, "status_code", 500), # type: ignore
message=getattr(mock_response, "text", str(mock_response)),
llm_provider=getattr(mock_response, "llm_provider", "openai"), # type: ignore
model=model, # type: ignore
request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
)
@ -1949,7 +1949,8 @@ def completion(
)
api_base = (
optional_params.pop("api_base", None)
api_base
or optional_params.pop("api_base", None)
or optional_params.pop("base_url", None)
or litellm.api_base
or get_secret("PREDIBASE_API_BASE")
@ -1977,12 +1978,13 @@ def completion(
custom_prompt_dict=custom_prompt_dict,
api_key=api_key,
tenant_id=tenant_id,
timeout=timeout,
)
if (
"stream" in optional_params
and optional_params["stream"] == True
and acompletion == False
and optional_params["stream"] is True
and acompletion is False
):
return _model_response
response = _model_response

View file

@ -3009,32 +3009,37 @@
"litellm_provider": "sagemaker",
"mode": "chat"
},
"together-ai-up-to-3b": {
"together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai"
},
"together-ai-3.1b-7b": {
"together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai"
},
"together-ai-7.1b-20b": {
"together-ai-8.1b-21b": {
"max_tokens": 1000,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000004,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai"
},
"together-ai-20.1b-40b": {
"together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai"
},
"together-ai-40.1b-70b": {
"together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai"
},
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006,

View file

@ -1,7 +1,12 @@
import json
import logging
from logging import Formatter
import sys
import os
from litellm import json_logs
# Set default log level to INFO
log_level = os.getenv("LITELLM_LOG", "INFO")
numeric_level: int = getattr(logging, log_level.upper())
class JsonFormatter(Formatter):
@ -16,6 +21,14 @@ class JsonFormatter(Formatter):
logger = logging.root
handler = logging.StreamHandler()
if json_logs:
handler.setFormatter(JsonFormatter())
else:
formatter = logging.Formatter(
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(filename)s:%(lineno)s - %(message)s",
datefmt="%H:%M:%S",
)
handler.setFormatter(formatter)
logger.handlers = [handler]
logger.setLevel(logging.INFO)
logger.setLevel(numeric_level)

View file

@ -8,6 +8,17 @@ model_list:
- model_name: llama3-70b-8192
litellm_params:
model: groq/llama3-70b-8192
- model_name: fake-openai-endpoint
litellm_params:
model: predibase/llama-3-8b-instruct
api_base: "http://0.0.0.0:8081"
api_key: os.environ/PREDIBASE_API_KEY
tenant_id: os.environ/PREDIBASE_TENANT_ID
max_retries: 0
temperature: 0.1
max_new_tokens: 256
return_full_text: false
# - litellm_params:
# api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
# api_key: os.environ/AZURE_EUROPE_API_KEY
@ -57,6 +68,8 @@ router_settings:
litellm_settings:
success_callback: ["langfuse"]
cache: True
failure_callback: ["langfuse"]
general_settings:
alerting: ["email"]

View file

@ -160,6 +160,7 @@ from litellm.proxy.auth.auth_checks import (
get_user_object,
allowed_routes_check,
get_actual_routes,
log_to_opentelemetry,
)
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from litellm.exceptions import RejectedRequestError
@ -368,6 +369,11 @@ from typing import Dict
api_key_header = APIKeyHeader(
name="Authorization", auto_error=False, description="Bearer token"
)
azure_api_key_header = APIKeyHeader(
name="API-Key",
auto_error=False,
description="Some older versions of the openai Python package will send an API-Key header with just the API key ",
)
user_api_base = None
user_model = None
user_debug = False
@ -508,13 +514,19 @@ async def check_request_disconnection(request: Request, llm_api_call_task):
async def user_api_key_auth(
request: Request, api_key: str = fastapi.Security(api_key_header)
request: Request,
api_key: str = fastapi.Security(api_key_header),
azure_api_key_header: str = fastapi.Security(azure_api_key_header),
) -> UserAPIKeyAuth:
global master_key, prisma_client, llm_model_list, user_custom_auth, custom_db_client, general_settings, proxy_logging_obj
try:
if isinstance(api_key, str):
passed_in_key = api_key
api_key = _get_bearer_token(api_key=api_key)
elif isinstance(azure_api_key_header, str):
api_key = azure_api_key_header
parent_otel_span: Optional[Span] = None
if open_telemetry_logger is not None:
parent_otel_span = open_telemetry_logger.tracer.start_span(
@ -1495,7 +1507,7 @@ async def user_api_key_auth(
)
if valid_token is None:
# No token was found when looking up in the DB
raise Exception("Invalid token passed")
raise Exception("Invalid proxy server token passed")
if valid_token_dict is not None:
if user_id_information is not None and _is_user_proxy_admin(
user_id_information
@ -1528,6 +1540,14 @@ async def user_api_key_auth(
str(e)
)
)
# Log this exception to OTEL
if open_telemetry_logger is not None:
await open_telemetry_logger.async_post_call_failure_hook(
original_exception=e,
user_api_key_dict=UserAPIKeyAuth(parent_otel_span=parent_otel_span),
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, litellm.BudgetExceededError):
raise ProxyException(
@ -7803,6 +7823,10 @@ async def get_global_spend_report(
default=None,
description="Time till which to view spend",
),
group_by: Optional[Literal["team", "customer"]] = fastapi.Query(
default="team",
description="Group spend by internal team or customer",
),
):
"""
Get Daily Spend per Team, based on specific startTime and endTime. Per team, view usage by each key, model
@ -7849,6 +7873,7 @@ async def get_global_spend_report(
f"Database not connected. Connect a database to your proxy - https://docs.litellm.ai/docs/simple_proxy#managing-auth---virtual-keys"
)
if group_by == "team":
# first get data from spend logs -> SpendByModelApiKey
# then read data from "SpendByModelApiKey" to format the response obj
sql_query = """
@ -7913,6 +7938,66 @@ async def get_global_spend_report(
return db_response
elif group_by == "customer":
sql_query = """
WITH SpendByModelApiKey AS (
SELECT
date_trunc('day', sl."startTime") AS group_by_day,
sl.end_user AS customer,
sl.model,
sl.api_key,
SUM(sl.spend) AS model_api_spend,
SUM(sl.total_tokens) AS model_api_tokens
FROM
"LiteLLM_SpendLogs" sl
WHERE
sl."startTime" BETWEEN $1::date AND $2::date
GROUP BY
date_trunc('day', sl."startTime"),
customer,
sl.model,
sl.api_key
)
SELECT
group_by_day,
jsonb_agg(jsonb_build_object(
'customer', customer,
'total_spend', total_spend,
'metadata', metadata
)) AS customers
FROM
(
SELECT
group_by_day,
customer,
SUM(model_api_spend) AS total_spend,
jsonb_agg(jsonb_build_object(
'model', model,
'api_key', api_key,
'spend', model_api_spend,
'total_tokens', model_api_tokens
)) AS metadata
FROM
SpendByModelApiKey
GROUP BY
group_by_day,
customer
) AS aggregated
GROUP BY
group_by_day
ORDER BY
group_by_day;
"""
db_response = await prisma_client.db.query_raw(
sql_query, start_date_obj, end_date_obj
)
if db_response is None:
return []
return db_response
except Exception as e:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,

View file

@ -2056,6 +2056,9 @@ class Router:
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
generic_fallback_idx: Optional[int] = None
## check for specific model group-specific fallbacks
if isinstance(fallbacks, list):
fallback_model_group = fallbacks
elif isinstance(fallbacks, dict):
for idx, item in enumerate(fallbacks):
if list(item.keys())[0] == model_group:
fallback_model_group = item[model_group]
@ -2310,6 +2313,9 @@ class Router:
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
fallback_model_group = None
generic_fallback_idx: Optional[int] = None
if isinstance(fallbacks, list):
fallback_model_group = fallbacks
elif isinstance(fallbacks, dict):
## check for specific model group-specific fallbacks
for idx, item in enumerate(fallbacks):
if list(item.keys())[0] == model_group:

View file

@ -345,7 +345,7 @@ def test_completion_claude_3_function_call(model):
drop_params=True,
)
# Add any assertions, here to check response args
# Add any assertions here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
@ -530,6 +530,7 @@ def test_completion_cohere_command_r_plus_function_call():
messages=messages,
tools=tools,
tool_choice="auto",
force_single_step=True,
)
print(second_response)
except Exception as e:

View file

@ -517,3 +517,51 @@ def test_groq_response_cost_tracking(is_streaming):
assert response_cost > 0.0
print(f"response_cost: {response_cost}")
def test_together_ai_qwen_completion_cost():
input_kwargs = {
"completion_response": litellm.ModelResponse(
**{
"id": "890db0c33c4ef94b-SJC",
"choices": [
{
"finish_reason": "eos",
"index": 0,
"message": {
"content": "I am Qwen, a large language model created by Alibaba Cloud.",
"role": "assistant",
},
}
],
"created": 1717900130,
"model": "together_ai/qwen/Qwen2-72B-Instruct",
"object": "chat.completion",
"system_fingerprint": None,
"usage": {
"completion_tokens": 15,
"prompt_tokens": 23,
"total_tokens": 38,
},
}
),
"model": "qwen/Qwen2-72B-Instruct",
"prompt": "",
"messages": [],
"completion": "",
"total_time": 0.0,
"call_type": "completion",
"custom_llm_provider": "together_ai",
"region_name": None,
"size": None,
"quality": None,
"n": None,
"custom_cost_per_token": None,
"custom_cost_per_second": None,
}
response = litellm.cost_calculator.get_model_params_and_category(
model_name="qwen/Qwen2-72B-Instruct"
)
assert response == "together-ai-41.1b-80b"

View file

@ -3,6 +3,7 @@ import os
import sys
import traceback
import subprocess, asyncio
from typing import Any
sys.path.insert(
0, os.path.abspath("../..")
@ -19,6 +20,7 @@ from litellm import (
)
from concurrent.futures import ThreadPoolExecutor
import pytest
from unittest.mock import patch, MagicMock
litellm.vertex_project = "pathrise-convert-1606954137718"
litellm.vertex_location = "us-central1"
@ -655,3 +657,47 @@ def test_litellm_predibase_exception():
# accuracy_score = counts[True]/(counts[True] + counts[False])
# print(f"accuracy_score: {accuracy_score}")
@pytest.mark.parametrize("provider", ["predibase"])
def test_exception_mapping(provider):
"""
For predibase, run through a set of mock exceptions
assert that they are being mapped correctly
"""
litellm.set_verbose = True
error_map = {
400: litellm.BadRequestError,
401: litellm.AuthenticationError,
404: litellm.NotFoundError,
408: litellm.Timeout,
429: litellm.RateLimitError,
500: litellm.InternalServerError,
503: litellm.ServiceUnavailableError,
}
for code, expected_exception in error_map.items():
mock_response = Exception()
setattr(mock_response, "text", "This is an error message")
setattr(mock_response, "llm_provider", provider)
setattr(mock_response, "status_code", code)
response: Any = None
try:
response = completion(
model="{}/test-model".format(provider),
messages=[{"role": "user", "content": "Hey, how's it going?"}],
mock_response=mock_response,
)
except expected_exception:
continue
except Exception as e:
response = "{}\n{}".format(str(e), traceback.format_exc())
pytest.fail(
"Did not raise expected exception. Expected={}, Return={},".format(
expected_exception, response
)
)
pass

View file

@ -272,7 +272,7 @@ def test_call_with_invalid_key(prisma_client):
except Exception as e:
print("Got Exception", e)
print(e.message)
assert "Authentication Error, Invalid token passed" in e.message
assert "Authentication Error, Invalid proxy server token passed" in e.message
pass

View file

@ -1059,3 +1059,53 @@ async def test_default_model_fallbacks(sync_mode, litellm_module_fallbacks):
assert isinstance(response, litellm.ModelResponse)
assert response.model is not None and response.model == "gpt-4o"
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_client_side_fallbacks_list(sync_mode):
"""
Tests Client Side Fallbacks
User can pass "fallbacks": ["gpt-3.5-turbo"] and this should work
"""
router = Router(
model_list=[
{
"model_name": "bad-model",
"litellm_params": {
"model": "openai/my-bad-model",
"api_key": "my-bad-api-key",
},
},
{
"model_name": "my-good-model",
"litellm_params": {
"model": "gpt-4o",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
],
)
if sync_mode:
response = router.completion(
model="bad-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
fallbacks=["my-good-model"],
mock_testing_fallbacks=True,
mock_response="Hey! nice day",
)
else:
response = await router.acompletion(
model="bad-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
fallbacks=["my-good-model"],
mock_testing_fallbacks=True,
mock_response="Hey! nice day",
)
assert isinstance(response, litellm.ModelResponse)
assert response.model is not None and response.model == "gpt-4o"

View file

@ -326,6 +326,22 @@ class Function(OpenAIObject):
super(Function, self).__init__(**data)
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
class ChatCompletionDeltaToolCall(OpenAIObject):
id: Optional[str] = None
@ -385,6 +401,22 @@ class ChatCompletionMessageToolCall(OpenAIObject):
else:
self.type = "function"
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
class Message(OpenAIObject):
def __init__(
@ -3929,54 +3961,6 @@ def client(original_function):
return wrapper
####### USAGE CALCULATOR ################
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name):
import re
model_name = model_name.lower()
params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if params_match != None:
params_match = params_match.group(1)
params_match = params_match.replace("b", "")
params_billion = float(params_match)
# Determine the category based on the number of parameters
if params_billion <= 3.0:
category = "together-ai-up-to-3b"
elif params_billion <= 7.0:
category = "together-ai-3.1b-7b"
elif params_billion <= 20.0:
category = "together-ai-7.1b-20b"
elif params_billion <= 40.0:
category = "together-ai-20.1b-40b"
elif params_billion <= 70.0:
category = "together-ai-40.1b-70b"
return category
return None
def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
# see https://replicate.com/pricing
a100_40gb_price_per_second_public = 0.001150
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now
)
if total_time == 0.0: # total time is in ms
start_time = completion_response["created"]
end_time = getattr(completion_response, "ended", time.time())
total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time / 1000
@lru_cache(maxsize=128)
def _select_tokenizer(model: str):
if model in litellm.cohere_models and "command-r" in model:
@ -4363,7 +4347,7 @@ def _cost_per_token_custom_pricing_helper(
def cost_per_token(
model="",
model: str = "",
prompt_tokens=0,
completion_tokens=0,
response_time_ms=None,
@ -4388,6 +4372,8 @@ def cost_per_token(
Returns:
tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively.
"""
if model is None:
raise Exception("Invalid arg. Model cannot be none.")
## CUSTOM PRICING ##
response_cost = _cost_per_token_custom_pricing_helper(
prompt_tokens=prompt_tokens,
@ -4560,213 +4546,6 @@ def cost_per_token(
)
def completion_cost(
completion_response=None,
model=None,
prompt="",
messages: List = [],
completion="",
total_time=0.0, # used for replicate, sagemaker
call_type: Literal[
"embedding",
"aembedding",
"completion",
"acompletion",
"atext_completion",
"text_completion",
"image_generation",
"aimage_generation",
"moderation",
"amoderation",
"atranscription",
"transcription",
"aspeech",
"speech",
] = "completion",
### REGION ###
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ###
size=None,
quality=None,
n=None, # number of images
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> float:
"""
Calculate the cost of a given completion call fot GPT-3.5-turbo, llama2, any litellm supported llm.
Parameters:
completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request.
[OPTIONAL PARAMS]
model (str): Optional. The name of the language model used in the completion calls
prompt (str): Optional. The input prompt passed to the llm
completion (str): Optional. The output completion text from the llm
total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
Returns:
float: The cost in USD dollars for the completion based on the provided parameters.
Exceptions:
Raises exception if model not in the litellm model cost map. Register model, via custom pricing or PR - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
Note:
- If completion_response is provided, the function extracts token information and the model name from it.
- If completion_response is not provided, the function calculates token counts based on the model and input text.
- The cost is calculated based on the model, prompt tokens, and completion tokens.
- For certain models containing "togethercomputer" in the name, prices are based on the model size.
- For un-mapped Replicate models, the cost is calculated based on the total time used for the request.
"""
try:
if (
(call_type == "aimage_generation" or call_type == "image_generation")
and model is not None
and isinstance(model, str)
and len(model) == 0
and custom_llm_provider == "azure"
):
model = "dall-e-2" # for dall-e-2, azure expects an empty model name
# Handle Inputs to completion_cost
prompt_tokens = 0
completion_tokens = 0
custom_llm_provider = None
if completion_response is not None:
# get input/output tokens from completion_response
prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0)
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
total_time = completion_response.get("_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
)
model = model or completion_response.get(
"model", None
) # check if user passed an override for model, if it's none check completion_response['model']
if hasattr(completion_response, "_hidden_params"):
if (
completion_response._hidden_params.get("model", None) is not None
and len(completion_response._hidden_params["model"]) > 0
):
model = completion_response._hidden_params.get("model", model)
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", ""
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
)
size = completion_response._hidden_params.get(
"optional_params", {}
).get(
"size", "1024-x-1024"
) # openai default
quality = completion_response._hidden_params.get(
"optional_params", {}
).get(
"quality", "standard"
) # openai default
n = completion_response._hidden_params.get("optional_params", {}).get(
"n", 1
) # openai default
else:
if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages)
elif len(prompt) > 0:
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
if model == None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if (
call_type == CallTypes.image_generation.value
or call_type == CallTypes.aimage_generation.value
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
# https://cloud.google.com/vertex-ai/generative-ai/pricing
# Vertex Charges Flat $0.20 per image
return 0.020
# fix size to match naming convention
if "x" in size and "-x-" not in size:
size = size.replace("x", "-x-")
image_gen_model_name = f"{size}/{model}"
image_gen_model_name_with_quality = image_gen_model_name
if quality is not None:
image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}"
size = size.split("-x-")
height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024
width = int(size[1])
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
verbose_logger.debug(
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
)
if image_gen_model_name in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"]
* height
* width
* n
)
elif image_gen_model_name_with_quality in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name_with_quality][
"input_cost_per_pixel"
]
* height
* width
* n
)
else:
raise Exception(
f"Model={image_gen_model_name} not found in completion cost model map"
)
# Calculate cost based on prompt_tokens, completion_tokens
if (
"togethercomputer" in model
or "together_ai" in model
or custom_llm_provider == "together_ai"
):
# together ai prices are based on the size of the llm
# get_model_params_and_category maps a model name to its size category in model_prices_and_context_window.json
model = get_model_params_and_category(model)
# replicate llms are billed based on the time the request runs
# see https://replicate.com/pricing
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate models, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = cost_per_token(
model=model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider,
response_time_ms=total_time,
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e:
raise e
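
# A minimal usage sketch for the cost helper above, assuming a model that exists in
# litellm.model_cost; completion_cost() derives tokens, model, and provider from the
# finished response itself.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello who are you"}],
)
cost = litellm.completion_cost(completion_response=response)
print(f"estimated spend (USD): {cost:.6f}")
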
def supports_httpx_timeout(custom_llm_provider: str) -> bool:
"""
Helper function to know if a provider implementation supports httpx timeout
@@ -8986,6 +8765,75 @@ def exception_type(
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif hasattr(original_exception, "status_code"):
if original_exception.status_code == 500:
exception_mapping_worked = True
raise litellm.InternalServerError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 401:
exception_mapping_worked = True
raise AuthenticationError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 400:
exception_mapping_worked = True
raise BadRequestError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 404:
exception_mapping_worked = True
raise NotFoundError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
raise Timeout(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 422:
exception_mapping_worked = True
raise BadRequestError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 429:
exception_mapping_worked = True
raise RateLimitError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 503:
exception_mapping_worked = True
raise ServiceUnavailableError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 504: # gateway timeout error
exception_mapping_worked = True
raise Timeout(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
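
# A minimal sketch of how the mapping above surfaces to callers: Predibase status codes
# are re-raised as litellm exception classes, so retry logic can key off the type.
# The model name below is a hypothetical Predibase deployment, not one from this diff.
import litellm

try:
    litellm.completion(
        model="predibase/llama-3-8b-instruct",
        messages=[{"role": "user", "content": "hello"}],
    )
except litellm.RateLimitError:
    pass  # 429 -> back off and retry
except litellm.Timeout:
    pass  # 408 / 504 -> retry with a longer timeout
except litellm.InternalServerError:
    pass  # 500 -> surface the provider error
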
elif custom_llm_provider == "bedrock":
if (
"too many tokens" in error_str

View file

@@ -3009,32 +3009,37 @@
"litellm_provider": "sagemaker",
"mode": "chat"
},
"together-ai-up-to-3b": {
"together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai"
},
"together-ai-3.1b-7b": {
"together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai"
},
"together-ai-7.1b-20b": {
"together-ai-8.1b-21b": {
"max_tokens": 1000,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000004,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai"
},
"together-ai-20.1b-40b": {
"together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai"
},
"together-ai-40.1b-70b": {
"together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai"
},
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006,

View file

@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.40.7"
version = "1.40.8"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@@ -84,7 +84,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.40.7"
version = "1.40.8"
version_files = [
"pyproject.toml:^version"
]

View file

@@ -1,3 +1,3 @@
ignore = ["F405"]
ignore = ["F405", "E402"]
extend-select = ["E501"]
line-length = 120

View file

@@ -120,8 +120,23 @@ const ChatUI: React.FC<ChatUIProps> = ({
// Now, 'options' contains the list you wanted
console.log(options); // You can log it to verify the list
// if options.length > 0, only store unique values
if (options.length > 0) {
const uniqueModels = Array.from(new Set(options));
console.log("Unique models:", uniqueModels);
// sort uniqueModels alphabetically
uniqueModels.sort((a: any, b: any) => a.label.localeCompare(b.label));
console.log("Model info:", modelInfo);
// setModelInfo(options) should be inside the if block to avoid setting it when no data is available
setModelInfo(options);
setModelInfo(uniqueModels);
}
setSelectedModel(fetchedAvailableModels.data[0].id);
}
} catch (error) {

View file

@@ -1130,7 +1130,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
setSelectedAPIKey(key);
}}
>
{key["key_alias"]} (Enterpise only Feature)
{key["key_alias"]} (Enterprise only Feature)
</SelectItem>
);
}
@@ -1165,7 +1165,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
setSelectedCustomer(user);
}}
>
{user} (Enterpise only Feature)
{user} (Enterprise only Feature)
</SelectItem>
);
})

View file

@@ -114,7 +114,7 @@ const Navbar: React.FC<NavbarProps> = ({
textDecoration: "underline",
}}
>
Get enterpise license
Get enterprise license
</a>
</div>
) : null}

View file

@@ -832,7 +832,7 @@ const UsagePage: React.FC<UsagePageProps> = ({
// @ts-ignore
disabled={true}
>
{tag} (Enterpise only Feature)
{tag} (Enterprise only Feature)
</SelectItem>
);
})}