Merge branch 'main' into litellm_aws_kms_fixes

This commit is contained in:
Krish Dholakia 2024-06-10 20:17:34 -07:00 committed by GitHub
commit 4475d2e5b5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
34 changed files with 1293 additions and 483 deletions

View file

@ -150,7 +150,7 @@ $ litellm --config /path/to/config.yaml
```bash
curl "http://0.0.0.0:4000/v1/assistants?order=desc&limit=20" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234"
```
**Create a Thread**
@ -162,6 +162,14 @@ curl http://0.0.0.0:4000/v1/threads \
-d ''
```
**Get a Thread**
```bash
curl http://0.0.0.0:4000/v1/threads/{thread_id} \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234"
```
**Add Messages to the Thread**
```bash

View file

@ -212,6 +212,94 @@ If you run the code two times, response1 will use the cache from the first run t
</TabItem>
</Tabs>
## Switch Cache On / Off Per LiteLLM Call
LiteLLM supports 4 cache-controls:
- `no-cache`: *Optional(bool)* When `True`, will not return a cached response; the actual endpoint is called instead.
- `no-store`: *Optional(bool)* When `True`, will not cache the response.
- `ttl`: *Optional(int)* Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: *Optional(int)* Will only accept cached responses that are within the user-defined age (in seconds).
[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)
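The controls above can also be combined in a single `cache` dict on one call - for example, writing a response to the cache for 10 minutes while only reusing cached responses that are at most 10 minutes old. A minimal sketch (combining `ttl` and `s-maxage` in one dict is an assumption; both keys are read from the same `cache` parameter):
```python
import litellm

# assumes caching has already been enabled (see the setup steps above)
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello who are you"}],
    # store for 600s, and only accept cached responses younger than 600s
    cache={"ttl": 600, "s-maxage": 600},
)
```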
<Tabs>
<TabItem value="no-cache" label="No-Cache">
Example usage of `no-cache` - when set to `True`, a cached response will not be returned; the actual endpoint is called instead.
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"no-cache": True},
)
```
</TabItem>
<TabItem value="no-store" label="No-Store">
Example usage of `no-store` - when set to `True`, the response will not be written to the cache.
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"no-store": True},
)
```
</TabItem>
<TabItem value="ttl" label="ttl">
Example usage of `ttl` - cache the response for 10 seconds.
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"ttl": 10},
)
```
</TabItem>
<TabItem value="s-maxage" label="s-maxage">
Example usage of `s-maxage` - only accept cached responses that are at most 60 seconds old.
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"s-maxage": 60},
)
```
</TabItem>
</Tabs>
## Cache Context Manager - Enable, Disable, Update Cache

View file

@ -1,3 +1,5 @@
# llmcord.py
llmcord.py lets you and your friends chat with LLMs directly in your Discord server. It works with practically any LLM, remote or locally hosted.
GitHub: https://github.com/jakobdylanc/discord-llm-chatbot

View file

@ -138,14 +138,22 @@ Navigate to the Usage Tab on the LiteLLM UI (found on https://your-proxy-endpoin
<Image img={require('../../img/admin_ui_spend.png')} />
## API Endpoints to get Spend
#### Getting Spend Reports - To Charge Other Teams, Customers
Use the `/global/spend/report` endpoint to get a daily spend report per
- team
- customer [this is the `user` passed to the `/chat/completions` request](#how-to-track-spend-with-litellm)
<Tabs>
<TabItem value="per team" label="Spend Per Team">
##### Example Request
👉 Key Change: Specify `group_by=team`
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=team' \
-H 'Authorization: Bearer sk-1234'
```
@ -254,6 +262,69 @@ Output from script
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="per customer" label="Spend Per Customer">
##### Example Request
👉 Key Change: Specify `group_by=customer`
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=customer' \
-H 'Authorization: Bearer sk-1234'
```
##### Example Response
```shell
[
{
"group_by_day": "2024-04-30T00:00:00+00:00",
"customers": [
{
"customer": "palantir",
"total_spend": 0.0015265,
"metadata": [ # see the spend by unique(key + model)
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "88dc28.." # the hashed api key
},
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "a73dc2.." # the hashed api key
},
{
"model": "chatgpt-v-2",
"spend": 0.000214,
"total_tokens": 122,
"api_key": "898c28.." # the hashed api key
},
{
"model": "gpt-3.5-turbo",
"spend": 0.0000825,
"total_tokens": 85,
"api_key": "84dc28.." # the hashed api key
}
]
}
]
}
]
```
</TabItem>
</Tabs>
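The same report can be consumed programmatically. A minimal sketch (assumes the `requests` package is installed and uses the `group_by=customer` response shape shown above):
```python
import requests

resp = requests.get(
    "http://localhost:4000/global/spend/report",
    params={
        "start_date": "2024-04-01",
        "end_date": "2024-06-30",
        "group_by": "customer",
    },
    headers={"Authorization": "Bearer sk-1234"},
)
resp.raise_for_status()

# roll up total spend per customer across days
spend_per_customer: dict = {}
for day in resp.json():
    for row in day.get("customers", []):
        spend_per_customer[row["customer"]] = (
            spend_per_customer.get(row["customer"], 0.0) + row["total_spend"]
        )
print(spend_per_customer)
```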

View file

@ -42,6 +42,14 @@ Set `JSON_LOGS="True"` in your env:
```bash
export JSON_LOGS="True"
```
**OR**
Set `json_logs: true` in your yaml:
```yaml
litellm_settings:
json_logs: true
```
Start proxy
@ -49,4 +57,35 @@ Start proxy
$ litellm
```
The proxy will now output all logs in JSON format.
## Control Log Output
Turn off FastAPI's default 'INFO' logs
1. Turn on 'json logs'
```yaml
litellm_settings:
json_logs: true
```
2. Set `LITELLM_LOG` to 'ERROR'
Only get logs if an error occurs.
```bash
export LITELLM_LOG="ERROR"
```
3. Start proxy
```bash
$ litellm
```
Expected Output:
```bash
# no info statements
```

View file

@ -2,11 +2,21 @@
Call management endpoints on behalf of a user. (Useful when connecting proxy to your development platform).
:::info
Requires Enterprise License for usage.
:::
:::tip
Requires an Enterprise License. Get in touch with us [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat).
:::
## 1. Switch on audit Logs
Add `store_audit_logs` to your litellm config.yaml and then start the proxy
```shell
litellm_settings:
store_audit_logs: true
```
## 2. Set `LiteLLM-Changed-By` in request headers
Set the 'user_id' in request headers when calling a management endpoint. [View Full List](https://litellm-api.up.railway.app/#/team%20management).
@ -26,7 +36,7 @@ curl -X POST 'http://0.0.0.0:4000/team/update' \
}'
```
## 3. Emitted Audit Log
```bash
{

View file

@ -21,6 +21,7 @@ general_settings:
litellm_settings:
set_verbose: False # Switch off Debug Logging, ensure your logs do not have any debugging on
json_logs: true # Get debug logs in json format
```
Set slack webhook url in your env
@ -28,6 +29,11 @@ Set slack webhook url in your env
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/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"
```
Turn off FastAPI's default INFO logs
```bash
export LITELLM_LOG="ERROR"
```
:::info
Need help or want dedicated support? Talk to a founder [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat).

View file

@ -2,18 +2,13 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🔥 Load Balancing, Fallbacks, Retries, Timeouts
- Quick Start [load balancing](#test---load-balancing)
- Quick Start [client side fallbacks](#test---client-side-fallbacks)
If a call fails after num_retries, fall back to another model group.
If the error is a context window exceeded error, fall back to a larger model group (if given).
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
## Quick Start - Load Balancing
#### Step 1 - Set deployments on config
**Example config below**. Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
```yaml
@ -38,50 +33,220 @@ model_list:
rpm: 1440
```
#### Step 2: Start Proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
### Test - Load Balancing
Curl Command
Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo
👉 Key Change: `model="gpt-3.5-turbo"`
**Check the `model_id` in Response Headers to make sure the requests are being load balanced**
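To check this from a script, inspect the headers on the raw HTTP response. A minimal sketch (assumes the `requests` package; the exact header name, shown here as `x-litellm-model-id`, is an assumption and may differ between proxy versions):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "this is a test request, write a short poem"}],
    },
)
# the deployment that served the request is reported in the response headers
print(resp.headers.get("x-litellm-model-id"))
```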
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
]
)
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Send the request with `"model": "gpt-3.5-turbo"` in the request body
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
    "model": "gpt-3.5-turbo",
    "messages": [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ]
}'
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model="gpt-3.5-turbo",
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
If you want to call a specific model defined in the `config.yaml`, you can call the `litellm_params: model`
</Tabs>
### Test - Client Side Fallbacks
In this request the following will occur:
1. The request to `model="zephyr-beta"` will fail
2. litellm proxy will loop through all the model_groups specified in `fallbacks=["gpt-3.5-turbo"]`
3. The request to `model="gpt-3.5-turbo"` will succeed and the client making the request will get a response from gpt-3.5-turbo
👉 Key Change: `"fallbacks": ["gpt-3.5-turbo"]`
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="zephyr-beta",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
}
)
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "zephyr-beta",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
}'
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model="zephyr-beta",
extra_body={
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
}
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
<!--
### Test it!
In this example it will call `azure/gpt-turbo-small-ca`. Defined in the config on Step 1
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
    "model": "zephyr-beta", # 👈 MODEL NAME to fallback from
    "messages": [
        {"role": "user", "content": "what color is red"}
    ],
    "mock_testing_fallbacks": true
}'
``` -->
## Advanced
### Fallbacks + Retries + Timeouts + Cooldowns
**Set via config**
```yaml
@ -114,44 +279,7 @@ litellm_settings:
context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
allowed_fails: 3 # cooldown model if it fails > 1 call in a minute.
```
### Context Window Fallbacks (Pre-Call Checks + Fallbacks)
**Set dynamically**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "zephyr-beta",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
"context_window_fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
"num_retries": 2,
"timeout": 10
}
'
```
### Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "zephyr-beta", # 👈 MODEL NAME to fallback from
"messages": [
{"role": "user", "content": "what color is red"}
],
"mock_testing_fallbacks": true
}'
```
## Advanced - Context Window Fallbacks (Pre-Call Checks + Fallbacks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.
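A minimal sketch of switching this on when using the Python Router directly (the `model_list` entry is illustrative; on the proxy the equivalent flag is set in the config):
```python
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo"},
        }
    ],
    enable_pre_call_checks=True,  # filter out deployments whose context window is too small for the prompt
)
```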
@ -287,7 +415,7 @@ print(response)
</Tabs>
### EU-Region Filtering (Pre-Call Checks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.
@ -350,7 +478,7 @@ print(response)
print(f"response.headers.get('x-litellm-model-api-base')")
```
### Custom Timeouts, Stream Timeouts - Per Model
For each model you can set `timeout` & `stream_timeout` under `litellm_params`
```yaml
model_list:
@ -379,7 +507,7 @@ $ litellm --config /path/to/config.yaml
```
### Setting Dynamic Timeouts - Per Request
LiteLLM Proxy supports setting a `timeout` per request
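For example, the per-request `timeout` (in seconds) can be sent in the request body, as in the dynamic fallbacks curl shown earlier. A minimal sketch (assumes the `requests` package):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "what llm are you"}],
        "timeout": 10,  # applies to this request only
    },
)
print(resp.json())
```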

View file

@ -255,6 +255,7 @@ const sidebars = {
"projects/GPT Migrate",
"projects/YiVal",
"projects/LiteLLM Proxy",
"projects/llm_cord",
],
},
],

View file

@ -709,6 +709,7 @@ all_embedding_models = (
openai_image_generation_models = ["dall-e-2", "dall-e-3"]
from .timeout import timeout
from .cost_calculator import completion_cost
from .utils import (
client,
exception_type,
@ -718,7 +719,6 @@ from .utils import (
create_pretrained_tokenizer,
create_tokenizer,
cost_per_token,
completion_cost,
supports_function_calling,
supports_parallel_function_calling,
supports_vision,

View file

@ -1,6 +1,7 @@
# What is this?
## File for 'response_cost' calculation in Logging
from typing import Optional, Union, Literal, List
import litellm._logging
from litellm.utils import (
ModelResponse,
EmbeddingResponse,
@ -8,10 +9,281 @@ from litellm.utils import (
TranscriptionResponse,
TextCompletionResponse,
CallTypes,
cost_per_token,
print_verbose,
CostPerToken,
token_counter,
)
import litellm
from litellm import verbose_logger
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name) -> str:
"""
Helper function for calculating together ai pricing.
Returns
- str - model pricing category if mapped else received model name
"""
import re
model_name = model_name.lower()
re_params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if re_params_match is not None:
params_match = str(re_params_match.group(1))
params_match = params_match.replace("b", "")
if params_match is not None:
params_billion = float(params_match)
else:
return model_name
# Determine the category based on the number of parameters
if params_billion <= 4.0:
category = "together-ai-up-to-4b"
elif params_billion <= 8.0:
category = "together-ai-4.1b-8b"
elif params_billion <= 21.0:
category = "together-ai-8.1b-21b"
elif params_billion <= 41.0:
category = "together-ai-21.1b-41b"
elif params_billion <= 80.0:
category = "together-ai-41.1b-80b"
elif params_billion <= 110.0:
category = "together-ai-81.1b-110b"
if category is not None:
return category
return model_name
def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
# see https://replicate.com/pricing
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now
)
if total_time == 0.0: # total time is in ms
start_time = completion_response["created"]
end_time = getattr(completion_response, "ended", time.time())
total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time / 1000
def completion_cost(
completion_response=None,
model: Optional[str] = None,
prompt="",
messages: List = [],
completion="",
total_time=0.0, # used for replicate, sagemaker
call_type: Literal[
"embedding",
"aembedding",
"completion",
"acompletion",
"atext_completion",
"text_completion",
"image_generation",
"aimage_generation",
"moderation",
"amoderation",
"atranscription",
"transcription",
"aspeech",
"speech",
] = "completion",
### REGION ###
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ###
size=None,
quality=None,
n=None, # number of images
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> float:
"""
Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, or any litellm-supported llm.
Parameters:
completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request.
[OPTIONAL PARAMS]
model (str): Optional. The name of the language model used in the completion calls
prompt (str): Optional. The input prompt passed to the llm
completion (str): Optional. The output completion text from the llm
total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
Returns:
float: The cost in USD dollars for the completion based on the provided parameters.
Exceptions:
Raises exception if model not in the litellm model cost map. Register model, via custom pricing or PR - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
Note:
- If completion_response is provided, the function extracts token information and the model name from it.
- If completion_response is not provided, the function calculates token counts based on the model and input text.
- The cost is calculated based on the model, prompt tokens, and completion tokens.
- For certain models containing "togethercomputer" in the name, prices are based on the model size.
- For un-mapped Replicate models, the cost is calculated based on the total time used for the request.
"""
try:
if (
(call_type == "aimage_generation" or call_type == "image_generation")
and model is not None
and isinstance(model, str)
and len(model) == 0
and custom_llm_provider == "azure"
):
model = "dall-e-2" # for dall-e-2, azure expects an empty model name
# Handle Inputs to completion_cost
prompt_tokens = 0
completion_tokens = 0
custom_llm_provider = None
if completion_response is not None:
# get input/output tokens from completion_response
prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0)
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
total_time = completion_response.get("_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
)
model = model or completion_response.get(
"model", None
) # check if user passed an override for model, if it's none check completion_response['model']
if hasattr(completion_response, "_hidden_params"):
if (
completion_response._hidden_params.get("model", None) is not None
and len(completion_response._hidden_params["model"]) > 0
):
model = completion_response._hidden_params.get("model", model)
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", ""
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
)
size = completion_response._hidden_params.get(
"optional_params", {}
).get(
"size", "1024-x-1024"
) # openai default
quality = completion_response._hidden_params.get(
"optional_params", {}
).get(
"quality", "standard"
) # openai default
n = completion_response._hidden_params.get("optional_params", {}).get(
"n", 1
) # openai default
else:
if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages)
elif len(prompt) > 0:
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if (
call_type == CallTypes.image_generation.value
or call_type == CallTypes.aimage_generation.value
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
# https://cloud.google.com/vertex-ai/generative-ai/pricing
# Vertex Charges Flat $0.20 per image
return 0.020
# fix size to match naming convention
if "x" in size and "-x-" not in size:
size = size.replace("x", "-x-")
image_gen_model_name = f"{size}/{model}"
image_gen_model_name_with_quality = image_gen_model_name
if quality is not None:
image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}"
size = size.split("-x-")
height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024
width = int(size[1])
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
verbose_logger.debug(
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
)
if image_gen_model_name in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"]
* height
* width
* n
)
elif image_gen_model_name_with_quality in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name_with_quality][
"input_cost_per_pixel"
]
* height
* width
* n
)
else:
raise Exception(
f"Model={image_gen_model_name} not found in completion cost model map"
)
# Calculate cost based on prompt_tokens, completion_tokens
if (
"togethercomputer" in model
or "together_ai" in model
or custom_llm_provider == "together_ai"
):
# together ai prices based on size of llm
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
model = get_model_params_and_category(model)
# replicate llms are calculate based on time for request running
# see https://replicate.com/pricing
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time)
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = cost_per_token(
model=model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider,
response_time_ms=total_time,
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e:
raise e
def response_cost_calculator(
@ -47,7 +319,7 @@ def response_cost_calculator(
) -> Optional[float]:
try:
response_cost: float = 0.0
if cache_hit is not None and cache_hit is True:
response_cost = 0.0
else:
response_object._hidden_params["optional_params"] = optional_params
@ -62,9 +334,11 @@ def response_cost_calculator(
if (
model in litellm.model_cost
and custom_pricing is not None
and custom_llm_provider is True
): # override defaults if custom pricing is set
base_model = model
elif base_model is None:
base_model = model
# base_model defaults to None if not set on model_info
response_cost = completion_cost(
completion_response=response_object,
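The relocated `completion_cost` helper remains exported at the package level (see the `__init__.py` change above), so existing call sites are unchanged. A minimal sketch:
```python
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello who are you"}],
)
# cost in USD, derived from the usage block on the response
cost = litellm.completion_cost(completion_response=response)
print(f"cost: ${cost:.6f}")
```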

View file

@ -20,7 +20,7 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore
message,
llm_provider,
model,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -32,8 +32,14 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
self.response = response or httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
), # mock request object
)
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -60,7 +66,7 @@ class NotFoundError(openai.NotFoundError): # type: ignore
message,
model,
llm_provider,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -72,8 +78,14 @@ class NotFoundError(openai.NotFoundError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
self.response = response or httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
), # mock request object
)
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -262,7 +274,7 @@ class RateLimitError(openai.RateLimitError): # type: ignore
message,
llm_provider,
model,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -274,8 +286,18 @@ class RateLimitError(openai.RateLimitError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=429,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -421,7 +443,7 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
message,
llm_provider,
model,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -433,8 +455,18 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -460,7 +492,7 @@ class InternalServerError(openai.InternalServerError): # type: ignore
message,
llm_provider,
model,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -472,8 +504,18 @@ class InternalServerError(openai.InternalServerError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
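With `response` now optional on these exception classes, callers can raise them without building an `httpx.Response` first; a mocked response is attached internally. A minimal sketch of the intent (the message and model names are illustrative):
```python
import litellm

try:
    # no httpx.Response needs to be passed anymore - a mock one is attached internally
    raise litellm.InternalServerError(
        message="provider returned a malformed response",
        llm_provider="vertex_ai",
        model="gemini-pro",
    )
except litellm.InternalServerError as e:
    print(e.status_code, e.response.status_code)
```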

View file

@ -366,8 +366,6 @@ class OpenTelemetry(CustomLogger):
)
message = choice.get("message")
if not isinstance(message, dict):
message = message.dict()
tool_calls = message.get("tool_calls")
if tool_calls:
span.set_attribute(

View file

View file

@ -3,6 +3,7 @@
from functools import partial
import os, types
import traceback
import json
from enum import Enum
import requests, copy # type: ignore
@ -242,12 +243,12 @@ class PredibaseChatCompletion(BaseLLM):
"details" in completion_response
and "tokens" in completion_response["details"]
):
model_response.choices[0].finish_reason = map_finish_reason(
completion_response["details"]["finish_reason"]
)
sum_logprob = 0
for token in completion_response["details"]["tokens"]:
if token["logprob"] is not None:
sum_logprob += token["logprob"]
model_response["choices"][0][
"message"
@ -265,7 +266,7 @@ class PredibaseChatCompletion(BaseLLM):
):
sum_logprob = 0
for token in item["tokens"]:
if token["logprob"] is not None:
sum_logprob += token["logprob"]
if len(item["generated_text"]) > 0:
message_obj = Message(
@ -275,7 +276,7 @@ class PredibaseChatCompletion(BaseLLM):
else:
message_obj = Message(content=None)
choice_obj = Choices(
finish_reason=map_finish_reason(item["finish_reason"]),
index=idx + 1,
message=message_obj,
)
@ -285,10 +286,8 @@ class PredibaseChatCompletion(BaseLLM):
## CALCULATING USAGE
prompt_tokens = 0
try:
prompt_tokens = litellm.token_counter(messages=messages)
except Exception:
# this should remain non blocking we should not block a response returning if calculating usage fails
pass
output_text = model_response["choices"][0]["message"].get("content", "")
@ -331,6 +330,7 @@ class PredibaseChatCompletion(BaseLLM):
logging_obj,
optional_params: dict,
tenant_id: str,
timeout: Union[float, httpx.Timeout],
acompletion=None,
litellm_params=None,
logger_fn=None,
@ -340,6 +340,7 @@ class PredibaseChatCompletion(BaseLLM):
completion_url = ""
input_text = ""
base_url = "https://serving.app.predibase.com"
if "https" in model:
completion_url = model
elif api_base:
@ -349,7 +350,7 @@ class PredibaseChatCompletion(BaseLLM):
completion_url = f"{base_url}/{tenant_id}/deployments/v2/llms/{model}"
if optional_params.get("stream", False) is True:
completion_url += "/generate_stream"
else:
completion_url += "/generate"
@ -393,9 +394,9 @@ class PredibaseChatCompletion(BaseLLM):
},
)
## COMPLETION CALL
if acompletion is True:
### ASYNC STREAMING
if stream is True:
return self.async_streaming(
model=model,
messages=messages,
@ -410,6 +411,7 @@ class PredibaseChatCompletion(BaseLLM):
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
else:
### ASYNC COMPLETION
@ -428,10 +430,11 @@ class PredibaseChatCompletion(BaseLLM):
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
### SYNC STREAMING
if stream is True:
response = requests.post(
completion_url,
headers=headers,
@ -452,7 +455,6 @@ class PredibaseChatCompletion(BaseLLM):
headers=headers,
data=json.dumps(data),
)
return self.process_response(
model=model,
response=response,
@ -480,23 +482,26 @@ class PredibaseChatCompletion(BaseLLM):
stream,
data: dict,
optional_params: dict,
timeout: Union[float, httpx.Timeout],
litellm_params=None,
logger_fn=None,
headers={},
) -> ModelResponse:
async_handler = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=timeout))
try:
response = await async_handler.post(
api_base, headers=headers, data=json.dumps(data)
)
except httpx.HTTPStatusError as e:
raise PredibaseError(
status_code=e.response.status_code,
message="HTTPStatusError - {}".format(e.response.text),
)
except Exception as e:
raise PredibaseError(
status_code=500, message="{}\n{}".format(str(e), traceback.format_exc())
)
return self.process_response(
model=model,
response=response,
@ -522,6 +527,7 @@ class PredibaseChatCompletion(BaseLLM):
api_key,
logging_obj,
data: dict,
timeout: Union[float, httpx.Timeout],
optional_params=None,
litellm_params=None,
logger_fn=None,

View file

@ -432,9 +432,9 @@ def mock_completion(
if isinstance(mock_response, openai.APIError):
raise mock_response
raise litellm.APIError(
status_code=getattr(mock_response, "status_code", 500), # type: ignore
message=getattr(mock_response, "text", str(mock_response)),
llm_provider=getattr(mock_response, "llm_provider", "openai"), # type: ignore
model=model, # type: ignore
request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
)
@ -1949,7 +1949,8 @@ def completion(
)
api_base = (
api_base
or optional_params.pop("api_base", None)
or optional_params.pop("base_url", None)
or litellm.api_base
or get_secret("PREDIBASE_API_BASE")
@ -1977,12 +1978,13 @@ def completion(
custom_prompt_dict=custom_prompt_dict,
api_key=api_key,
tenant_id=tenant_id,
timeout=timeout,
)
if (
"stream" in optional_params
and optional_params["stream"] is True
and acompletion is False
):
return _model_response
response = _model_response

View file

@ -3009,32 +3009,37 @@
"litellm_provider": "sagemaker",
"mode": "chat"
},
"together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai"
},
"together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai"
},
"together-ai-8.1b-21b": {
"max_tokens": 1000,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai"
},
"together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai"
},
"together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai"
},
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006,

View file

@ -1,7 +1,12 @@
import json
import logging
from logging import Formatter
import os
from litellm import json_logs
# Set default log level to INFO
log_level = os.getenv("LITELLM_LOG", "INFO")
numeric_level: str = getattr(logging, log_level.upper())
class JsonFormatter(Formatter):
@ -16,6 +21,14 @@ class JsonFormatter(Formatter):
logger = logging.root
handler = logging.StreamHandler()
if json_logs:
handler.setFormatter(JsonFormatter())
else:
formatter = logging.Formatter(
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(filename)s:%(lineno)s - %(message)s",
datefmt="%H:%M:%S",
)
handler.setFormatter(formatter)
logger.handlers = [handler]
logger.setLevel(numeric_level)

View file

@ -8,6 +8,17 @@ model_list:
- model_name: llama3-70b-8192
litellm_params:
model: groq/llama3-70b-8192
- model_name: fake-openai-endpoint
litellm_params:
model: predibase/llama-3-8b-instruct
api_base: "http://0.0.0.0:8081"
api_key: os.environ/PREDIBASE_API_KEY
tenant_id: os.environ/PREDIBASE_TENANT_ID
max_retries: 0
temperature: 0.1
max_new_tokens: 256
return_full_text: false
# - litellm_params:
# api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
# api_key: os.environ/AZURE_EUROPE_API_KEY
@ -57,6 +68,8 @@ router_settings:
litellm_settings:
success_callback: ["langfuse"]
cache: True
failure_callback: ["langfuse"]
general_settings:
alerting: ["email"]

View file

@ -160,6 +160,7 @@ from litellm.proxy.auth.auth_checks import (
get_user_object,
allowed_routes_check,
get_actual_routes,
log_to_opentelemetry,
)
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from litellm.exceptions import RejectedRequestError
@ -368,6 +369,11 @@ from typing import Dict
api_key_header = APIKeyHeader(
name="Authorization", auto_error=False, description="Bearer token"
)
azure_api_key_header = APIKeyHeader(
name="API-Key",
auto_error=False,
description="Some older versions of the openai Python package will send an API-Key header with just the API key ",
)
user_api_base = None
user_model = None
user_debug = False
@ -508,13 +514,19 @@ async def check_request_disconnection(request: Request, llm_api_call_task):
async def user_api_key_auth(
request: Request,
api_key: str = fastapi.Security(api_key_header),
azure_api_key_header: str = fastapi.Security(azure_api_key_header),
) -> UserAPIKeyAuth:
global master_key, prisma_client, llm_model_list, user_custom_auth, custom_db_client, general_settings, proxy_logging_obj
try:
if isinstance(api_key, str):
passed_in_key = api_key
api_key = _get_bearer_token(api_key=api_key)
elif isinstance(azure_api_key_header, str):
api_key = azure_api_key_header
parent_otel_span: Optional[Span] = None
if open_telemetry_logger is not None:
parent_otel_span = open_telemetry_logger.tracer.start_span(
@ -1495,7 +1507,7 @@ async def user_api_key_auth(
)
if valid_token is None:
# No token was found when looking up in the DB
raise Exception("Invalid proxy server token passed")
if valid_token_dict is not None:
if user_id_information is not None and _is_user_proxy_admin(
user_id_information
@ -1528,6 +1540,14 @@ async def user_api_key_auth(
str(e)
)
)
# Log this exception to OTEL
if open_telemetry_logger is not None:
await open_telemetry_logger.async_post_call_failure_hook(
original_exception=e,
user_api_key_dict=UserAPIKeyAuth(parent_otel_span=parent_otel_span),
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, litellm.BudgetExceededError):
raise ProxyException(
@ -7803,6 +7823,10 @@ async def get_global_spend_report(
default=None,
description="Time till which to view spend",
),
group_by: Optional[Literal["team", "customer"]] = fastapi.Query(
default="team",
description="Group spend by internal team or customer",
),
):
"""
Get Daily Spend per Team, based on specific startTime and endTime. Per team, view usage by each key, model
@ -7849,69 +7873,130 @@ async def get_global_spend_report(
f"Database not connected. Connect a database to your proxy - https://docs.litellm.ai/docs/simple_proxy#managing-auth---virtual-keys"
)
if group_by == "team":
# first get data from spend logs -> SpendByModelApiKey
# then read data from "SpendByModelApiKey" to format the response obj
sql_query = """
WITH SpendByModelApiKey AS (
SELECT
date_trunc('day', sl."startTime") AS group_by_day,
COALESCE(tt.team_alias, 'Unassigned Team') AS team_name,
sl.model,
sl.api_key,
SUM(sl.spend) AS model_api_spend,
SUM(sl.total_tokens) AS model_api_tokens
FROM
"LiteLLM_SpendLogs" sl
LEFT JOIN
"LiteLLM_TeamTable" tt
ON
sl.team_id = tt.team_id
WHERE
sl."startTime" BETWEEN $1::date AND $2::date
GROUP BY
date_trunc('day', sl."startTime"),
tt.team_alias,
sl.model,
sl.api_key
)
SELECT
group_by_day,
jsonb_agg(jsonb_build_object(
'team_name', team_name,
'total_spend', total_spend,
'metadata', metadata
)) AS teams
FROM (
SELECT
group_by_day,
team_name,
SUM(model_api_spend) AS total_spend,
jsonb_agg(jsonb_build_object(
'model', model,
'api_key', api_key,
'spend', model_api_spend,
'total_tokens', model_api_tokens
)) AS metadata
FROM
SpendByModelApiKey
GROUP BY
group_by_day,
team_name
) AS aggregated
GROUP BY
group_by_day
ORDER BY
group_by_day;
"""
db_response = await prisma_client.db.query_raw(
sql_query, start_date_obj, end_date_obj
)
if db_response is None:
return []
return db_response
elif group_by == "customer":
sql_query = """
WITH SpendByModelApiKey AS (
SELECT
date_trunc('day', sl."startTime") AS group_by_day,
sl.end_user AS customer,
sl.model,
sl.api_key,
SUM(sl.spend) AS model_api_spend,
SUM(sl.total_tokens) AS model_api_tokens
FROM
"LiteLLM_SpendLogs" sl
WHERE
sl."startTime" BETWEEN $1::date AND $2::date
GROUP BY
date_trunc('day', sl."startTime"),
customer,
sl.model,
sl.api_key
)
SELECT
group_by_day,
jsonb_agg(jsonb_build_object(
'customer', customer,
'total_spend', total_spend,
'metadata', metadata
)) AS customers
FROM
(
SELECT
group_by_day,
customer,
SUM(model_api_spend) AS total_spend,
jsonb_agg(jsonb_build_object(
'model', model,
'api_key', api_key,
'spend', model_api_spend,
'total_tokens', model_api_tokens
)) AS metadata
FROM
SpendByModelApiKey
GROUP BY
group_by_day,
customer
) AS aggregated
GROUP BY
group_by_day
ORDER BY
group_by_day;
"""
db_response = await prisma_client.db.query_raw(
sql_query, start_date_obj, end_date_obj
)
if db_response is None:
return []
return db_response
except Exception as e:
raise HTTPException(
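The `azure_api_key_header` scheme added above lets clients that send a bare `API-Key` header (as some older openai/Azure SDK versions do) authenticate against the proxy without an `Authorization: Bearer` header. A minimal sketch (assumes the `requests` package and a locally running proxy):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/chat/completions",
    # older Azure-style clients put the key in an `API-Key` header
    headers={"API-Key": "sk-1234"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "what llm are you"}],
    },
)
print(resp.status_code)
```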

View file

@ -2056,12 +2056,15 @@ class Router:
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
generic_fallback_idx: Optional[int] = None
## check for specific model group-specific fallbacks
if isinstance(fallbacks, list):
fallback_model_group = fallbacks
elif isinstance(fallbacks, dict):
for idx, item in enumerate(fallbacks):
if list(item.keys())[0] == model_group:
fallback_model_group = item[model_group]
break
elif list(item.keys())[0] == "*":
generic_fallback_idx = idx
## if none, check for generic fallback
if (
fallback_model_group is None
@ -2310,13 +2313,16 @@ class Router:
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}") verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
fallback_model_group = None fallback_model_group = None
generic_fallback_idx: Optional[int] = None generic_fallback_idx: Optional[int] = None
## check for specific model group-specific fallbacks if isinstance(fallbacks, list):
for idx, item in enumerate(fallbacks): fallback_model_group = fallbacks
if list(item.keys())[0] == model_group: elif isinstance(fallbacks, dict):
fallback_model_group = item[model_group] ## check for specific model group-specific fallbacks
break for idx, item in enumerate(fallbacks):
elif list(item.keys())[0] == "*": if list(item.keys())[0] == model_group:
generic_fallback_idx = idx fallback_model_group = item[model_group]
break
elif list(item.keys())[0] == "*":
generic_fallback_idx = idx
## if none, check for generic fallback ## if none, check for generic fallback
if ( if (
fallback_model_group is None fallback_model_group is None
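In practice this lets `fallbacks` be passed as a plain list for client-side, per-request fallbacks, alongside the existing `[{model_group: [...]}]` mapping. A minimal sketch mirroring the test added further below (model names and keys are illustrative):

```python
import os

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "bad-model",
            "litellm_params": {"model": "openai/my-bad-model", "api_key": "my-bad-api-key"},
        },
        {
            "model_name": "my-good-model",
            "litellm_params": {"model": "gpt-4o", "api_key": os.getenv("OPENAI_API_KEY")},
        },
    ],
)

# List form: try "bad-model" first, then fall back to "my-good-model".
response = router.completion(
    model="bad-model",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    fallbacks=["my-good-model"],
    mock_testing_fallbacks=True,    # force the fallback path, as in the test
    mock_response="Hey! nice day",  # keeps the sketch offline
)
```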
View file
@@ -345,7 +345,7 @@ def test_completion_claude_3_function_call(model):
drop_params=True, drop_params=True,
) )
# Add any assertions, here to check response args # Add any assertions here to check response args
print(response) print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str) assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance( assert isinstance(
@@ -530,6 +530,7 @@ def test_completion_cohere_command_r_plus_function_call():
messages=messages, messages=messages,
tools=tools, tools=tools,
tool_choice="auto", tool_choice="auto",
force_single_step=True,
) )
print(second_response) print(second_response)
except Exception as e: except Exception as e:
View file
@@ -517,3 +517,51 @@ def test_groq_response_cost_tracking(is_streaming):
assert response_cost > 0.0 assert response_cost > 0.0
print(f"response_cost: {response_cost}") print(f"response_cost: {response_cost}")
def test_together_ai_qwen_completion_cost():
input_kwargs = {
"completion_response": litellm.ModelResponse(
**{
"id": "890db0c33c4ef94b-SJC",
"choices": [
{
"finish_reason": "eos",
"index": 0,
"message": {
"content": "I am Qwen, a large language model created by Alibaba Cloud.",
"role": "assistant",
},
}
],
"created": 1717900130,
"model": "together_ai/qwen/Qwen2-72B-Instruct",
"object": "chat.completion",
"system_fingerprint": None,
"usage": {
"completion_tokens": 15,
"prompt_tokens": 23,
"total_tokens": 38,
},
}
),
"model": "qwen/Qwen2-72B-Instruct",
"prompt": "",
"messages": [],
"completion": "",
"total_time": 0.0,
"call_type": "completion",
"custom_llm_provider": "together_ai",
"region_name": None,
"size": None,
"quality": None,
"n": None,
"custom_cost_per_token": None,
"custom_cost_per_second": None,
}
response = litellm.cost_calculator.get_model_params_and_category(
model_name="qwen/Qwen2-72B-Instruct"
)
assert response == "together-ai-41.1b-80b"
View file
@@ -3,6 +3,7 @@ import os
import sys import sys
import traceback import traceback
import subprocess, asyncio import subprocess, asyncio
from typing import Any
sys.path.insert( sys.path.insert(
0, os.path.abspath("../..") 0, os.path.abspath("../..")
@@ -19,6 +20,7 @@ from litellm import (
) )
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
import pytest import pytest
from unittest.mock import patch, MagicMock
litellm.vertex_project = "pathrise-convert-1606954137718" litellm.vertex_project = "pathrise-convert-1606954137718"
litellm.vertex_location = "us-central1" litellm.vertex_location = "us-central1"
@@ -655,3 +657,47 @@ def test_litellm_predibase_exception():
# accuracy_score = counts[True]/(counts[True] + counts[False]) # accuracy_score = counts[True]/(counts[True] + counts[False])
# print(f"accuracy_score: {accuracy_score}") # print(f"accuracy_score: {accuracy_score}")
@pytest.mark.parametrize("provider", ["predibase"])
def test_exception_mapping(provider):
"""
For predibase, run through a set of mock exceptions
assert that they are being mapped correctly
"""
litellm.set_verbose = True
error_map = {
400: litellm.BadRequestError,
401: litellm.AuthenticationError,
404: litellm.NotFoundError,
408: litellm.Timeout,
429: litellm.RateLimitError,
500: litellm.InternalServerError,
503: litellm.ServiceUnavailableError,
}
for code, expected_exception in error_map.items():
mock_response = Exception()
setattr(mock_response, "text", "This is an error message")
setattr(mock_response, "llm_provider", provider)
setattr(mock_response, "status_code", code)
response: Any = None
try:
response = completion(
model="{}/test-model".format(provider),
messages=[{"role": "user", "content": "Hey, how's it going?"}],
mock_response=mock_response,
)
except expected_exception:
continue
except Exception as e:
response = "{}\n{}".format(str(e), traceback.format_exc())
pytest.fail(
"Did not raise expected exception. Expected={}, Return={},".format(
expected_exception, response
)
)
pass
View file
@@ -272,7 +272,7 @@ def test_call_with_invalid_key(prisma_client):
except Exception as e: except Exception as e:
print("Got Exception", e) print("Got Exception", e)
print(e.message) print(e.message)
assert "Authentication Error, Invalid token passed" in e.message assert "Authentication Error, Invalid proxy server token passed" in e.message
pass pass
View file
@@ -1059,3 +1059,53 @@ async def test_default_model_fallbacks(sync_mode, litellm_module_fallbacks):
assert isinstance(response, litellm.ModelResponse) assert isinstance(response, litellm.ModelResponse)
assert response.model is not None and response.model == "gpt-4o" assert response.model is not None and response.model == "gpt-4o"
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_client_side_fallbacks_list(sync_mode):
"""
Tests Client Side Fallbacks
User can pass "fallbacks": ["gpt-3.5-turbo"] and this should work
"""
router = Router(
model_list=[
{
"model_name": "bad-model",
"litellm_params": {
"model": "openai/my-bad-model",
"api_key": "my-bad-api-key",
},
},
{
"model_name": "my-good-model",
"litellm_params": {
"model": "gpt-4o",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
],
)
if sync_mode:
response = router.completion(
model="bad-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
fallbacks=["my-good-model"],
mock_testing_fallbacks=True,
mock_response="Hey! nice day",
)
else:
response = await router.acompletion(
model="bad-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
fallbacks=["my-good-model"],
mock_testing_fallbacks=True,
mock_response="Hey! nice day",
)
assert isinstance(response, litellm.ModelResponse)
assert response.model is not None and response.model == "gpt-4o"
View file
@@ -326,6 +326,22 @@ class Function(OpenAIObject):
super(Function, self).__init__(**data) super(Function, self).__init__(**data)
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
class ChatCompletionDeltaToolCall(OpenAIObject): class ChatCompletionDeltaToolCall(OpenAIObject):
id: Optional[str] = None id: Optional[str] = None
@@ -385,6 +401,22 @@ class ChatCompletionMessageToolCall(OpenAIObject):
else: else:
self.type = "function" self.type = "function"
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
class Message(OpenAIObject): class Message(OpenAIObject):
def __init__( def __init__(
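These helpers let `Function` and `ChatCompletionMessageToolCall` objects be read and written like dicts as well as through attributes. A small sketch, assuming the classes stay importable from `litellm.utils` (the import path is not shown in this diff):

```python
from litellm.utils import Function  # import path assumed

fn = Function(name="get_current_weather", arguments='{"location": "Boston"}')

assert "name" in fn                      # __contains__ checks for the attribute
print(fn["name"])                        # __getitem__ -> "get_current_weather"
print(fn.get("missing_key", "default"))  # .get() falls back to the default
fn["name"] = "get_weather"               # __setitem__ assigns the attribute
```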
@@ -3929,54 +3961,6 @@ def client(original_function):
return wrapper return wrapper
####### USAGE CALCULATOR ################
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name):
import re
model_name = model_name.lower()
params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if params_match != None:
params_match = params_match.group(1)
params_match = params_match.replace("b", "")
params_billion = float(params_match)
# Determine the category based on the number of parameters
if params_billion <= 3.0:
category = "together-ai-up-to-3b"
elif params_billion <= 7.0:
category = "together-ai-3.1b-7b"
elif params_billion <= 20.0:
category = "together-ai-7.1b-20b"
elif params_billion <= 40.0:
category = "together-ai-20.1b-40b"
elif params_billion <= 70.0:
category = "together-ai-40.1b-70b"
return category
return None
def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
# see https://replicate.com/pricing
a100_40gb_price_per_second_public = 0.001150
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now
)
if total_time == 0.0: # total time is in ms
start_time = completion_response["created"]
end_time = getattr(completion_response, "ended", time.time())
total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time / 1000
@lru_cache(maxsize=128) @lru_cache(maxsize=128)
def _select_tokenizer(model: str): def _select_tokenizer(model: str):
if model in litellm.cohere_models and "command-r" in model: if model in litellm.cohere_models and "command-r" in model:
@@ -4363,7 +4347,7 @@ def _cost_per_token_custom_pricing_helper(
def cost_per_token( def cost_per_token(
model="", model: str = "",
prompt_tokens=0, prompt_tokens=0,
completion_tokens=0, completion_tokens=0,
response_time_ms=None, response_time_ms=None,
@@ -4388,6 +4372,8 @@ def cost_per_token(
Returns: Returns:
tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively. tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively.
""" """
if model is None:
raise Exception("Invalid arg. Model cannot be none.")
## CUSTOM PRICING ## ## CUSTOM PRICING ##
response_cost = _cost_per_token_custom_pricing_helper( response_cost = _cost_per_token_custom_pricing_helper(
prompt_tokens=prompt_tokens, prompt_tokens=prompt_tokens,
@@ -4560,213 +4546,6 @@ def cost_per_token(
) )
def completion_cost(
completion_response=None,
model=None,
prompt="",
messages: List = [],
completion="",
total_time=0.0, # used for replicate, sagemaker
call_type: Literal[
"embedding",
"aembedding",
"completion",
"acompletion",
"atext_completion",
"text_completion",
"image_generation",
"aimage_generation",
"moderation",
"amoderation",
"atranscription",
"transcription",
"aspeech",
"speech",
] = "completion",
### REGION ###
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ###
size=None,
quality=None,
n=None, # number of images
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> float:
"""
Calculate the cost of a given completion call fot GPT-3.5-turbo, llama2, any litellm supported llm.
Parameters:
completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request.
[OPTIONAL PARAMS]
model (str): Optional. The name of the language model used in the completion calls
prompt (str): Optional. The input prompt passed to the llm
completion (str): Optional. The output completion text from the llm
total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
Returns:
float: The cost in USD dollars for the completion based on the provided parameters.
Exceptions:
Raises exception if model not in the litellm model cost map. Register model, via custom pricing or PR - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
Note:
- If completion_response is provided, the function extracts token information and the model name from it.
- If completion_response is not provided, the function calculates token counts based on the model and input text.
- The cost is calculated based on the model, prompt tokens, and completion tokens.
- For certain models containing "togethercomputer" in the name, prices are based on the model size.
- For un-mapped Replicate models, the cost is calculated based on the total time used for the request.
"""
try:
if (
(call_type == "aimage_generation" or call_type == "image_generation")
and model is not None
and isinstance(model, str)
and len(model) == 0
and custom_llm_provider == "azure"
):
model = "dall-e-2" # for dall-e-2, azure expects an empty model name
# Handle Inputs to completion_cost
prompt_tokens = 0
completion_tokens = 0
custom_llm_provider = None
if completion_response is not None:
# get input/output tokens from completion_response
prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0)
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
total_time = completion_response.get("_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
)
model = model or completion_response.get(
"model", None
) # check if user passed an override for model, if it's none check completion_response['model']
if hasattr(completion_response, "_hidden_params"):
if (
completion_response._hidden_params.get("model", None) is not None
and len(completion_response._hidden_params["model"]) > 0
):
model = completion_response._hidden_params.get("model", model)
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", ""
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
)
size = completion_response._hidden_params.get(
"optional_params", {}
).get(
"size", "1024-x-1024"
) # openai default
quality = completion_response._hidden_params.get(
"optional_params", {}
).get(
"quality", "standard"
) # openai default
n = completion_response._hidden_params.get("optional_params", {}).get(
"n", 1
) # openai default
else:
if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages)
elif len(prompt) > 0:
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
if model == None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if (
call_type == CallTypes.image_generation.value
or call_type == CallTypes.aimage_generation.value
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
# https://cloud.google.com/vertex-ai/generative-ai/pricing
# Vertex Charges Flat $0.20 per image
return 0.020
# fix size to match naming convention
if "x" in size and "-x-" not in size:
size = size.replace("x", "-x-")
image_gen_model_name = f"{size}/{model}"
image_gen_model_name_with_quality = image_gen_model_name
if quality is not None:
image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}"
size = size.split("-x-")
height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024
width = int(size[1])
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
verbose_logger.debug(
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
)
if image_gen_model_name in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"]
* height
* width
* n
)
elif image_gen_model_name_with_quality in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name_with_quality][
"input_cost_per_pixel"
]
* height
* width
* n
)
else:
raise Exception(
f"Model={image_gen_model_name} not found in completion cost model map"
)
# Calculate cost based on prompt_tokens, completion_tokens
if (
"togethercomputer" in model
or "together_ai" in model
or custom_llm_provider == "together_ai"
):
# together ai prices based on size of llm
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
model = get_model_params_and_category(model)
# replicate llms are calculate based on time for request running
# see https://replicate.com/pricing
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = cost_per_token(
model=model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider,
response_time_ms=total_time,
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e:
raise e
def supports_httpx_timeout(custom_llm_provider: str) -> bool: def supports_httpx_timeout(custom_llm_provider: str) -> bool:
""" """
Helper function to know if a provider implementation supports httpx timeout Helper function to know if a provider implementation supports httpx timeout
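The cost helpers removed here move out of `utils.py`. Assuming the public `litellm.completion_cost` entry point keeps working unchanged (as the cost-tracking tests above exercise), a minimal usage sketch:

```python
import litellm

# mock_response avoids a real API call; the returned ModelResponse still carries
# a model name and usage block that completion_cost can price.
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello who are you"}],
    mock_response="I am a mocked assistant reply",
)

cost = litellm.completion_cost(completion_response=response)
print(f"estimated cost: ${cost:.6f}")
```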
@@ -8986,6 +8765,75 @@ def exception_type(
response=original_exception.response, response=original_exception.response,
litellm_debug_info=extra_information, litellm_debug_info=extra_information,
) )
elif hasattr(original_exception, "status_code"):
if original_exception.status_code == 500:
exception_mapping_worked = True
raise litellm.InternalServerError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 401:
exception_mapping_worked = True
raise AuthenticationError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 400:
exception_mapping_worked = True
raise BadRequestError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 404:
exception_mapping_worked = True
raise NotFoundError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
raise Timeout(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 422:
exception_mapping_worked = True
raise BadRequestError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 429:
exception_mapping_worked = True
raise RateLimitError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 503:
exception_mapping_worked = True
raise ServiceUnavailableError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 504: # gateway timeout error
exception_mapping_worked = True
raise Timeout(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif custom_llm_provider == "bedrock": elif custom_llm_provider == "bedrock":
if ( if (
"too many tokens" in error_str "too many tokens" in error_str
View file
@@ -3009,32 +3009,37 @@
"litellm_provider": "sagemaker", "litellm_provider": "sagemaker",
"mode": "chat" "mode": "chat"
}, },
"together-ai-up-to-3b": { "together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001, "input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001, "output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai" "litellm_provider": "together_ai"
}, },
"together-ai-3.1b-7b": { "together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002, "input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002, "output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai" "litellm_provider": "together_ai"
}, },
"together-ai-7.1b-20b": { "together-ai-8.1b-21b": {
"max_tokens": 1000, "max_tokens": 1000,
"input_cost_per_token": 0.0000004, "input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000004, "output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai" "litellm_provider": "together_ai"
}, },
"together-ai-20.1b-40b": { "together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008, "input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008, "output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai" "litellm_provider": "together_ai"
}, },
"together-ai-40.1b-70b": { "together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009, "input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009, "output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai" "litellm_provider": "together_ai"
}, },
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": { "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006, "input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006, "output_cost_per_token": 0.0000006,
View file
@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "litellm" name = "litellm"
version = "1.40.7" version = "1.40.8"
description = "Library to easily interface with LLM API providers" description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"] authors = ["BerriAI"]
license = "MIT" license = "MIT"
@@ -84,7 +84,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"
[tool.commitizen] [tool.commitizen]
version = "1.40.7" version = "1.40.8"
version_files = [ version_files = [
"pyproject.toml:^version" "pyproject.toml:^version"
] ]
View file
@@ -1,3 +1,3 @@
ignore = ["F405"] ignore = ["F405", "E402"]
extend-select = ["E501"] extend-select = ["E501"]
line-length = 120 line-length = 120
View file
@@ -119,9 +119,24 @@ const ChatUI: React.FC<ChatUIProps> = ({
// Now, 'options' contains the list you wanted // Now, 'options' contains the list you wanted
console.log(options); // You can log it to verify the list console.log(options); // You can log it to verify the list
// setModelInfo(options) should be inside the if block to avoid setting it when no data is available // if options.length > 0, only store unique values
setModelInfo(options); if (options.length > 0) {
const uniqueModels = Array.from(new Set(options));
console.log("Unique models:", uniqueModels);
// sort uniqueModels alphabetically
uniqueModels.sort((a: any, b: any) => a.label.localeCompare(b.label));
console.log("Model info:", modelInfo);
// setModelInfo(options) should be inside the if block to avoid setting it when no data is available
setModelInfo(uniqueModels);
}
setSelectedModel(fetchedAvailableModels.data[0].id); setSelectedModel(fetchedAvailableModels.data[0].id);
} }
} catch (error) { } catch (error) {
View file
@@ -1130,7 +1130,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
setSelectedAPIKey(key); setSelectedAPIKey(key);
}} }}
> >
{key["key_alias"]} (Enterpise only Feature) {key["key_alias"]} (Enterprise only Feature)
</SelectItem> </SelectItem>
); );
} }
@@ -1165,7 +1165,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
setSelectedCustomer(user); setSelectedCustomer(user);
}} }}
> >
{user} (Enterpise only Feature) {user} (Enterprise only Feature)
</SelectItem> </SelectItem>
); );
}) })
View file
@@ -114,7 +114,7 @@ const Navbar: React.FC<NavbarProps> = ({
textDecoration: "underline", textDecoration: "underline",
}} }}
> >
Get enterpise license Get enterprise license
</a> </a>
</div> </div>
) : null} ) : null}
View file
@@ -832,7 +832,7 @@ const UsagePage: React.FC<UsagePageProps> = ({
// @ts-ignore // @ts-ignore
disabled={true} disabled={true}
> >
{tag} (Enterpise only Feature) {tag} (Enterprise only Feature)
</SelectItem> </SelectItem>
); );
})} })}