LiteLLM Minor Fixes & Improvements (10/05/2024) (#6083)

* docs(prompt_caching.md): add prompt caching cost calc example to docs

* docs(prompt_caching.md): add proxy examples to docs

* feat(utils.py): expose new helper `supports_prompt_caching()` to check if a model supports prompt caching

* docs(prompt_caching.md): add docs on checking model support for prompt caching

* build: fix invalid json
Krish Dholakia 2024-10-05 18:59:11 -04:00 committed by GitHub
parent fac3b2ee42
commit f2c0a31e3c
7 changed files with 459 additions and 59 deletions


@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Prompt Caching
For OpenAI + Anthropic + Deepseek, LiteLLM follows the OpenAI prompt caching usage object format:
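For reference, here is a minimal sketch (not part of this diff) of where the cached-token count surfaces on a response, assuming the OpenAI-style usage object described above:
```python
# Sketch only: assumes LiteLLM normalizes usage to the OpenAI prompt caching format,
# i.e. cached tokens are reported under usage.prompt_tokens_details.
import os
from litellm import completion

os.environ["OPENAI_API_KEY"] = ""

response = completion(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hello"}],
)

usage = response.usage
print(usage.prompt_tokens)  # total prompt tokens for the request
# cached_tokens stays 0 until a >=1024-token prefix has been cached by a prior request
print(usage.prompt_tokens_details.cached_tokens)
```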
@ -30,6 +33,9 @@ For OpenAI + Anthropic + Deepseek, LiteLLM follows the OpenAI prompt caching usa
Note: OpenAI caching is only available for prompts containing 1024 tokens or more
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
@ -87,6 +93,90 @@ assert "prompt_tokens_details" in response.usage
assert response.usage.prompt_tokens_details.cached_tokens > 0
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
  - model_name: gpt-4o
    litellm_params:
      model: openai/gpt-4o
      api_key: os.environ/OPENAI_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python
from openai import OpenAI
import os
client = OpenAI(
    api_key="LITELLM_PROXY_KEY", # sk-1234
    base_url="LITELLM_PROXY_BASE" # http://0.0.0.0:4000
)
for _ in range(2):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            # System Message
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement" * 400,
                    }
                ],
            },
            # OpenAI caches the long shared prefix above automatically - no cache_control marker is needed.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "Certainly! The key terms and conditions are the following: the contract is 1 year long for $10/mo",
            },
            # Sending the same long prefix in this follow-up turn lets it read from the cache.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                    }
                ],
            },
        ],
        temperature=0.2,
        max_tokens=10,
    )
print("response=", response)
print("response.usage=", response.usage)
assert "prompt_tokens_details" in response.usage
assert response.usage.prompt_tokens_details.cached_tokens > 0
```
</TabItem>
</Tabs>
### Anthropic Example
Anthropic charges for cache writes.
@ -95,6 +185,9 @@ Specify the content to cache with `"cache_control": {"type": "ephemeral"}`.
If you pass that in for any other llm provider, it will be ignored.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import litellm
@ -129,6 +222,65 @@ response = completion(
print(response.usage)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
  - model_name: claude-3-5-sonnet-20240620
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20240620
      api_key: os.environ/ANTHROPIC_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python
from openai import OpenAI
import os
client = OpenAI(
    api_key="LITELLM_PROXY_KEY", # sk-1234
    base_url="LITELLM_PROXY_BASE" # http://0.0.0.0:4000
)

response = client.chat.completions.create(
    model="claude-3-5-sonnet-20240620",
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are an AI assistant tasked with analyzing legal documents.",
                },
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement" * 400,
                    "cache_control": {"type": "ephemeral"},
                },
            ],
        },
        {
            "role": "user",
            "content": "what are the key terms and conditions in this agreement?",
        },
    ]
)
print(response.usage)
```
</TabItem>
</Tabs>
### Deepseek Example
@ -196,4 +348,155 @@ response_2 = litellm.completion(model=model_name, messages=message_2)
# Add any assertions here to check the response
print(response_2.usage)
```
## Calculate Cost
The cost of cache-hit prompt tokens can differ from the cost of cache-miss prompt tokens.
Use the `completion_cost()` function for calculating cost ([handles prompt caching cost calculation](https://github.com/BerriAI/litellm/blob/f7ce1173f3315cc6cae06cf9bcf12e54a2a19705/litellm/llms/anthropic/cost_calculation.py#L12) as well). [**See more helper functions**](./token_usage.md)
```python
cost = completion_cost(completion_response=response, model=model)
```
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion, completion_cost
import litellm
import os
litellm.set_verbose = True # 👈 SEE RAW REQUEST
os.environ["ANTHROPIC_API_KEY"] = ""
model = "anthropic/claude-3-5-sonnet-20240620"
response = completion(
    model=model,
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are an AI assistant tasked with analyzing legal documents.",
                },
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement" * 400,
                    "cache_control": {"type": "ephemeral"},
                },
            ],
        },
        {
            "role": "user",
            "content": "what are the key terms and conditions in this agreement?",
        },
    ]
)
print(response.usage)
cost = completion_cost(completion_response=response, model=model)
formatted_string = f"${float(cost):.10f}"
print(formatted_string)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
LiteLLM returns the calculated cost in the `x-litellm-response-cost` response header.
```python
from openai import OpenAI
client = OpenAI(
    api_key="LITELLM_PROXY_KEY", # sk-1234..
    base_url="LITELLM_PROXY_BASE" # http://0.0.0.0:4000
)

response = client.chat.completions.with_raw_response.create(
    messages=[{
        "role": "user",
        "content": "Say this is a test",
    }],
    model="gpt-3.5-turbo",
)
print(response.headers.get('x-litellm-response-cost'))
completion = response.parse() # get the object that `chat.completions.create()` would have returned
print(completion)
```
</TabItem>
</Tabs>
## Check Model Support
Check if a model supports prompt caching with `supports_prompt_caching()`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm.utils import supports_prompt_caching
supports_pc: bool = supports_prompt_caching(model="anthropic/claude-3-5-sonnet-20240620")
assert supports_pc
```
</TabItem>
<TabItem value="proxy" label="PROXY">
Use the `/model/info` endpoint to check if a model on the proxy supports prompt caching
1. Setup config.yaml
```yaml
model_list:
  - model_name: claude-3-5-sonnet-20240620
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20240620
      api_key: os.environ/ANTHROPIC_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \
    -H 'Authorization: Bearer sk-1234'
```
**Expected Response**
```bash
{
"data": [
{
"model_name": "claude-3-5-sonnet-20240620",
"litellm_params": {
"model": "anthropic/claude-3-5-sonnet-20240620"
},
"model_info": {
"key": "claude-3-5-sonnet-20240620",
...
"supports_prompt_caching": true # 👈 LOOK FOR THIS!
}
}
]
}
```
</TabItem>
</Tabs>
This checks our maintained [model info/cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
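For a rough sense of what the helper reads from that map, here is a hedged sketch using `litellm.get_model_info()` (the same call the new `supports_prompt_caching()` helper makes in the `utils.py` diff below); treat the exact lookup as illustrative:
```python
# Sketch: look up the supports_prompt_caching flag straight from the cost map.
# get_model_info() is the call used inside supports_prompt_caching().
import litellm

info = litellm.get_model_info(model="gpt-4o", custom_llm_provider="openai")
print(info.get("supports_prompt_caching", False))  # True if the map marks the model
```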


@ -9,7 +9,8 @@
"mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4": {
"max_tokens": 4096,
@ -19,7 +20,8 @@
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4o": {
"max_tokens": 4096,
@ -129,7 +131,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4o-2024-05-13": {
"max_tokens": 4096,
@ -141,7 +144,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4o-2024-08-06": {
"max_tokens": 16384,
@ -166,7 +170,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-0314": {
"max_tokens": 4096,
@ -175,7 +180,8 @@
"input_cost_per_token": 0.00003,
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-0613": {
"max_tokens": 4096,
@ -185,7 +191,8 @@
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-32k": {
"max_tokens": 4096,
@ -194,7 +201,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-32k-0314": {
"max_tokens": 4096,
@ -203,7 +211,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-32k-0613": {
"max_tokens": 4096,
@ -212,7 +221,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-turbo": {
"max_tokens": 4096,
@ -224,7 +234,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-turbo-2024-04-09": {
"max_tokens": 4096,
@ -236,7 +247,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-1106-preview": {
"max_tokens": 4096,
@ -247,7 +259,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-0125-preview": {
"max_tokens": 4096,
@ -258,7 +271,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-vision-preview": {
"max_tokens": 4096,
@ -268,7 +282,8 @@
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-1106-vision-preview": {
"max_tokens": 4096,
@ -278,7 +293,8 @@
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo": {
"max_tokens": 4097,
@ -288,7 +304,8 @@
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0301": {
"max_tokens": 4097,
@ -297,7 +314,8 @@
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0613": {
"max_tokens": 4097,
@ -307,7 +325,8 @@
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-1106": {
"max_tokens": 16385,
@ -318,7 +337,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0125": {
"max_tokens": 16385,
@ -329,7 +349,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-16k": {
"max_tokens": 16385,
@ -338,7 +359,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-3.5-turbo-16k-0613": {
"max_tokens": 16385,
@ -347,7 +369,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"ft:gpt-3.5-turbo": {
"max_tokens": 4096,


@ -1,9 +1,5 @@
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["gcs_bucket"]
- model_name: claude-3-5-sonnet-20240620
litellm_params:
model: anthropic/claude-3-5-sonnet-20240620
api_key: os.environ/ANTHROPIC_API_KEY


@ -2179,6 +2179,40 @@ def supports_function_calling(
)
def supports_prompt_caching(
    model: str, custom_llm_provider: Optional[str] = None
) -> bool:
    """
    Check if the given model supports prompt caching and return a boolean value.

    Parameters:
        model (str): The model name to be checked.
        custom_llm_provider (Optional[str]): The provider to be checked.

    Returns:
        bool: True if the model supports prompt caching, False otherwise.

    Raises:
        Exception: If the given model is not found or there's an error in retrieval.
    """
    try:
        model, custom_llm_provider, _, _ = litellm.get_llm_provider(
            model=model, custom_llm_provider=custom_llm_provider
        )

        model_info = litellm.get_model_info(
            model=model, custom_llm_provider=custom_llm_provider
        )

        if model_info.get("supports_prompt_caching", False) is True:
            return True
        return False
    except Exception as e:
        raise Exception(
            f"Model not found or error in checking prompt caching support. You passed model={model}, custom_llm_provider={custom_llm_provider}. Error: {str(e)}"
        )


def supports_vision(model: str, custom_llm_provider: Optional[str] = None) -> bool:
    """
    Check if the given model supports vision and return a boolean value.


@ -9,7 +9,8 @@
"mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4": {
"max_tokens": 4096,
@ -19,7 +20,8 @@
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4o": {
"max_tokens": 4096,
@ -129,7 +131,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4o-2024-05-13": {
"max_tokens": 4096,
@ -141,7 +144,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4o-2024-08-06": {
"max_tokens": 16384,
@ -166,7 +170,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-0314": {
"max_tokens": 4096,
@ -175,7 +180,8 @@
"input_cost_per_token": 0.00003,
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-0613": {
"max_tokens": 4096,
@ -185,7 +191,8 @@
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-32k": {
"max_tokens": 4096,
@ -194,7 +201,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-32k-0314": {
"max_tokens": 4096,
@ -203,7 +211,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-32k-0613": {
"max_tokens": 4096,
@ -212,7 +221,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-turbo": {
"max_tokens": 4096,
@ -224,7 +234,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-turbo-2024-04-09": {
"max_tokens": 4096,
@ -236,7 +247,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-1106-preview": {
"max_tokens": 4096,
@ -247,7 +259,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-0125-preview": {
"max_tokens": 4096,
@ -258,7 +271,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-vision-preview": {
"max_tokens": 4096,
@ -268,7 +282,8 @@
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-1106-vision-preview": {
"max_tokens": 4096,
@ -278,7 +293,8 @@
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo": {
"max_tokens": 4097,
@ -288,7 +304,8 @@
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0301": {
"max_tokens": 4097,
@ -297,7 +314,8 @@
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0613": {
"max_tokens": 4097,
@ -307,7 +325,8 @@
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-1106": {
"max_tokens": 16385,
@ -318,7 +337,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0125": {
"max_tokens": 16385,
@ -329,7 +349,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-16k": {
"max_tokens": 16385,
@ -338,7 +359,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-3.5-turbo-16k-0613": {
"max_tokens": 16385,
@ -347,7 +369,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"ft:gpt-3.5-turbo": {
"max_tokens": 4096,


@ -1,4 +1,6 @@
import os, sys, traceback
import importlib.resources
import json
sys.path.insert(
    0, os.path.abspath("../..")
@ -6,7 +8,18 @@ sys.path.insert(
import litellm
import pytest
try:
    print(litellm.get_model_cost_map(url="fake-url"))
except Exception as e:
    pytest.fail(f"An exception occurred: {e}")


def test_get_model_cost_map():
    try:
        print(litellm.get_model_cost_map(url="fake-url"))
    except Exception as e:
        pytest.fail(f"An exception occurred: {e}")


def test_get_backup_model_cost_map():
    with importlib.resources.open_text(
        "litellm", "model_prices_and_context_window_backup.json"
    ) as f:
        print("inside backup")
        content = json.load(f)
        print("content", content)


@ -111,3 +111,11 @@ def test_prompt_caching_model(model):
    # assert (response.usage.cache_read_input_tokens > 0) or (
    #     response.usage.cache_creation_input_tokens > 0
    # )


def test_supports_prompt_caching():
    from litellm.utils import supports_prompt_caching

    supports_pc = supports_prompt_caching(model="anthropic/claude-3-5-sonnet-20240620")

    assert supports_pc