anthropic prompt caching cost tracking (#5453)

* fix(utils.py): support 'drop_params' for embedding requests

Fixes https://github.com/BerriAI/litellm/issues/5444

* feat(anthropic/cost_calculation.py): Support calculating cost for prompt caching on anthropic

* feat(types/utils.py): track cache tokens as hidden params on Usage, so we can migrate to openai's equivalent once that comes out

* fix: fix linting errors

* test: mark flaky test
Krish Dholakia 2024-08-31 14:09:35 -07:00 committed by Ishaan Jaff
parent de9efe76ca
commit 47ef1f9191
17 changed files with 432 additions and 84 deletions
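As a quick illustration of the drop_params fix, here is a minimal sketch of the embedding call it is meant to unblock. The model name is illustrative, and it assumes the per-call drop_params flag is forwarded from the top-level embedding() call down to get_optional_params_embeddings(), as the linked issue describes:

import litellm

# 'user' is not a supported param for Gemini embeddings; with drop_params=True
# it should now be dropped instead of raising UnsupportedParamsError.
response = litellm.embedding(
    model="gemini/text-embedding-004",  # illustrative model name
    input=["hello world"],
    user="John",
    drop_params=True,
)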

View file

@@ -843,10 +843,10 @@ ALL_LITELLM_RESPONSE_TYPES = [
from .types.utils import ImageObject
from .llms.custom_llm import CustomLLM
from .llms.huggingface_restapi import HuggingfaceConfig
from .llms.anthropic import AnthropicConfig
from .llms.anthropic.chat import AnthropicConfig
from .llms.anthropic.completion import AnthropicTextConfig
from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig
from .llms.predibase import PredibaseConfig
from .llms.anthropic_text import AnthropicTextConfig
from .llms.replicate import ReplicateConfig
from .llms.cohere.completion import CohereConfig
from .llms.clarifai import ClarifaiConfig

View file

@@ -19,8 +19,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import (
cost_router as google_cost_router,
)
from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
from litellm.llms.anthropic.cost_calculation import (
cost_per_token as anthropic_cost_per_token,
)
from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
from litellm.types.utils import Usage
from litellm.utils import (
CallTypes,
CostPerToken,
@@ -59,14 +63,17 @@ def _cost_per_token_custom_pricing_helper(
def cost_per_token(
model: str = "",
prompt_tokens: float = 0,
completion_tokens: float = 0,
prompt_tokens: int = 0,
completion_tokens: int = 0,
response_time_ms=None,
custom_llm_provider: Optional[str] = None,
region_name=None,
### CHARACTER PRICING ###
prompt_characters: float = 0,
completion_characters: float = 0,
prompt_characters: int = 0,
completion_characters: int = 0,
### PROMPT CACHING PRICING ### - used for anthropic
cache_creation_input_tokens: Optional[int] = 0,
cache_read_input_tokens: Optional[int] = 0,
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
@@ -108,6 +115,16 @@ def cost_per_token(
"""
if model is None:
raise Exception("Invalid arg. Model cannot be none.")
## RECONSTRUCT USAGE BLOCK ##
usage_block = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
cache_creation_input_tokens=cache_creation_input_tokens,
cache_read_input_tokens=cache_read_input_tokens,
)
## CUSTOM PRICING ##
response_cost = _cost_per_token_custom_pricing_helper(
prompt_tokens=prompt_tokens,
@@ -137,6 +154,7 @@ def cost_per_token(
model_with_provider = model_with_provider_and_region
else:
_, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
model_without_prefix = model
model_parts = model.split("/")
if len(model_parts) > 1:
@@ -162,6 +180,7 @@ def cost_per_token(
# see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
print_verbose(f"Looking up model={model} in model_cost_map")
if custom_llm_provider == "vertex_ai":
cost_router = google_cost_router(
model=model_without_prefix,
@@ -188,6 +207,8 @@ def cost_per_token(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
elif custom_llm_provider == "anthropic":
return anthropic_cost_per_token(model=model, usage=usage_block)
elif custom_llm_provider == "gemini":
return google_cost_per_token(
model=model_without_prefix,
@@ -520,6 +541,8 @@ def completion_cost(
prompt_characters = 0
completion_tokens = 0
completion_characters = 0
cache_creation_input_tokens: Optional[int] = None
cache_read_input_tokens: Optional[int] = None
if completion_response is not None and (
isinstance(completion_response, BaseModel)
or isinstance(completion_response, dict)
@@ -541,6 +564,13 @@ def completion_cost(
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
cache_creation_input_tokens = completion_response.get("usage", {}).get(
"cache_creation_input_tokens", 0
)
cache_read_input_tokens = completion_response.get("usage", {}).get(
"cache_read_input_tokens", 0
)
total_time = getattr(completion_response, "_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} "
@@ -550,7 +580,7 @@
)
if hasattr(completion_response, "_hidden_params"):
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", custom_llm_provider or ""
"custom_llm_provider", custom_llm_provider or None
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
@@ -697,6 +727,8 @@ def completion_cost(
custom_cost_per_token=custom_cost_per_token,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
cache_creation_input_tokens=cache_creation_input_tokens,
cache_read_input_tokens=cache_read_input_tokens,
call_type=call_type,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
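A hedged sketch of how the two new cache kwargs could be exercised directly; the token counts are made up, and it assumes cost_per_token is re-exported at the package level:

from litellm import cost_per_token

prompt_usd, completion_usd = cost_per_token(
    model="claude-3-5-sonnet-20240620",
    custom_llm_provider="anthropic",
    prompt_tokens=14,
    completion_tokens=10,
    cache_creation_input_tokens=100,  # new kwarg added in this diff
    cache_read_input_tokens=0,        # new kwarg added in this diff
)
# With custom_llm_provider="anthropic", the reconstructed usage block is routed
# to anthropic_cost_per_token(), so the cache tokens are priced in.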

View file

@@ -1,3 +1,7 @@
"""
Calling + translation logic for anthropic's `/v1/messages` endpoint
"""
import copy
import json
import os
@@ -70,8 +74,8 @@ from litellm.types.llms.openai import (
from litellm.types.utils import Choices, GenericStreamingChunk
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
from ..base import BaseLLM
from ..prompt_templates.factory import custom_prompt, prompt_factory
class AnthropicConstants(Enum):
@@ -982,7 +986,7 @@ class AnthropicChatCompletion(BaseLLM):
)
except Exception as e:
verbose_logger.exception(
"litellm.llms.anthropic.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
"litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
str(e), messages
)
)

View file

@@ -1,3 +1,7 @@
"""
Translation logic for anthropic's `/v1/complete` endpoint
"""
import json
import os
import time
@@ -12,8 +16,8 @@ import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
from ..base import BaseLLM
from ..prompt_templates.factory import custom_prompt, prompt_factory
class AnthropicConstants(Enum):

View file

@@ -0,0 +1,42 @@
"""
Helper util for handling anthropic-specific cost calculation
- e.g.: prompt caching
"""
from typing import Tuple
from litellm.types.utils import Usage
from litellm.utils import get_model_info
def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
"""
Calculates the cost per token for a given model, prompt tokens, and completion tokens.
Input:
- model: str, the model name without provider prefix
- usage: LiteLLM Usage block, containing anthropic caching information
Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
"""
## GET MODEL INFO
model_info = get_model_info(model=model, custom_llm_provider="anthropic")
## CALCULATE INPUT COST
prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]
if model_info.get("cache_creation_input_token_cost") is not None:
prompt_cost += (
usage._cache_creation_input_tokens # type: ignore
* model_info["cache_creation_input_token_cost"]
)
if model_info.get("cache_read_input_token_cost") is not None:
prompt_cost += (
usage._cache_read_input_tokens * model_info["cache_read_input_token_cost"] # type: ignore
)
## CALCULATE OUTPUT COST
completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"]
return prompt_cost, completion_cost
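Roughly how this helper is fed by the dispatcher in the cost calculator above, as a standalone sketch with arbitrary token counts:

from litellm.llms.anthropic.cost_calculation import cost_per_token
from litellm.types.utils import Usage

usage = Usage(
    prompt_tokens=14,
    completion_tokens=10,
    total_tokens=24,
    cache_creation_input_tokens=100,  # billed at cache_creation_input_token_cost
    cache_read_input_tokens=0,        # billed at cache_read_input_token_cost
)
prompt_usd, completion_usd = cost_per_token(
    model="claude-3-5-sonnet-20240620", usage=usage
)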

View file

@@ -1,11 +1,14 @@
## This is a template base class to be used for adding new LLM providers via API calls
from typing import Any, Optional, Union
import httpx
import requests
import litellm
import httpx, requests
from typing import Optional, Union
from litellm.litellm_core_utils.litellm_logging import Logging
class BaseLLM:
_client_session: Optional[httpx.Client] = None
def process_response(
@@ -14,7 +17,7 @@ class BaseLLM:
response: Union[requests.Response, httpx.Response],
model_response: litellm.utils.ModelResponse,
stream: bool,
logging_obj: Logging,
logging_obj: Any,
optional_params: dict,
api_key: str,
data: Union[dict, str],
@@ -33,7 +36,7 @@ class BaseLLM:
response: Union[requests.Response, httpx.Response],
model_response: litellm.utils.TextCompletionResponse,
stream: bool,
logging_obj: Logging,
logging_obj: Any,
optional_params: dict,
api_key: str,
data: Union[dict, str],

View file

@@ -267,18 +267,19 @@ def completion(
):
try:
import vertexai
from anthropic import AnthropicVertex
from litellm.llms.anthropic import AnthropicChatCompletion
from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
VertexLLM,
)
except:
raise VertexAIError(
status_code=400,
message="""vertexai import failed please run `pip install -U google-cloud-aiplatform "anthropic[vertex]"`""",
)
from anthropic import AnthropicVertex
from litellm.llms.anthropic.chat import AnthropicChatCompletion
from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
VertexLLM,
)
if not (
hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
):

View file

@@ -77,13 +77,10 @@ from .caching import disable_cache, enable_cache, update_cache
from .llms import (
ai21,
aleph_alpha,
anthropic_text,
baseten,
bedrock,
clarifai,
cloudflare,
gemini,
huggingface_restapi,
maritalk,
nlp_cloud,
ollama,
@@ -93,13 +90,10 @@ from .llms import (
palm,
petals,
replicate,
together_ai,
triton,
vllm,
watsonx,
)
from .llms.anthropic import AnthropicChatCompletion
from .llms.anthropic_text import AnthropicTextCompletion
from .llms.anthropic.chat import AnthropicChatCompletion
from .llms.anthropic.completion import AnthropicTextCompletion
from .llms.azure import AzureChatCompletion, _check_dynamic_azure_params
from .llms.azure_text import AzureTextCompletion
from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM

View file

@@ -1336,6 +1336,8 @@
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.00000125,
"cache_creation_input_token_cost": 0.0000003,
"cache_read_input_token_cost": 0.00000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
@@ -1349,6 +1351,8 @@
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"cache_creation_input_token_cost": 0.00001875,
"cache_read_input_token_cost": 0.0000015,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
@@ -1375,6 +1379,8 @@
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"cache_creation_input_token_cost": 0.00000375,
"cache_read_input_token_cost": 0.0000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,

View file

@@ -1,4 +1,4 @@
model_list:
- model_name: "gemini/*"
- model_name: "gpt-3.5-turbo"
litellm_params:
model: "gemini/*"
model: "gpt-3.5-turbo"

View file

@@ -10,7 +10,7 @@ from dotenv import load_dotenv
import litellm.types
import litellm.types.utils
from litellm.llms.anthropic import ModelResponseIterator
from litellm.llms.anthropic.chat import ModelResponseIterator
load_dotenv()
import io
@@ -152,48 +152,190 @@ def test_anthropic_completion_e2e(stream):
anthropic_chunk_list = [
{"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "To"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " answer"}},
{"type": "content_block_delta", "index": 0,
"delta": {"type": "text_delta", "text": " your question about the weather"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " in Boston and Los"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " Angeles today, I'll"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " need to"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " use"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " the"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " get_current_weather"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " function"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " both"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " cities"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": ". Let"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " me fetch"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " that"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " information"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}},
{"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " you."}},
{
"type": "content_block_start",
"index": 0,
"content_block": {"type": "text", "text": ""},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": "To"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " answer"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " your question about the weather"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " in Boston and Los"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " Angeles today, I'll"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " need to"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " use"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " the"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " get_current_weather"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " function"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " for"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " both"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " cities"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": ". Let"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " me fetch"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " that"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " information"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " for"},
},
{
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": " you."},
},
{"type": "content_block_stop", "index": 0},
{"type": "content_block_start", "index": 1,
"content_block": {"type": "tool_use", "id": "toolu_12345", "name": "get_current_weather", "input": {}}},
{"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": ""}},
{"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "{\"locat"}},
{"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ion\": \"Bos"}},
{"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ton, MA\"}"}},
{
"type": "content_block_start",
"index": 1,
"content_block": {
"type": "tool_use",
"id": "toolu_12345",
"name": "get_current_weather",
"input": {},
},
},
{
"type": "content_block_delta",
"index": 1,
"delta": {"type": "input_json_delta", "partial_json": ""},
},
{
"type": "content_block_delta",
"index": 1,
"delta": {"type": "input_json_delta", "partial_json": '{"locat'},
},
{
"type": "content_block_delta",
"index": 1,
"delta": {"type": "input_json_delta", "partial_json": 'ion": "Bos'},
},
{
"type": "content_block_delta",
"index": 1,
"delta": {"type": "input_json_delta", "partial_json": 'ton, MA"}'},
},
{"type": "content_block_stop", "index": 1},
{"type": "content_block_start", "index": 2,
"content_block": {"type": "tool_use", "id": "toolu_023423423", "name": "get_current_weather", "input": {}}},
{"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": ""}},
{"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "{\"l"}},
{"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "oca"}},
{"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "tio"}},
{"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "n\": \"Lo"}},
{"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "s Angel"}},
{"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "es, CA\"}"}},
{
"type": "content_block_start",
"index": 2,
"content_block": {
"type": "tool_use",
"id": "toolu_023423423",
"name": "get_current_weather",
"input": {},
},
},
{
"type": "content_block_delta",
"index": 2,
"delta": {"type": "input_json_delta", "partial_json": ""},
},
{
"type": "content_block_delta",
"index": 2,
"delta": {"type": "input_json_delta", "partial_json": '{"l'},
},
{
"type": "content_block_delta",
"index": 2,
"delta": {"type": "input_json_delta", "partial_json": "oca"},
},
{
"type": "content_block_delta",
"index": 2,
"delta": {"type": "input_json_delta", "partial_json": "tio"},
},
{
"type": "content_block_delta",
"index": 2,
"delta": {"type": "input_json_delta", "partial_json": 'n": "Lo'},
},
{
"type": "content_block_delta",
"index": 2,
"delta": {"type": "input_json_delta", "partial_json": "s Angel"},
},
{
"type": "content_block_delta",
"index": 2,
"delta": {"type": "input_json_delta", "partial_json": 'es, CA"}'},
},
{"type": "content_block_stop", "index": 2},
{"type": "message_delta", "delta": {"stop_reason": "tool_use", "stop_sequence": None},
"usage": {"output_tokens": 137}},
{"type": "message_stop"}
{
"type": "message_delta",
"delta": {"stop_reason": "tool_use", "stop_sequence": None},
"usage": {"output_tokens": 137},
},
{"type": "message_stop"},
]
@@ -211,12 +353,12 @@ def test_anthropic_tool_streaming():
correct_tool_index = -1
for chunk in anthropic_chunk_list:
parsed_chunk = response_iter.chunk_parser(chunk)
if tool_use := parsed_chunk.get('tool_use'):
if tool_use := parsed_chunk.get("tool_use"):
# We only increment when a new block starts
if tool_use.get('id') is not None:
if tool_use.get("id") is not None:
correct_tool_index += 1
assert tool_use['index'] == correct_tool_index
assert tool_use["index"] == correct_tool_index
@pytest.mark.asyncio
@@ -344,4 +486,4 @@ def test_anthropic_tool_calling_translation():
print(translated_params["messages"])
assert len(translated_params["messages"]) > 0
assert translated_params["messages"][0]["role"] == "user"
assert translated_params["messages"][0]["role"] == "user"

View file

@@ -1097,3 +1097,73 @@ def test_completion_cost_azure_common_deployment_name():
print(f"mock_client.call_args: {mock_client.call_args.kwargs}")
assert "azure/gpt-4" == mock_client.call_args.kwargs["model"]
def test_completion_cost_anthropic_prompt_caching():
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
from litellm.utils import Choices, Message, ModelResponse, Usage
model = "anthropic/claude-3-5-sonnet-20240620"
## WRITE TO CACHE ## (MORE EXPENSIVE)
response_1 = ModelResponse(
id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
choices=[
Choices(
finish_reason="length",
index=0,
message=Message(
content="Hello! I'm doing well, thank you for",
role="assistant",
tool_calls=None,
function_call=None,
),
)
],
created=1725036547,
model="claude-3-5-sonnet-20240620",
object="chat.completion",
system_fingerprint=None,
usage=Usage(
completion_tokens=10,
prompt_tokens=14,
total_tokens=24,
cache_creation_input_tokens=100,
cache_read_input_tokens=0,
),
)
## READ FROM CACHE ## (LESS EXPENSIVE)
response_2 = ModelResponse(
id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
choices=[
Choices(
finish_reason="length",
index=0,
message=Message(
content="Hello! I'm doing well, thank you for",
role="assistant",
tool_calls=None,
function_call=None,
),
)
],
created=1725036547,
model="claude-3-5-sonnet-20240620",
object="chat.completion",
system_fingerprint=None,
usage=Usage(
completion_tokens=10,
prompt_tokens=14,
total_tokens=24,
cache_creation_input_tokens=0,
cache_read_input_tokens=100,
),
)
cost_1 = completion_cost(model=model, completion_response=response_1)
cost_2 = completion_cost(model=model, completion_response=response_2)
assert cost_1 > cost_2
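For reference, with the claude-3-5-sonnet prices added to the model map in this commit (input 0.000003, cache write 0.00000375, cache read 0.0000003, output 0.000015 USD per token), the expected values work out to roughly:

cost_1 = 14 * 0.000003 + 100 * 0.00000375 + 10 * 0.000015 = 0.000567 USD
cost_2 = 14 * 0.000003 + 100 * 0.0000003  + 10 * 0.000015 = 0.000222 USD

so the cache-write response costs about 2.5x the cache-read one and the assertion holds, assuming no other pricing adjustments apply to this model.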

View file

@@ -290,6 +290,7 @@ async def test_base_case(dynamic_rate_limit_handler, mock_response):
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_update_cache(
dynamic_rate_limit_handler, mock_response, user_api_key_auth
):

View file

@@ -75,6 +75,16 @@ def test_bedrock_optional_params_embeddings():
assert len(optional_params) == 0
def test_google_ai_studio_optional_params_embeddings():
optional_params = get_optional_params_embeddings(
user="John",
encoding_format=None,
custom_llm_provider="gemini",
drop_params=True,
)
assert len(optional_params) == 0
def test_openai_optional_params_embeddings():
litellm.drop_params = True
optional_params = get_optional_params_embeddings(

View file

@@ -51,6 +51,8 @@ class ModelInfo(TypedDict, total=False):
max_input_tokens: Required[Optional[int]]
max_output_tokens: Required[Optional[int]]
input_cost_per_token: Required[float]
cache_creation_input_token_cost: Optional[float]
cache_read_input_token_cost: Optional[float]
input_cost_per_character: Optional[float] # only for vertex ai models
input_cost_per_token_above_128k_tokens: Optional[float] # only for vertex ai models
input_cost_per_character_above_128k_tokens: Optional[
@@ -454,6 +456,13 @@ class Choices(OpenAIObject):
class Usage(CompletionUsage):
_cache_creation_input_tokens: int = PrivateAttr(
0
) # hidden param for prompt caching. Might change, once openai introduces their equivalent.
_cache_read_input_tokens: int = PrivateAttr(
0
) # hidden param for prompt caching. Might change, once openai introduces their equivalent.
def __init__(
self,
prompt_tokens: Optional[int] = None,
@@ -466,9 +475,18 @@
"completion_tokens": completion_tokens or 0,
"total_tokens": total_tokens or 0,
}
super().__init__(**data)
if "cache_creation_input_tokens" in params and isinstance(
params["cache_creation_input_tokens"], int
):
self._cache_creation_input_tokens = params["cache_creation_input_tokens"]
if "cache_read_input_tokens" in params and isinstance(
params["cache_read_input_tokens"], int
):
self._cache_read_input_tokens = params["cache_read_input_tokens"]
for k, v in params.items():
setattr(self, k, v)
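A small sketch of what these hidden params give, with arbitrary values: the cache counts ride along on the Usage object without extending OpenAI's CompletionUsage schema, and the anthropic cost helper reads them back off the private attributes.

from litellm.types.utils import Usage

usage = Usage(
    prompt_tokens=14,
    completion_tokens=10,
    total_tokens=24,
    cache_creation_input_tokens=100,
    cache_read_input_tokens=0,
)
# Stored on private attrs, so the public schema stays OpenAI-compatible.
assert usage._cache_creation_input_tokens == 100
assert usage._cache_read_input_tokens == 0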

View file

@@ -2550,6 +2550,7 @@ def get_optional_params_embeddings(
encoding_format=None,
dimensions=None,
custom_llm_provider="",
drop_params: Optional[bool] = None,
additional_drop_params: Optional[bool] = None,
**kwargs,
):
@@ -2560,6 +2561,7 @@
for k, v in special_params.items():
passed_params[k] = v
drop_params = passed_params.pop("drop_params", None)
additional_drop_params = passed_params.pop("additional_drop_params", None)
default_params = {"user": None, "encoding_format": None, "dimensions": None}
@@ -2571,11 +2573,16 @@
for k in non_default_params.keys():
if k not in supported_params:
unsupported_params[k] = non_default_params[k]
if unsupported_params and not litellm.drop_params:
raise UnsupportedParamsError(
status_code=500,
message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n",
)
if unsupported_params:
if litellm.drop_params is True or (
drop_params is not None and drop_params is True
):
pass
else:
raise UnsupportedParamsError(
status_code=500,
message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n",
)
non_default_params = _get_non_default_params(
passed_params=passed_params,
@@ -2680,7 +2687,9 @@ def get_optional_params_embeddings(
and custom_llm_provider not in litellm.openai_compatible_providers
):
if len(non_default_params.keys()) > 0:
if litellm.drop_params is True: # drop the unsupported non-default values
if (
litellm.drop_params is True or drop_params is True
): # drop the unsupported non-default values
keys = list(non_default_params.keys())
for k in keys:
non_default_params.pop(k, None)
@@ -5358,6 +5367,12 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
max_input_tokens=_model_info.get("max_input_tokens", None),
max_output_tokens=_model_info.get("max_output_tokens", None),
input_cost_per_token=_input_cost_per_token,
cache_creation_input_token_cost=_model_info.get(
"cache_creation_input_token_cost", None
),
cache_read_input_token_cost=_model_info.get(
"cache_read_input_token_cost", None
),
input_cost_per_character=_model_info.get(
"input_cost_per_character", None
),
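With those two keys wired through, looking up an Anthropic model surfaces the cache pricing directly. A sketch, assuming the local model cost map shipped with this commit:

from litellm.utils import get_model_info

info = get_model_info(
    model="claude-3-5-sonnet-20240620", custom_llm_provider="anthropic"
)
print(info["cache_creation_input_token_cost"])  # 3.75e-06 with this commit's map
print(info["cache_read_input_token_cost"])      # 3e-07 with this commit's map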

View file

@@ -1336,6 +1336,8 @@
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.00000125,
"cache_creation_input_token_cost": 0.0000003,
"cache_read_input_token_cost": 0.00000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
@@ -1349,6 +1351,8 @@
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"cache_creation_input_token_cost": 0.00001875,
"cache_read_input_token_cost": 0.0000015,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
@@ -1375,6 +1379,8 @@
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"cache_creation_input_token_cost": 0.00000375,
"cache_read_input_token_cost": 0.0000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,